From 3a19407913e8e43d6b799a59bc0f6cae889b5446 Mon Sep 17 00:00:00 2001 From: Wang Lu Date: Thu, 9 Sep 2021 11:25:28 +0800 Subject: [PATCH 001/512] PCI/P2PDMA: Apply bus offset correctly in DMA address calculation The bus offset is bus address - physical address, so the calculation in __pci_p2pdma_map_sg should be: bus address = physical address + bus offset. Correct the dma_address computation in __pci_p2pdma_map_sg(). Link: https://lore.kernel.org/r/20210909032528.24517-1-wanglu@dapustor.com Signed-off-by: Wang Lu Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe --- drivers/pci/p2pdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 50cdde3e9a8b..327882638b30 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -874,7 +874,7 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, int i; for_each_sg(sg, s, nents, i) { - s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset; + s->dma_address = sg_phys(s) + p2p_pgmap->bus_offset; sg_dma_len(s) = s->length; } From e3f4bd3462f6f796594ecc0dda7144ed2d1e5a26 Mon Sep 17 00:00:00 2001 From: Ingmar Klein Date: Fri, 9 Apr 2021 11:26:33 +0200 Subject: [PATCH 002/512] PCI: Mark Atheros QCA6174 to avoid bus reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When passing the Atheros QCA6174 through to a virtual machine, the VM hangs at the point where the ath10k driver loads. Add a quirk to avoid bus resets on this device, which avoids the hang. [bhelgaas: commit log] Link: https://lore.kernel.org/r/08982e05-b6e8-5a8d-24ab-da1488ee50a8@web.de Signed-off-by: Ingmar Klein Signed-off-by: Bjorn Helgaas Reviewed-by: Pali Rohár Cc: stable@vger.kernel.org --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4537d1ea14fd..6c957124f84d 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,6 +3612,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0033, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0034, quirk_no_bus_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset); /* * Root port on some Cavium CN8xxx chips do not successfully complete a bus From 0e8ae5a6ff5952253cd7cc0260df838ab4c21009 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 30 Aug 2021 10:08:10 +0200 Subject: [PATCH 003/512] PCI/portdrv: Do not setup up IRQs if there are no users Avoid registering service IRQs if there is no service that offers them or no driver to register a handler against them. This saves IRQ vectors when they are limited (e.g. on x86) and also avoids that spurious events could hit a missing handler. Such spurious events need to be generated by the Jailhouse hypervisor for active MSI vectors when enabling or disabling itself. 
Link: https://lore.kernel.org/r/8f9a13ac-8ab1-15ac-06cb-c131b488a36f@siemens.com Signed-off-by: Jan Kiszka Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/portdrv_core.c | 47 +++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 3ee63968deaa..bf5440c3ca8a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -166,9 +166,6 @@ static int pcie_init_service_irqs(struct pci_dev *dev, int *irqs, int mask) { int ret, i; - for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) - irqs[i] = -1; - /* * If we support PME but can't use MSI/MSI-X for it, we have to * fall back to INTx or other interrupts, e.g., a system shared @@ -317,8 +314,10 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq) */ int pcie_port_device_register(struct pci_dev *dev) { - int status, capabilities, i, nr_service; - int irqs[PCIE_PORT_DEVICE_MAXSERVICES]; + int status, capabilities, irq_services, i, nr_service; + int irqs[PCIE_PORT_DEVICE_MAXSERVICES] = { + [0 ... PCIE_PORT_DEVICE_MAXSERVICES-1] = -1 + }; /* Enable PCI Express port device */ status = pci_enable_device(dev); @@ -331,18 +330,32 @@ int pcie_port_device_register(struct pci_dev *dev) return 0; pci_set_master(dev); - /* - * Initialize service irqs. Don't use service devices that - * require interrupts if there is no way to generate them. - * However, some drivers may have a polling mode (e.g. pciehp_poll_mode) - * that can be used in the absence of irqs. Allow them to determine - * if that is to be used. - */ - status = pcie_init_service_irqs(dev, irqs, capabilities); - if (status) { - capabilities &= PCIE_PORT_SERVICE_HP; - if (!capabilities) - goto error_disable; + + irq_services = 0; + if (IS_ENABLED(CONFIG_PCIE_PME)) + irq_services |= PCIE_PORT_SERVICE_PME; + if (IS_ENABLED(CONFIG_PCIEAER)) + irq_services |= PCIE_PORT_SERVICE_AER; + if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + irq_services |= PCIE_PORT_SERVICE_HP; + if (IS_ENABLED(CONFIG_PCIE_DPC)) + irq_services |= PCIE_PORT_SERVICE_DPC; + irq_services &= capabilities; + + if (irq_services) { + /* + * Initialize service IRQs. Don't use service devices that + * require interrupts if there is no way to generate them. + * However, some drivers may have a polling mode (e.g. + * pciehp_poll_mode) that can be used in the absence of IRQs. + * Allow them to determine if that is to be used. + */ + status = pcie_init_service_irqs(dev, irqs, irq_services); + if (status) { + irq_services &= PCIE_PORT_SERVICE_HP; + if (!irq_services) + goto error_disable; + } } /* Allocate child services if any */ From 06dc660e6eb8817c4c379d2ca290ae0b3f77c69f Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 14 Sep 2021 01:27:08 +1000 Subject: [PATCH 004/512] PCI: Rename pcibios_add_device() to pcibios_device_add() The general convention for pcibios_* hooks is that they're named after the corresponding pci_* function they provide a hook for. The exception is pcibios_add_device() which provides a hook for pci_device_add(). Rename pcibios_add_device() to pcibios_device_add() so it matches pci_device_add(). Also, remove the export of the microblaze version. The only caller must be compiled as a built-in so there's no reason for the export. 
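The hook itself follows the usual weak-symbol pattern: the PCI core supplies a default definition marked __weak, and an architecture overrides it simply by providing a strong definition with the same signature. Roughly (the arch-side body below is only an illustration, not taken from any particular architecture):

  /* PCI core: default, used when the arch does not override it */
  int __weak pcibios_device_add(struct pci_dev *dev)
  {
          return 0;
  }

  /* arch code: the strong definition wins at link time */
  int pcibios_device_add(struct pci_dev *dev)
  {
          /* arch-specific setup for the newly added device */
          return 0;
  }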
Link: https://lore.kernel.org/r/20210913152709.48013-1-oohall@gmail.com Signed-off-by: Oliver O'Halloran Signed-off-by: Bjorn Helgaas Acked-by: Niklas Schnelle # s390 --- arch/microblaze/pci/pci-common.c | 3 +-- arch/powerpc/kernel/pci-common.c | 2 +- arch/powerpc/platforms/powernv/pci-sriov.c | 2 +- arch/s390/pci/pci.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/x86/pci/common.c | 2 +- drivers/pci/pci.c | 4 ++-- drivers/pci/probe.c | 4 ++-- include/linux/pci.h | 2 +- 9 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 557585f1be41..622a4867f9e9 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -587,13 +587,12 @@ static void pcibios_fixup_resources(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_resources); -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { dev->irq = of_irq_parse_and_map_pci(dev, 0, 0); return 0; } -EXPORT_SYMBOL(pcibios_add_device); /* * Reparent resource children of pr that conflict with res diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index c3573430919d..6749905932f4 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1059,7 +1059,7 @@ void pcibios_bus_add_device(struct pci_dev *dev) ppc_md.pcibios_bus_add_device(dev); } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct irq_domain *d; diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c index 28aac933a439..486c2937b159 100644 --- a/arch/powerpc/platforms/powernv/pci-sriov.c +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -54,7 +54,7 @@ * to "new_size", calculated above. Implementing this is a convoluted process * which requires several hooks in the PCI core: * - * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov(). + * 1. In pcibios_device_add() we call pnv_pci_ioda_fixup_iov(). * * At this point the device has been probed and the device's BARs are sized, * but no resource allocations have been done. 
The SR-IOV BARs are sized diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index e7e6788d75a8..ded3321b7208 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -561,7 +561,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) zdev->has_resources = 0; } -int pcibios_add_device(struct pci_dev *pdev) +int pcibios_device_add(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); struct resource *res; diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 9c2b720bfd20..31b0c1983286 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -1010,7 +1010,7 @@ void pcibios_set_master(struct pci_dev *dev) } #ifdef CONFIG_PCI_IOV -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct pci_dev *pdev; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3507f456fcd0..9e1e6b8d8876 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -632,7 +632,7 @@ static void set_dev_domain_options(struct pci_dev *pdev) pdev->hotplug_user_indicators = 1; } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct pci_setup_rom *rom; struct irq_domain *msidom; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ce2ab62b64cf..c63598c1cdd8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2091,14 +2091,14 @@ void pcim_pin_device(struct pci_dev *pdev) EXPORT_SYMBOL(pcim_pin_device); /* - * pcibios_add_device - provide arch specific hooks when adding device dev + * pcibios_device_add - provide arch specific hooks when adding device dev * @dev: the PCI device being added * * Permits the platform to provide architecture specific functionality when * devices are added. This is the default implementation. Architecture * implementations can override this. */ -int __weak pcibios_add_device(struct pci_dev *dev) +int __weak pcibios_device_add(struct pci_dev *dev) { return 0; } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d9fc02a71baa..2ba43b6adf31 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2450,7 +2450,7 @@ static struct irq_domain *pci_dev_msi_domain(struct pci_dev *dev) struct irq_domain *d; /* - * If a domain has been set through the pcibios_add_device() + * If a domain has been set through the pcibios_device_add() * callback, then this is the one (platform code knows best). */ d = dev_get_msi_domain(&dev->dev); @@ -2518,7 +2518,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) list_add_tail(&dev->bus_list, &bus->devices); up_write(&pci_bus_sem); - ret = pcibios_add_device(dev); + ret = pcibios_device_add(dev); WARN_ON(ret < 0); /* Set up MSI IRQ domain */ diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..7e0ce3a4d5a1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2126,7 +2126,7 @@ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state); -int pcibios_add_device(struct pci_dev *dev); +int pcibios_device_add(struct pci_dev *dev); void pcibios_release_device(struct pci_dev *dev); #ifdef CONFIG_PCI void pcibios_penalize_isa_irq(int irq, int active); From 9a0a1417d3bb91dba9f95fef9771954ff72ede19 Mon Sep 17 00:00:00 2001 From: Pranay Sanghai Date: Sat, 18 Sep 2021 13:18:02 -0700 Subject: [PATCH 005/512] PCI: Tidy comments Make comments follow multi-line comment conventions. No functional change intended. 
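The target style is the standard kernel multi-line comment form: "/*" on a line of its own, " *" starting each following line, and " */" closing on its own line, e.g.:

  /*
   * If this device is not on the primary bus, we need to figure out
   * which interrupt pin it will come in on.
   */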
Link: https://lore.kernel.org/r/YUZJenW2UCA4Qu0O@pranay-desktop Signed-off-by: Pranay Sanghai Signed-off-by: Bjorn Helgaas --- drivers/pci/setup-irq.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/pci/setup-irq.c b/drivers/pci/setup-irq.c index 7129494754dd..cc7d26b015f3 100644 --- a/drivers/pci/setup-irq.c +++ b/drivers/pci/setup-irq.c @@ -8,7 +8,6 @@ * David Miller (davem@redhat.com) */ - #include #include #include @@ -28,25 +27,26 @@ void pci_assign_irq(struct pci_dev *dev) return; } - /* If this device is not on the primary bus, we need to figure out - which interrupt pin it will come in on. We know which slot it - will come in on 'cos that slot is where the bridge is. Each - time the interrupt line passes through a PCI-PCI bridge we must - apply the swizzle function. */ - + /* + * If this device is not on the primary bus, we need to figure out + * which interrupt pin it will come in on. We know which slot it + * will come in on because that slot is where the bridge is. Each + * time the interrupt line passes through a PCI-PCI bridge we must + * apply the swizzle function. + */ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); /* Cope with illegal. */ if (pin > 4) pin = 1; if (pin) { - /* Follow the chain of bridges, swizzling as we go. */ + /* Follow the chain of bridges, swizzling as we go. */ if (hbrg->swizzle_irq) slot = (*(hbrg->swizzle_irq))(dev, &pin); /* - * If a swizzling function is not used map_irq must - * ignore slot + * If a swizzling function is not used, map_irq() must + * ignore slot. */ irq = (*(hbrg->map_irq))(dev, slot, pin); if (irq == -1) @@ -56,7 +56,9 @@ void pci_assign_irq(struct pci_dev *dev) pci_dbg(dev, "assign IRQ: got %d\n", dev->irq); - /* Always tell the device, so the driver knows what is - the real IRQ to use; the device does not use it. */ + /* + * Always tell the device, so the driver knows what is the real IRQ + * to use; the device does not use it. + */ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } From af9d82626c8f8547910e3e22c76ced3f5d5f5286 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Aug 2021 14:20:51 +0200 Subject: [PATCH 006/512] PCI/ACPI: Remove OSC_PCI_SUPPORT_MASKS and OSC_PCI_CONTROL_MASKS These masks are only used internally in the PCI Host Bridge _OSC negotiation code, which already makes sure nothing outside of these masks is set. Remove the masks and simplify the code. Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20210824122054.29481-2-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/acpi/pci_root.c | 9 +++------ include/linux/acpi.h | 2 -- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d7deedf3548e..0c3030a58219 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -199,18 +199,15 @@ static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, acpi_status status; u32 result, capbuf[3]; - support &= OSC_PCI_SUPPORT_MASKS; support |= root->osc_support_set; capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = support; - if (control) { - *control &= OSC_PCI_CONTROL_MASKS; + if (control) capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set; - } else { + else /* Run _OSC query only with existing controls. 
*/ capbuf[OSC_CONTROL_DWORD] = root->osc_control_set; - } status = acpi_pci_run_osc(root->device->handle, capbuf, &result); if (ACPI_SUCCESS(status)) { @@ -357,7 +354,7 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r if (!mask) return AE_BAD_PARAMETER; - ctrl = *mask & OSC_PCI_CONTROL_MASKS; + ctrl = *mask; if ((ctrl & req) != req) return AE_TYPE; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 974d497a897d..eb59fd3052c0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -577,7 +577,6 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 -#define OSC_PCI_SUPPORT_MASKS 0x0000019f /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 @@ -587,7 +586,6 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 -#define OSC_PCI_CONTROL_MASKS 0x000000bf #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 From 4c6f6060b7c4fe058981d17784684429c5491243 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Aug 2021 14:20:52 +0200 Subject: [PATCH 007/512] PCI/ACPI: Move supported and control calculations to separate functions Move the calculations of supported and controlled _OSC features out of negotiate_os_control() into separate functions. Link: https://lore.kernel.org/r/20210824122054.29481-3-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/acpi/pci_root.c | 93 ++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 0c3030a58219..ed4e6b55e9bc 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -396,6 +396,59 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r return AE_OK; } +static u32 calculate_support(void) +{ + u32 support; + + /* + * All supported architectures that use ACPI have support for + * PCI domains, so we indicate this in _OSC support capabilities. + */ + support = OSC_PCI_SEGMENT_GROUPS_SUPPORT; + support |= OSC_PCI_HPX_TYPE_3_SUPPORT; + if (pci_ext_cfg_avail()) + support |= OSC_PCI_EXT_CONFIG_SUPPORT; + if (pcie_aspm_support_enabled()) + support |= OSC_PCI_ASPM_SUPPORT | OSC_PCI_CLOCK_PM_SUPPORT; + if (pci_msi_enabled()) + support |= OSC_PCI_MSI_SUPPORT; + if (IS_ENABLED(CONFIG_PCIE_EDR)) + support |= OSC_PCI_EDR_SUPPORT; + + return support; +} + +static u32 calculate_control(void) +{ + u32 control; + + control = OSC_PCI_EXPRESS_CAPABILITY_CONTROL + | OSC_PCI_EXPRESS_PME_CONTROL; + + if (IS_ENABLED(CONFIG_PCIEASPM)) + control |= OSC_PCI_EXPRESS_LTR_CONTROL; + + if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + control |= OSC_PCI_EXPRESS_NATIVE_HP_CONTROL; + + if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC)) + control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL; + + if (pci_aer_available()) + control |= OSC_PCI_EXPRESS_AER_CONTROL; + + /* + * Per the Downstream Port Containment Related Enhancements ECN to + * the PCI Firmware Spec, r3.2, sec 4.5.1, table 4-5, + * OSC_PCI_EXPRESS_DPC_CONTROL indicates the OS supports both DPC + * and EDR. 
+ */ + if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR)) + control |= OSC_PCI_EXPRESS_DPC_CONTROL; + + return control; +} + static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, bool is_pcie) { @@ -416,20 +469,7 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, return; } - /* - * All supported architectures that use ACPI have support for - * PCI domains, so we indicate this in _OSC support capabilities. - */ - support = OSC_PCI_SEGMENT_GROUPS_SUPPORT; - support |= OSC_PCI_HPX_TYPE_3_SUPPORT; - if (pci_ext_cfg_avail()) - support |= OSC_PCI_EXT_CONFIG_SUPPORT; - if (pcie_aspm_support_enabled()) - support |= OSC_PCI_ASPM_SUPPORT | OSC_PCI_CLOCK_PM_SUPPORT; - if (pci_msi_enabled()) - support |= OSC_PCI_MSI_SUPPORT; - if (IS_ENABLED(CONFIG_PCIE_EDR)) - support |= OSC_PCI_EDR_SUPPORT; + support = calculate_support(); decode_osc_support(root, "OS supports", support); status = acpi_pci_osc_support(root, support); @@ -456,31 +496,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, return; } - control = OSC_PCI_EXPRESS_CAPABILITY_CONTROL - | OSC_PCI_EXPRESS_PME_CONTROL; + requested = control = calculate_control(); - if (IS_ENABLED(CONFIG_PCIEASPM)) - control |= OSC_PCI_EXPRESS_LTR_CONTROL; - - if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) - control |= OSC_PCI_EXPRESS_NATIVE_HP_CONTROL; - - if (IS_ENABLED(CONFIG_HOTPLUG_PCI_SHPC)) - control |= OSC_PCI_SHPC_NATIVE_HP_CONTROL; - - if (pci_aer_available()) - control |= OSC_PCI_EXPRESS_AER_CONTROL; - - /* - * Per the Downstream Port Containment Related Enhancements ECN to - * the PCI Firmware Spec, r3.2, sec 4.5.1, table 4-5, - * OSC_PCI_EXPRESS_DPC_CONTROL indicates the OS supports both DPC - * and EDR. - */ - if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR)) - control |= OSC_PCI_EXPRESS_DPC_CONTROL; - - requested = control; status = acpi_pci_osc_control_set(handle, &control, OSC_PCI_EXPRESS_CAPABILITY_CONTROL); if (ACPI_SUCCESS(status)) { From 87f1f87a16818c36431391ea8c30ea926cab917f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Aug 2021 14:20:53 +0200 Subject: [PATCH 008/512] PCI/ACPI: Move _OSC query checks to separate function Move the checks about whether the _OSC controls are requested from the firmware to a separate function. Link: https://lore.kernel.org/r/20210824122054.29481-4-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. 
Wysocki --- drivers/acpi/pci_root.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index ed4e6b55e9bc..f12e512bcddc 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -449,6 +449,24 @@ static u32 calculate_control(void) return control; } +static bool os_control_query_checks(struct acpi_pci_root *root, u32 support) +{ + struct acpi_device *device = root->device; + + if (pcie_ports_disabled) { + dev_info(&device->dev, "PCIe port services disabled; not requesting _OSC control\n"); + return false; + } + + if ((support & ACPI_PCIE_REQ_SUPPORT) != ACPI_PCIE_REQ_SUPPORT) { + decode_osc_support(root, "not requesting OS control; OS requires", + ACPI_PCIE_REQ_SUPPORT); + return false; + } + + return true; +} + static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, bool is_pcie) { @@ -485,16 +503,8 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, return; } - if (pcie_ports_disabled) { - dev_info(&device->dev, "PCIe port services disabled; not requesting _OSC control\n"); + if (!os_control_query_checks(root, support)) return; - } - - if ((support & ACPI_PCIE_REQ_SUPPORT) != ACPI_PCIE_REQ_SUPPORT) { - decode_osc_support(root, "not requesting OS control; OS requires", - ACPI_PCIE_REQ_SUPPORT); - return; - } requested = control = calculate_control(); From 6bc779ee05d4fa66c99096dbf88ff61acbb8a887 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Aug 2021 14:20:54 +0200 Subject: [PATCH 009/512] PCI/ACPI: Check for _OSC support in acpi_pci_osc_control_set() Get rid of acpi_pci_osc_support() and check for _OSC supported features directly in acpi_pci_osc_control_set(). There is no point in doing an unconditional _OSC query with control=0 even when the kernel later wants to take control over more features. This saves one _OSC query and simplifies the code by getting rid of the acpi_pci_osc_support() function. As a side effect, the !control checks in acpi_pci_query_osc() can also be removed. Link: https://lore.kernel.org/r/20210824122054.29481-5-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/acpi/pci_root.c | 81 ++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 49 deletions(-) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index f12e512bcddc..ab2f7dfb0c44 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -203,26 +203,16 @@ static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = support; - if (control) - capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set; - else - /* Run _OSC query only with existing controls. */ - capbuf[OSC_CONTROL_DWORD] = root->osc_control_set; + capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set; status = acpi_pci_run_osc(root->device->handle, capbuf, &result); if (ACPI_SUCCESS(status)) { root->osc_support_set = support; - if (control) - *control = result; + *control = result; } return status; } -static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags) -{ - return acpi_pci_query_osc(root, flags, NULL); -} - struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) { struct acpi_pci_root *root; @@ -345,8 +335,9 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_dev); * _OSC bits the BIOS has granted control of, but its contents are meaningless * on failure. 
**/ -static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req) +static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 support) { + u32 req = OSC_PCI_EXPRESS_CAPABILITY_CONTROL; struct acpi_pci_root *root; acpi_status status; u32 ctrl, capbuf[3]; @@ -354,22 +345,16 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r if (!mask) return AE_BAD_PARAMETER; - ctrl = *mask; - if ((ctrl & req) != req) - return AE_TYPE; - root = acpi_pci_find_root(handle); if (!root) return AE_NOT_EXIST; - *mask = ctrl | root->osc_control_set; - /* No need to evaluate _OSC if the control was already granted. */ - if ((root->osc_control_set & ctrl) == ctrl) - return AE_OK; + ctrl = *mask; + *mask |= root->osc_control_set; /* Need to check the available controls bits before requesting them. */ - while (*mask) { - status = acpi_pci_query_osc(root, root->osc_support_set, mask); + do { + status = acpi_pci_query_osc(root, support, mask); if (ACPI_FAILURE(status)) return status; if (ctrl == *mask) @@ -377,7 +362,11 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r decode_osc_control(root, "platform does not support", ctrl & ~(*mask)); ctrl = *mask; - } + } while (*mask); + + /* No need to request _OSC if the control was already granted. */ + if ((root->osc_control_set & ctrl) == ctrl) + return AE_OK; if ((ctrl & req) != req) { decode_osc_control(root, "not requesting control; platform does not support", @@ -470,7 +459,7 @@ static bool os_control_query_checks(struct acpi_pci_root *root, u32 support) static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, bool is_pcie) { - u32 support, control, requested; + u32 support, control = 0, requested = 0; acpi_status status; struct acpi_device *device = root->device; acpi_handle handle = device->handle; @@ -490,28 +479,15 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, support = calculate_support(); decode_osc_support(root, "OS supports", support); - status = acpi_pci_osc_support(root, support); - if (ACPI_FAILURE(status)) { - *no_aspm = 1; - /* _OSC is optional for PCI host bridges */ - if ((status == AE_NOT_FOUND) && !is_pcie) - return; + if (os_control_query_checks(root, support)) + requested = control = calculate_control(); - dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n", - acpi_format_exception(status)); - return; - } - - if (!os_control_query_checks(root, support)) - return; - - requested = control = calculate_control(); - - status = acpi_pci_osc_control_set(handle, &control, - OSC_PCI_EXPRESS_CAPABILITY_CONTROL); + status = acpi_pci_osc_control_set(handle, &control, support); if (ACPI_SUCCESS(status)) { - decode_osc_control(root, "OS now controls", control); + if (control) + decode_osc_control(root, "OS now controls", control); + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) { /* * We have ASPM control, but the FADT indicates that @@ -522,11 +498,6 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, *no_aspm = 1; } } else { - decode_osc_control(root, "OS requested", requested); - decode_osc_control(root, "platform willing to grant", control); - dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n", - acpi_format_exception(status)); - /* * We want to disable ASPM here, but aspm_disabled * needs to remain in its state from boot so that we @@ -535,6 +506,18 @@ static void negotiate_os_control(struct acpi_pci_root *root, int 
*no_aspm, * root scan. */ *no_aspm = 1; + + /* _OSC is optional for PCI host bridges */ + if ((status == AE_NOT_FOUND) && !is_pcie) + return; + + if (control) { + decode_osc_control(root, "OS requested", requested); + decode_osc_control(root, "platform willing to grant", control); + } + + dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n", + acpi_format_exception(status)); } } From 95e83e219d68956ba4fed9326683e4d2bd3e39a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 15 Sep 2021 23:01:25 +0000 Subject: [PATCH 010/512] PCI/sysfs: Check CAP_SYS_ADMIN before parsing user input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if the "CAP_SYS_ADMIN" capability flag is set before parsing user input as it makes more sense to first check whether the current user actually has the right permissions before accepting any input from such user. This will also make order in which enable_store() and msi_bus_store() perform the "CAP_SYS_ADMIN" capability check consistent with other PCI-related sysfs objects that first verify whether user has this capability set. Link: https://lore.kernel.org/r/20210915230127.2495723-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 7fb5cd17cc98..6832e161be1c 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -273,15 +273,16 @@ static ssize_t enable_store(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); unsigned long val; - ssize_t result = kstrtoul(buf, 0, &val); - - if (result < 0) - return result; + ssize_t result; /* this can crash the machine when done on the "wrong" device */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; + result = kstrtoul(buf, 0, &val); + if (result < 0) + return result; + device_lock(dev); if (dev->driver) result = -EBUSY; @@ -378,12 +379,12 @@ static ssize_t msi_bus_store(struct device *dev, struct device_attribute *attr, struct pci_bus *subordinate = pdev->subordinate; unsigned long val; - if (kstrtoul(buf, 0, &val) < 0) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + /* * "no_msi" and "bus_flags" only affect what happens when a driver * requests MSI or MSI-X. They don't affect any drivers that have From 36f354ec7bf92f8aaf09eaf3b261ea06c25ec337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 15 Sep 2021 23:01:26 +0000 Subject: [PATCH 011/512] PCI/sysfs: Return -EINVAL consistently from "store" functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most of the "store" functions that handle userspace input via sysfs return -EINVAL should the value fail validation and/or type conversion. This error code is a clear message to userspace that the value is not a valid input. However, some of the "show" functions return input parsing error codes as-is, which may be either -EINVAL or -ERANGE. The former would often be from kstrtobool(), and the latter typically from other kstr*() functions such as kstrtou8(), kstrtou32(), kstrtoint(), etc. -EINVAL is commonly returned as the error code to indicate that the value provided is invalid, but -ERANGE is not very useful in userspace. 
Therefore, normalize the return error code to be -EINVAL for when the validation and/or type conversion fails. Link: https://lore.kernel.org/r/20210915230127.2495723-2-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/functions/pci-epf-ntb.c | 18 ++++------ drivers/pci/endpoint/pci-ep-cfs.c | 35 +++++++------------- drivers/pci/iov.c | 14 ++++---- drivers/pci/pci-sysfs.c | 20 +++++------ 4 files changed, 33 insertions(+), 54 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-ntb.c b/drivers/pci/endpoint/functions/pci-epf-ntb.c index 8b4756159f15..e67489144349 100644 --- a/drivers/pci/endpoint/functions/pci-epf-ntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-ntb.c @@ -1947,11 +1947,9 @@ static ssize_t epf_ntb_##_name##_store(struct config_item *item, \ struct config_group *group = to_config_group(item); \ struct epf_ntb *ntb = to_epf_ntb(group); \ u32 val; \ - int ret; \ \ - ret = kstrtou32(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou32(page, 0, &val) < 0) \ + return -EINVAL; \ \ ntb->_name = val; \ \ @@ -1980,11 +1978,9 @@ static ssize_t epf_ntb_##_name##_store(struct config_item *item, \ struct device *dev = &ntb->epf->dev; \ int win_no; \ u64 val; \ - int ret; \ \ - ret = kstrtou64(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou64(page, 0, &val) < 0) \ + return -EINVAL; \ \ if (sscanf(#_name, "mw%d", &win_no) != 1) \ return -EINVAL; \ @@ -2005,11 +2001,9 @@ static ssize_t epf_ntb_num_mws_store(struct config_item *item, struct config_group *group = to_config_group(item); struct epf_ntb *ntb = to_epf_ntb(group); u32 val; - int ret; - ret = kstrtou32(page, 0, &val); - if (ret) - return ret; + if (kstrtou32(page, 0, &val) < 0) + return -EINVAL; if (val > MAX_MW) return -EINVAL; diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c index 999911801877..19bc0e828c0c 100644 --- a/drivers/pci/endpoint/pci-ep-cfs.c +++ b/drivers/pci/endpoint/pci-ep-cfs.c @@ -175,9 +175,8 @@ static ssize_t pci_epc_start_store(struct config_item *item, const char *page, epc = epc_group->epc; - ret = kstrtobool(page, &start); - if (ret) - return ret; + if (kstrtobool(page, &start) < 0) + return -EINVAL; if (!start) { pci_epc_stop(epc); @@ -329,13 +328,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item, \ const char *page, size_t len) \ { \ u32 val; \ - int ret; \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - ret = kstrtou32(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou32(page, 0, &val) < 0) \ + return -EINVAL; \ epf->header->_name = val; \ return len; \ } @@ -345,13 +342,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item, \ const char *page, size_t len) \ { \ u16 val; \ - int ret; \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - ret = kstrtou16(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou16(page, 0, &val) < 0) \ + return -EINVAL; \ epf->header->_name = val; \ return len; \ } @@ -361,13 +356,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item, \ const char *page, size_t len) \ { \ u8 val; \ - int ret; \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - ret = kstrtou8(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou8(page, 0, &val) < 0) \ + return -EINVAL; \ epf->header->_name = val; \ return len; \ } @@ -376,11 +369,9 @@ 
static ssize_t pci_epf_msi_interrupts_store(struct config_item *item, const char *page, size_t len) { u8 val; - int ret; - ret = kstrtou8(page, 0, &val); - if (ret) - return ret; + if (kstrtou8(page, 0, &val) < 0) + return -EINVAL; to_pci_epf_group(item)->epf->msi_interrupts = val; @@ -398,11 +389,9 @@ static ssize_t pci_epf_msix_interrupts_store(struct config_item *item, const char *page, size_t len) { u16 val; - int ret; - ret = kstrtou16(page, 0, &val); - if (ret) - return ret; + if (kstrtou16(page, 0, &val) < 0) + return -EINVAL; to_pci_epf_group(item)->epf->msix_interrupts = val; diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index dafdc652fcd0..0267977c9f17 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -183,11 +183,10 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev, { struct pci_dev *vf_dev = to_pci_dev(dev); struct pci_dev *pdev = pci_physfn(vf_dev); - int val, ret; + int val, ret = 0; - ret = kstrtoint(buf, 0, &val); - if (ret) - return ret; + if (kstrtoint(buf, 0, &val) < 0) + return -EINVAL; if (val < 0) return -EINVAL; @@ -376,12 +375,11 @@ static ssize_t sriov_numvfs_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - int ret; + int ret = 0; u16 num_vfs; - ret = kstrtou16(buf, 0, &num_vfs); - if (ret < 0) - return ret; + if (kstrtou16(buf, 0, &num_vfs) < 0) + return -EINVAL; if (num_vfs > pci_sriov_get_totalvfs(pdev)) return -ERANGE; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6832e161be1c..a092fc0c665d 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -273,15 +273,14 @@ static ssize_t enable_store(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); unsigned long val; - ssize_t result; + ssize_t result = 0; /* this can crash the machine when done on the "wrong" device */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; - result = kstrtoul(buf, 0, &val); - if (result < 0) - return result; + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; device_lock(dev); if (dev->driver) @@ -313,14 +312,13 @@ static ssize_t numa_node_store(struct device *dev, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - int node, ret; + int node; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = kstrtoint(buf, 0, &node); - if (ret) - return ret; + if (kstrtoint(buf, 0, &node) < 0) + return -EINVAL; if ((node < 0 && node != NUMA_NO_NODE) || node >= MAX_NUMNODES) return -EINVAL; @@ -1340,10 +1338,10 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); unsigned long val; - ssize_t result = kstrtoul(buf, 0, &val); + ssize_t result; - if (result < 0) - return result; + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; if (val != 1) return -EINVAL; From e0f7b19223582c302f5736e93927aafde9458d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 15 Sep 2021 23:01:27 +0000 Subject: [PATCH 012/512] PCI: Use kstrtobool() directly, sans strtobool() wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit strtobool() is a wrapper around kstrtobool() that has been added for backward compatibility. There is no reason to use the old API, so use kstrtobool() directly. 
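The conversion is mechanical; a typical sysfs "store" handler ends up in this shape (a sketch only, the attribute name is made up):

  static ssize_t example_store(struct device *dev,
                               struct device_attribute *attr,
                               const char *buf, size_t count)
  {
          bool enable;

          if (kstrtobool(buf, &enable) < 0)
                  return -EINVAL;

          /* ... act on 'enable' ... */
          return count;
  }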
Related: ef951599074b ("lib: move strtobool() to kstrtobool()") Link: https://lore.kernel.org/r/20210915230127.2495723-3-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/p2pdma.c | 6 +++--- drivers/pci/pcie/aspm.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 50cdde3e9a8b..4fccdcf9186f 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -943,7 +943,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); * * Parses an attribute value to decide whether to enable p2pdma. * The value can select a PCI device (using its full BDF device - * name) or a boolean (in any format strtobool() accepts). A false + * name) or a boolean (in any format kstrtobool() accepts). A false * value disables p2pdma, a true value expects the caller * to automatically find a compatible device and specifying a PCI device * expects the caller to use the specific provider. @@ -975,11 +975,11 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) { /* * If the user enters a PCI device that doesn't exist - * like "0000:01:00.1", we don't want strtobool to think + * like "0000:01:00.1", we don't want kstrtobool to think * it's a '0' when it's clearly not what the user wanted. * So we require 0's and 1's to be exactly one character. */ - } else if (!strtobool(page, use_p2pdma)) { + } else if (!kstrtobool(page, use_p2pdma)) { return 0; } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 013a47f587ce..52c74682601a 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1219,7 +1219,7 @@ static ssize_t aspm_attr_store_common(struct device *dev, struct pcie_link_state *link = pcie_aspm_get_link(pdev); bool state_enable; - if (strtobool(buf, &state_enable) < 0) + if (kstrtobool(buf, &state_enable) < 0) return -EINVAL; down_read(&pci_bus_sem); @@ -1276,7 +1276,7 @@ static ssize_t clkpm_store(struct device *dev, struct pcie_link_state *link = pcie_aspm_get_link(pdev); bool state_enable; - if (strtobool(buf, &state_enable) < 0) + if (kstrtobool(buf, &state_enable) < 0) return -EINVAL; down_read(&pci_bus_sem); From 7c3855c423b17f6ca211858afb0cef20569914c7 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 13 Jul 2021 20:50:07 +0800 Subject: [PATCH 013/512] PCI: Coalesce host bridge contiguous apertures Built-in graphics at 07:00.0 on HP EliteDesk 805 G6 doesn't work because graphics can't get the BAR it needs. The BIOS configuration is correct: BARs 0 and 2 both fit in the 00:08.1 bridge window. But that 00:08.1 window covers two host bridge apertures from _CRS. Previously we assumed this was illegal, so we clipped the window to fit into one aperture (see 0f7e7aee2f37 ("PCI: Add pci_bus_clip_resource() to clip to fit upstream window")). 
pci_bus 0000:00: root bus resource [mem 0x10020200000-0x100303fffff window] pci_bus 0000:00: root bus resource [mem 0x10030400000-0x100401fffff window] pci 0000:00:08.1: bridge window [mem 0x10030000000-0x100401fffff 64bit pref] pci 0000:07:00.0: reg 0x10: [mem 0x10030000000-0x1003fffffff 64bit pref] pci 0000:07:00.0: reg 0x18: [mem 0x10040000000-0x100401fffff 64bit pref] pci 0000:00:08.1: can't claim BAR 15 [mem 0x10030000000-0x100401fffff 64bit pref]: no compatible bridge window pci 0000:00:08.1: [mem 0x10030000000-0x100401fffff 64bit pref] clipped to [mem 0x10030000000-0x100303fffff 64bit pref] pci 0000:00:08.1: bridge window [mem 0x10030000000-0x100303fffff 64bit pref] pci 0000:07:00.0: can't claim BAR 0 [mem 0x10030000000-0x1003fffffff 64bit pref]: no compatible bridge window pci 0000:07:00.0: can't claim BAR 2 [mem 0x10040000000-0x100401fffff 64bit pref]: no compatible bridge window However, the host bridge apertures are contiguous, so there's no need to clip in this case. Coalesce contiguous apertures so we can allocate from the entire contiguous region. Previous commit 65db04053efe ("PCI: Coalesce host bridge contiguous apertures") was similar but sorted the apertures, and Guenter Roeck reported a regression in ppc:sam460ex qemu emulation from nvme; see https://lore.kernel.org/all/20210709231529.GA3270116@roeck-us.net/ [bhelgaas: commit log] Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212013 Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20210713125007.1260304-1-kai.heng.feng@canonical.com Signed-off-by: Kai-Heng Feng Signed-off-by: Bjorn Helgaas Cc: Guenter Roeck --- drivers/pci/probe.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d9fc02a71baa..3459f460dbd8 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -883,11 +883,11 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus) static int pci_register_host_bridge(struct pci_host_bridge *bridge) { struct device *parent = bridge->dev.parent; - struct resource_entry *window, *n; + struct resource_entry *window, *next, *n; struct pci_bus *bus, *b; - resource_size_t offset; + resource_size_t offset, next_offset; LIST_HEAD(resources); - struct resource *res; + struct resource *res, *next_res; char addr[64], *fmt; const char *name; int err; @@ -970,11 +970,34 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE) dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n"); - /* Add initial resources to the bus */ + /* Coalesce contiguous windows */ resource_list_for_each_entry_safe(window, n, &resources) { - list_move_tail(&window->node, &bridge->windows); + if (list_is_last(&window->node, &resources)) + break; + + next = list_next_entry(window, node); offset = window->offset; res = window->res; + next_offset = next->offset; + next_res = next->res; + + if (res->flags != next_res->flags || offset != next_offset) + continue; + + if (res->end + 1 == next_res->start) { + next_res->start = res->start; + res->flags = res->start = res->end = 0; + } + } + + /* Add initial resources to the bus */ + resource_list_for_each_entry_safe(window, n, &resources) { + offset = window->offset; + res = window->res; + if (!res->end) + continue; + + list_move_tail(&window->node, &bridge->windows); if (res->flags & IORESOURCE_BUS) pci_bus_insert_busn_res(bus, bus->number, res->end); From 3a7fb86758c96cc255ecc820ac47168047fe3dfa Mon Sep 
17 00:00:00 2001 From: Luca Ceresoli Date: Mon, 31 May 2021 10:59:31 +0200 Subject: [PATCH 014/512] PCI: dwc: Export more symbols to allow modular drivers These symbols are used by the pci-dra7xx driver. Export them to allow building pci-dra7xx as a module. Link: https://lore.kernel.org/r/20210531085934.2662457-2-luca@lucaceresoli.net Signed-off-by: Luca Ceresoli Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pcie-designware-ep.c | 1 + drivers/pci/controller/dwc/pcie-designware.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 998b698f4085..27e4735f577e 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -83,6 +83,7 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) for (func_no = 0; func_no < funcs; func_no++) __dw_pcie_ep_reset_bar(pci, func_no, bar, 0); } +EXPORT_SYMBOL_GPL(dw_pcie_ep_reset_bar); static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8 func_no, u8 cap_ptr, u8 cap) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index a945f0c0e73d..850b4533f4ef 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -538,6 +538,7 @@ int dw_pcie_link_up(struct dw_pcie *pci) return ((val & PCIE_PORT_DEBUG1_LINK_UP) && (!(val & PCIE_PORT_DEBUG1_LINK_IN_TRAINING))); } +EXPORT_SYMBOL_GPL(dw_pcie_link_up); void dw_pcie_upconfig_setup(struct dw_pcie *pci) { From 3b868d150efd3c586762cee4410cfc75f46d2a07 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Mon, 31 May 2021 10:59:32 +0200 Subject: [PATCH 015/512] PCI: dra7xx: Make it a kernel module Enable building the driver as a loadable kernel module. Link: https://lore.kernel.org/r/20210531085934.2662457-3-luca@lucaceresoli.net Signed-off-by: Luca Ceresoli Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/Kconfig | 6 +++--- drivers/pci/controller/dwc/pci-dra7xx.c | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 76c0a63a3f64..9892a1135678 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -17,10 +17,10 @@ config PCIE_DW_EP select PCIE_DW config PCI_DRA7XX - bool + tristate config PCI_DRA7XX_HOST - bool "TI DRA7xx PCIe controller Host Mode" + tristate "TI DRA7xx PCIe controller Host Mode" depends on SOC_DRA7XX || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN depends on OF && HAS_IOMEM && TI_PIPE3 @@ -36,7 +36,7 @@ config PCI_DRA7XX_HOST This uses the DesignWare core. 
config PCI_DRA7XX_EP - bool "TI DRA7xx PCIe controller Endpoint Mode" + tristate "TI DRA7xx PCIe controller Endpoint Mode" depends on SOC_DRA7XX || COMPILE_TEST depends on PCI_ENDPOINT depends on OF && HAS_IOMEM && TI_PIPE3 diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index fbbb78f6885e..e8418446039d 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -607,6 +608,7 @@ static const struct of_device_id of_dra7xx_pcie_match[] = { }, {}, }; +MODULE_DEVICE_TABLE(of, of_dra7xx_pcie_match); /* * dra7xx_pcie_unaligned_memaccess: workaround for AM572x/AM571x Errata i870 @@ -943,4 +945,8 @@ static struct platform_driver dra7xx_pcie_driver = { }, .shutdown = dra7xx_pcie_shutdown, }; -builtin_platform_driver(dra7xx_pcie_driver); +module_platform_driver(dra7xx_pcie_driver); + +MODULE_AUTHOR("Kishon Vijay Abraham I "); +MODULE_DESCRIPTION("PCIe controller driver for TI DRA7xx SoCs"); +MODULE_LICENSE("GPL v2"); From b9a6943dc891f7c4aec41f07e6b60ca2b31a3b2e Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Mon, 31 May 2021 10:59:33 +0200 Subject: [PATCH 016/512] PCI: dra7xx: Remove unused include Unused since commit e259c2926c01 ("PCI: pci-dra7xx: Prepare for deferred probe with module_platform_driver"). Link: https://lore.kernel.org/r/20210531085934.2662457-4-luca@lucaceresoli.net Signed-off-by: Luca Ceresoli Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pci-dra7xx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index e8418446039d..f8b1221030b4 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include From 5af9405397bfb90d6adab61d58f4d94c78166698 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Mon, 31 May 2021 10:59:34 +0200 Subject: [PATCH 017/512] PCI: dra7xx: Get an optional clock If the clock is provided externally we need to make sure it is enabled before starting PCI scan. 
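This works because of how the clk API treats absent optional clocks: devm_clk_get_optional() returns NULL rather than an error when no clock is described for the device, and clk_prepare_enable(NULL) is a no-op that returns 0, so one probe path covers both cases. A minimal sketch of the idiom (variable names illustrative):

  struct clk *clk = devm_clk_get_optional(dev, NULL);

  if (IS_ERR(clk))        /* a real error, e.g. -EPROBE_DEFER */
          return dev_err_probe(dev, PTR_ERR(clk), "clock request failed");

  ret = clk_prepare_enable(clk);  /* NULL clk: does nothing, returns 0 */
  if (ret)
          return ret;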
Link: https://lore.kernel.org/r/20210531085934.2662457-5-luca@lucaceresoli.net Signed-off-by: Luca Ceresoli Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pci-dra7xx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index f8b1221030b4..a4221f6f3629 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -7,6 +7,7 @@ * Authors: Kishon Vijay Abraham I */ +#include #include #include #include @@ -90,6 +91,7 @@ struct dra7xx_pcie { int phy_count; /* DT phy-names count */ struct phy **phy; struct irq_domain *irq_domain; + struct clk *clk; enum dw_pcie_device_mode mode; }; @@ -741,6 +743,15 @@ static int dra7xx_pcie_probe(struct platform_device *pdev) if (!link) return -ENOMEM; + dra7xx->clk = devm_clk_get_optional(dev, NULL); + if (IS_ERR(dra7xx->clk)) + return dev_err_probe(dev, PTR_ERR(dra7xx->clk), + "clock request failed"); + + ret = clk_prepare_enable(dra7xx->clk); + if (ret) + return ret; + for (i = 0; i < phy_count; i++) { snprintf(name, sizeof(name), "pcie-phy%d", i); phy[i] = devm_phy_get(dev, name); @@ -926,6 +937,8 @@ static void dra7xx_pcie_shutdown(struct platform_device *pdev) pm_runtime_disable(dev); dra7xx_pcie_disable_phy(dra7xx); + + clk_disable_unprepare(dra7xx->clk); } static const struct dev_pm_ops dra7xx_pcie_pm_ops = { From 894682f0a9b34dcb6e8d2ee2d5f6380b24a5b2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 27 Sep 2021 15:43:56 +0200 Subject: [PATCH 018/512] PCI: xgene: Use PCI_VENDOR_ID_AMCC macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Header file linux/pci_ids.h defines AMCC vendor id (0x10e8) macro named PCI_VENDOR_ID_AMCC. So use this macro instead of driver custom macro. Link: https://lore.kernel.org/r/20210927134356.11799-1-pali@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/pci-xgene.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c index e64536047b65..56d0d50338c8 100644 --- a/drivers/pci/controller/pci-xgene.c +++ b/drivers/pci/controller/pci-xgene.c @@ -48,7 +48,6 @@ #define EN_COHERENCY 0xF0000000 #define EN_REG 0x00000001 #define OB_LO_IO 0x00000002 -#define XGENE_PCIE_VENDORID 0x10E8 #define XGENE_PCIE_DEVICEID 0xE004 #define SZ_1T (SZ_1G*1024ULL) #define PIPE_PHY_RATE_RD(src) ((0xc000 & (u32)(src)) >> 0xe) @@ -560,7 +559,7 @@ static int xgene_pcie_setup(struct xgene_pcie_port *port) xgene_pcie_clear_config(port); /* setup the vendor and device IDs correctly */ - val = (XGENE_PCIE_DEVICEID << 16) | XGENE_PCIE_VENDORID; + val = (XGENE_PCIE_DEVICEID << 16) | PCI_VENDOR_ID_AMCC; xgene_pcie_writel(port, BRIDGE_CFG_0, val); ret = xgene_pcie_map_ranges(port); From a2258831d12d7845946f9c98f08d65aad9aa1121 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 1 Sep 2021 14:09:17 +0900 Subject: [PATCH 019/512] PCI: endpoint: Use sysfs_emit() in "show" functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert sprintf() in sysfs "show" functions to sysfs_emit() in order to check for buffer overruns in sysfs outputs. 
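sysfs_emit() behaves like a sprintf() that knows the sysfs buffer is exactly PAGE_SIZE and warns if the buffer is not page aligned, so the change is a drop-in replacement in the "show" functions, e.g. (attribute and variable names are made up):

  static int example_value;  /* stand-in for the attribute's backing field */

  static ssize_t example_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
  {
          return sysfs_emit(buf, "%d\n", example_value);  /* was sprintf() */
  }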
Link: https://lore.kernel.org/r/1630472957-26857-1-git-send-email-hayashi.kunihiko@socionext.com Signed-off-by: Kunihiko Hayashi Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/endpoint/functions/pci-epf-ntb.c | 4 ++-- drivers/pci/endpoint/pci-ep-cfs.c | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-ntb.c b/drivers/pci/endpoint/functions/pci-epf-ntb.c index 8b4756159f15..99266f05739a 100644 --- a/drivers/pci/endpoint/functions/pci-epf-ntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-ntb.c @@ -1937,7 +1937,7 @@ static ssize_t epf_ntb_##_name##_show(struct config_item *item, \ struct config_group *group = to_config_group(item); \ struct epf_ntb *ntb = to_epf_ntb(group); \ \ - return sprintf(page, "%d\n", ntb->_name); \ + return sysfs_emit(page, "%d\n", ntb->_name); \ } #define EPF_NTB_W(_name) \ @@ -1968,7 +1968,7 @@ static ssize_t epf_ntb_##_name##_show(struct config_item *item, \ \ sscanf(#_name, "mw%d", &win_no); \ \ - return sprintf(page, "%lld\n", ntb->mws_size[win_no - 1]); \ + return sysfs_emit(page, "%lld\n", ntb->mws_size[win_no - 1]); \ } #define EPF_NTB_MW_W(_name) \ diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c index 999911801877..5a0394a38675 100644 --- a/drivers/pci/endpoint/pci-ep-cfs.c +++ b/drivers/pci/endpoint/pci-ep-cfs.c @@ -198,8 +198,7 @@ static ssize_t pci_epc_start_store(struct config_item *item, const char *page, static ssize_t pci_epc_start_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", - to_pci_epc_group(item)->start); + return sysfs_emit(page, "%d\n", to_pci_epc_group(item)->start); } CONFIGFS_ATTR(pci_epc_, start); @@ -321,7 +320,7 @@ static ssize_t pci_epf_##_name##_show(struct config_item *item, char *page) \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - return sprintf(page, "0x%04x\n", epf->header->_name); \ + return sysfs_emit(page, "0x%04x\n", epf->header->_name); \ } #define PCI_EPF_HEADER_W_u32(_name) \ @@ -390,8 +389,8 @@ static ssize_t pci_epf_msi_interrupts_store(struct config_item *item, static ssize_t pci_epf_msi_interrupts_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", - to_pci_epf_group(item)->epf->msi_interrupts); + return sysfs_emit(page, "%d\n", + to_pci_epf_group(item)->epf->msi_interrupts); } static ssize_t pci_epf_msix_interrupts_store(struct config_item *item, @@ -412,8 +411,8 @@ static ssize_t pci_epf_msix_interrupts_store(struct config_item *item, static ssize_t pci_epf_msix_interrupts_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", - to_pci_epf_group(item)->epf->msix_interrupts); + return sysfs_emit(page, "%d\n", + to_pci_epf_group(item)->epf->msix_interrupts); } PCI_EPF_HEADER_R(vendorid) From b860b9346e2d5667fbae2cefc571bdb6ce665b53 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 13 Sep 2021 16:08:33 +0200 Subject: [PATCH 020/512] s390/ftrace: remove dead code ftrace_shared_hotpatch_trampoline() never returns NULL, therefore quite a bit of code can be removed. 
Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ftrace.c | 86 +++------------------------------------ 1 file changed, 6 insertions(+), 80 deletions(-) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1d94ffdf347b..5d0c45c13b5f 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -80,17 +80,6 @@ asm( #ifdef CONFIG_MODULES static char *ftrace_plt; - -asm( - " .data\n" - "ftrace_plt_template:\n" - " basr %r1,%r0\n" - " lg %r1,0f-.(%r1)\n" - " br %r1\n" - "0: .quad ftrace_caller\n" - "ftrace_plt_template_end:\n" - " .previous\n" -); #endif /* CONFIG_MODULES */ static const char *ftrace_shared_hotpatch_trampoline(const char **end) @@ -116,7 +105,7 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) bool ftrace_need_init_nop(void) { - return ftrace_shared_hotpatch_trampoline(NULL); + return true; } int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) @@ -175,28 +164,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return 0; } -static void ftrace_generate_nop_insn(struct ftrace_insn *insn) -{ - /* brcl 0,0 */ - insn->opc = 0xc004; - insn->disp = 0; -} - -static void ftrace_generate_call_insn(struct ftrace_insn *insn, - unsigned long ip) -{ - unsigned long target; - - /* brasl r0,ftrace_caller */ - target = FTRACE_ADDR; -#ifdef CONFIG_MODULES - if (is_module_addr((void *)ip)) - target = (unsigned long)ftrace_plt; -#endif /* CONFIG_MODULES */ - insn->opc = 0xc005; - insn->disp = (target - ip) / 2; -} - static void brcl_disable(void *brcl) { u8 op = 0x04; /* set mask field to zero */ @@ -207,23 +174,7 @@ static void brcl_disable(void *brcl) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - struct ftrace_insn orig, new, old; - - if (ftrace_shared_hotpatch_trampoline(NULL)) { - brcl_disable((void *)rec->ip); - return 0; - } - - if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - /* Replace ftrace call with a nop. */ - ftrace_generate_call_insn(&orig, rec->ip); - ftrace_generate_nop_insn(&new); - - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) - return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + brcl_disable((void *)rec->ip); return 0; } @@ -236,23 +187,7 @@ static void brcl_enable(void *brcl) int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - struct ftrace_insn orig, new, old; - - if (ftrace_shared_hotpatch_trampoline(NULL)) { - brcl_enable((void *)rec->ip); - return 0; - } - - if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - /* Replace nop with an ftrace call. */ - ftrace_generate_nop_insn(&orig); - ftrace_generate_call_insn(&new, rec->ip); - - /* Verify that the to be replaced code matches what we expect. 
*/ - if (memcmp(&orig, &old, sizeof(old))) - return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + brcl_enable((void *)rec->ip); return 0; } @@ -269,10 +204,7 @@ int __init ftrace_dyn_arch_init(void) void arch_ftrace_update_code(int command) { - if (ftrace_shared_hotpatch_trampoline(NULL)) - ftrace_modify_all_code(command); - else - ftrace_run_stop_machine(command); + ftrace_modify_all_code(command); } static void __ftrace_sync(void *dummy) @@ -281,10 +213,8 @@ static void __ftrace_sync(void *dummy) int ftrace_arch_code_modify_post_process(void) { - if (ftrace_shared_hotpatch_trampoline(NULL)) { - /* Send SIGP to the other CPUs, so they see the new code. */ - smp_call_function(__ftrace_sync, NULL, 1); - } + /* Send SIGP to the other CPUs, so they see the new code. */ + smp_call_function(__ftrace_sync, NULL, 1); return 0; } @@ -299,10 +229,6 @@ static int __init ftrace_plt_init(void) panic("cannot allocate ftrace plt\n"); start = ftrace_shared_hotpatch_trampoline(&end); - if (!start) { - start = ftrace_plt_template; - end = ftrace_plt_template_end; - } memcpy(ftrace_plt, start, end - start); set_memory_ro((unsigned long)ftrace_plt, 1); return 0; From 4df898dc06da83052c73a2ce9a6a4df5640a0905 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 6 Sep 2021 11:25:38 +0200 Subject: [PATCH 021/512] s390/kprobes: add sanity check Check whether the specified address points to the start of an instruction to prevent users from setting a kprobe in the mid of an instruction which would crash the kernel. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/kprobes.c | 48 +++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 52d056a5f89f..0093c326a239 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -120,9 +120,55 @@ static void s390_free_insn_slot(struct kprobe *p) } NOKPROBE_SYMBOL(s390_free_insn_slot); +/* Check if paddr is at an instruction boundary */ +static bool can_probe(unsigned long paddr) +{ + unsigned long addr, offset = 0; + kprobe_opcode_t insn; + struct kprobe *kp; + + if (paddr & 0x01) + return false; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return false; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn))) + return false; + + if (insn >> 8 == 0) { + if (insn != BREAKPOINT_INSTRUCTION) { + /* + * Note that QEMU inserts opcode 0x0000 to implement + * software breakpoints for guests. Since the size of + * the original instruction is unknown, stop following + * instructions and prevent setting a kprobe. + */ + return false; + } + /* + * Check if the instruction has been modified by another + * kprobe, in which case the original instruction is + * decoded. 
+ */ + kp = get_kprobe((void *)addr); + if (!kp) { + /* not a kprobe */ + return false; + } + insn = kp->opcode; + } + addr += insn_length(insn >> 8); + } + return addr == paddr; +} + int arch_prepare_kprobe(struct kprobe *p) { - if ((unsigned long) p->addr & 0x01) + if (!can_probe((unsigned long)p->addr)) return -EINVAL; /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) From 1c8174fdc798489159a79466fca782daa231219a Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 7 Sep 2021 16:43:29 +0200 Subject: [PATCH 022/512] s390/pci: tolerate inconsistent handle in recover Since commit 8256adda1f44 ("s390/pci: handle FH state mismatch only on disable") zpci_disable_device() returns -EINVAL when the platform detects an attempt to disable a PCI function that it sees as already disabled. In most situations we want to abort whenever this happens and abort is possible since it either means that the device vanished but we haven't gotten an availability event yet, or the FH got out of sync which should not happen. Unfortunately there is an inconsistency between the LPAR and z/VM hypervisors on whether error events for PCI functions contain an an enabled or a general handle. So under z/VM it can happen that our most up to date function handle is enabled but trying to disable the function results in the aforementioned error. Since recover is designed to be used to recover functions from the error state let's make it robust to this inconsistency by explicitly treating it as a successful disable. Acked-by: Pierre Morel Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 335c281811c7..cae280e5c047 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -90,6 +90,14 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, if (zdev_enabled(zdev)) { ret = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error + * state the FH may indicate an enabled device but + * disable says the device is already disabled don't + * treat it as an error here. + */ + if (ret == -EINVAL) + ret = 0; if (ret) goto out; } From fa172f043f5bc21c357c54a6ca2e9c8acd18c3db Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Wed, 15 Sep 2021 13:39:16 +0200 Subject: [PATCH 023/512] s390/cio: unregister the subchannel while purging The cio_ignore list is used to create and maintain the list of devices which is to be ignored by Linux. During boot-time, this list is adjusted and accommodate all the devices which are configured on the HMC interface. Once these devices are accessible, they are then available to Linux and set online. cio_ignore purge function should align with this functionality. But currently, the subchannel associated with the offline-devices are not unregistered during purge. Add an explicit subchannel-unregister function in the purge_fn callback. 
Signed-off-by: Vineeth Vijayan Reviewed-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- drivers/s390/cio/device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 8d14569823d7..07a17613fab5 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1322,6 +1322,7 @@ static int purge_fn(struct device *dev, void *data) { struct ccw_device *cdev = to_ccwdev(dev); struct ccw_dev_id *id = &cdev->private->dev_id; + struct subchannel *sch = to_subchannel(cdev->dev.parent); spin_lock_irq(cdev->ccwlock); if (is_blacklisted(id->ssid, id->devno) && @@ -1330,6 +1331,7 @@ static int purge_fn(struct device *dev, void *data) CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", id->ssid, id->devno); ccw_device_sched_todo(cdev, CDEV_TODO_UNREG); + css_sched_sch_todo(sch, SCH_TODO_UNREG); atomic_set(&cdev->private->onoff, 0); } spin_unlock_irq(cdev->ccwlock); From 6526a597a2e856df9ae94512f9903caccd5196d6 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 15 Sep 2021 17:08:44 +0200 Subject: [PATCH 024/512] s390/pci: add simpler s390dbf traces for events We often need to figure out what operations were performed in response to an error or availability event. The operations are easily accessible in s390dbf/pci_msg but the events have to be correlated with these from either the kernel log or s390dbf/pci_err. Improve this situation by logging the most important data from error and availability events that is the FID, PEC and FH together with the operations. Reviewed-by: Matthew Rosato Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_event.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index c856f80cb21b..461bfb12a3a3 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -52,6 +52,8 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; + zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("error CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -96,6 +98,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); enum zpci_state state; + zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); From 0c3812c347bfb0dc213556a195e79850c55702f5 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Fri, 17 Sep 2021 15:04:01 +0200 Subject: [PATCH 025/512] s390/cio: derive cdev information only for IO-subchannels cdev->online for the purge function must not be checked for the non-IO subchannel type. Make sure that we are deriving the cdev only from sch-type SUBCHANNEL_TYPE_IO. 
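Condensed, the added guard reads as follows (only IO subchannels carry a ccw_device, so sch_get_cdev() is meaningful only for SUBCHANNEL_TYPE_IO; excerpted from the hunk below):

	if (sch->st == SUBCHANNEL_TYPE_IO) {
		cdev = sch_get_cdev(sch);
		if (cdev && cdev->online)
			idset_sch_del(set, sch->schid);
	}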
Signed-off-by: Vineeth Vijayan Reviewed-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- drivers/s390/cio/css.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 44461928aab8..2bc55ccf3f23 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -792,10 +792,13 @@ static int __unset_online(struct device *dev, void *data) { struct idset *set = data; struct subchannel *sch = to_subchannel(dev); - struct ccw_device *cdev = sch_get_cdev(sch); + struct ccw_device *cdev; - if (cdev && cdev->online) - idset_sch_del(set, sch->schid); + if (sch->st == SUBCHANNEL_TYPE_IO) { + cdev = sch_get_cdev(sch); + if (cdev && cdev->online) + idset_sch_del(set, sch->schid); + } return 0; } From 54235d5cfea05f2891dca71d51d7ab097b53d22b Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 23 Sep 2021 12:42:02 +0200 Subject: [PATCH 026/512] s390/sclp_sd: fix warnings about missing parameter description Fix these warnings that are reported when compiling with W=1: drivers/s390/char/sclp_sd.c:132: warning: Function parameter or member 'listener' not described in 'sclp_sd_listener_init' drivers/s390/char/sclp_sd.c:408: warning: Function parameter or member 'cookie' not described in 'sclp_sd_file_update_async' drivers/s390/char/sclp_sd.c:422: warning: Function parameter or member 'attr' not described in 'reload_store' drivers/s390/char/sclp_sd.c:422: warning: Function parameter or member 'buf' not described in 'reload_store' drivers/s390/char/sclp_sd.c:422: warning: Function parameter or member 'count' not described in 'reload_store' drivers/s390/char/sclp_sd.c:457: warning: Function parameter or member 'file' not described in 'data_read' drivers/s390/char/sclp_sd.c:457: warning: Function parameter or member 'attr' not described in 'data_read' Signed-off-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- drivers/s390/char/sclp_sd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/s390/char/sclp_sd.c b/drivers/s390/char/sclp_sd.c index 1e244f78f192..a5dd4e9f5b1b 100644 --- a/drivers/s390/char/sclp_sd.c +++ b/drivers/s390/char/sclp_sd.c @@ -122,6 +122,7 @@ static void sclp_sd_listener_remove(struct sclp_sd_listener *listener) /** * sclp_sd_listener_init() - Initialize a Store Data response listener + * @listener: Response listener to initialize * @id: Event ID to listen for * * Initialize a listener for asynchronous Store Data responses. This listener @@ -403,6 +404,7 @@ static int sclp_sd_file_update(struct sclp_sd_file *sd_file) /** * sclp_sd_file_update_async() - Wrapper for asynchronous update call * @data: Object to update + * @cookie: Unused */ static void sclp_sd_file_update_async(void *data, async_cookie_t cookie) { @@ -414,6 +416,9 @@ static void sclp_sd_file_update_async(void *data, async_cookie_t cookie) /** * reload_store() - Store function for "reload" sysfs attribute * @kobj: Kobject of sclp_sd_file object + * @attr: Reload attribute + * @buf: Data written to sysfs attribute + * @count: Count of bytes written * * Initiate a reload of the data associated with an sclp_sd_file object. 
*/ @@ -441,8 +446,10 @@ static struct kobj_type sclp_sd_file_ktype = { }; /** - * data_read() - Read function for "read" sysfs attribute + * data_read() - Read function for "data" sysfs attribute + * @file: Open file pointer * @kobj: Kobject of sclp_sd_file object + * @attr: Data attribute * @buffer: Target buffer * @off: Requested file offset * @size: Requested number of bytes From f768a20c0a6e5f2396b9ab99bbbfd39d91228df9 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 24 Sep 2021 15:03:07 +0200 Subject: [PATCH 027/512] s390/ftrace: add FTRACE_GEN_NOP_ASM macro FTRACE_GEN_NOP_ASM(name) can be used to generate assembly functions with the required information added to allow tracing via kprobes/ftrace. It adds the nop instruction which will be patched by ftrace later. If the compiler supports -mnop-mcount it will also add an entry to the __mcount_loc section. Signed-off-by: Sven Schnelle Acked-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ftrace.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index e8b460f39c58..d1841f4176be 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -68,4 +68,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym, } #endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */ + +#ifndef CC_USING_HOTPATCH + +#define FTRACE_GEN_MCOUNT_RECORD(name) \ + .section __mcount_loc, "a", @progbits; \ + .quad name; \ + .previous; + +#else /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_MCOUNT_RECORD(name) + +#endif /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_NOP_ASM(name) \ + FTRACE_GEN_MCOUNT_RECORD(name) \ + FTRACE_NOP_INSN + +#else /* CONFIG_FUNCTION_TRACER */ + +#define FTRACE_GEN_NOP_ASM(name) + +#endif /* CONFIG_FUNCTION_TRACER */ + #endif /* _ASM_S390_FTRACE_H */ From d340d28a968ec479d0ed3c38ab716ed821d82ad8 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 9 Sep 2021 20:59:17 +0200 Subject: [PATCH 028/512] kprobes: add testcases for s390 Add a few testcases to make sure that it's not possible to place a kprobe in the mid of an instruction on s390. Signed-off-by: Sven Schnelle Acked-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 12 +++++ arch/s390/configs/debug_defconfig | 1 + arch/s390/lib/Makefile | 2 + arch/s390/lib/test_kprobes.c | 75 +++++++++++++++++++++++++++++++ arch/s390/lib/test_kprobes.h | 10 +++++ arch/s390/lib/test_kprobes_asm.S | 45 +++++++++++++++++++ 6 files changed, 145 insertions(+) create mode 100644 arch/s390/lib/test_kprobes.c create mode 100644 arch/s390/lib/test_kprobes.h create mode 100644 arch/s390/lib/test_kprobes_asm.S diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b86de61b8caa..4d63662a919d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -946,4 +946,16 @@ config S390_UNWIND_SELFTEST Say N if you are unsure. +config S390_KPROBES_SANITY_TEST + def_tristate n + prompt "Enable s390 specific kprobes tests" + depends on KPROBES + depends on KUNIT + help + This option enables an s390 specific kprobes test module. This option + is not useful for distributions or general kernels, but only for kernel + developers working on architecture code. + + Say N if you are unsure. 
+ endmenu diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6aad18ee131d..1818d67c2c1c 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -62,6 +62,7 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_S390_UNWIND_SELFTEST=y +CONFIG_S390_KPROBES_SANITY_TEST=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 678333936f78..707cd4622c13 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -7,6 +7,8 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o obj-y += mem.o xor.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o +obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o +test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o # Instrumenting memory accesses to __user data (in different address space) # produce false positives diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c new file mode 100644 index 000000000000..9e62d62812e5 --- /dev/null +++ b/arch/s390/lib/test_kprobes.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include "test_kprobes.h" + +static struct kprobe kp; + +static void setup_kprobe(struct kunit *test, struct kprobe *kp, + const char *symbol, int offset) +{ + kp->offset = offset; + kp->addr = NULL; + kp->symbol_name = symbol; +} + +static void test_kprobe_offset(struct kunit *test, struct kprobe *kp, + const char *target, int offset) +{ + int ret; + + setup_kprobe(test, kp, target, 0); + ret = register_kprobe(kp); + if (!ret) + unregister_kprobe(kp); + KUNIT_EXPECT_EQ(test, 0, ret); + setup_kprobe(test, kp, target, offset); + ret = register_kprobe(kp); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + if (!ret) + unregister_kprobe(kp); +} + +static void test_kprobe_odd(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_odd", + kprobes_target_odd_offs); +} + +static void test_kprobe_in_insn4(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn4", + kprobes_target_in_insn4_offs); +} + +static void test_kprobe_in_insn6_lo(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo", + kprobes_target_in_insn6_lo_offs); +} + +static void test_kprobe_in_insn6_hi(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi", + kprobes_target_in_insn6_hi_offs); +} + +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe_odd), + KUNIT_CASE(test_kprobe_in_insn4), + KUNIT_CASE(test_kprobe_in_insn6_lo), + KUNIT_CASE(test_kprobe_in_insn6_hi), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_s390", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h new file mode 100644 index 000000000000..2b4c9bc337f1 --- /dev/null +++ b/arch/s390/lib/test_kprobes.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +extern unsigned long kprobes_target_odd_offs; +extern unsigned long kprobes_target_in_insn4_offs; +extern unsigned long kprobes_target_in_insn6_lo_offs; +extern unsigned long kprobes_target_in_insn6_hi_offs; + +#endif diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S new file mode 100644 index 000000000000..ade7a3042334 --- /dev/null +++ 
b/arch/s390/lib/test_kprobes_asm.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include +#include + +#define KPROBES_TARGET_START(name) \ + SYM_FUNC_START(name); \ + FTRACE_GEN_NOP_ASM(name) + +#define KPROBES_TARGET_END(name) \ + SYM_FUNC_END(name); \ + SYM_DATA(name##_offs, .quad 1b - name) + +KPROBES_TARGET_START(kprobes_target_in_insn4) + .word 0x4700 // bc 0,0 +1: .word 0x0000 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn4) + +KPROBES_TARGET_START(kprobes_target_in_insn6_lo) + .word 0xe310 // ly 1,0 +1: .word 0x0000 + .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_lo) + +KPROBES_TARGET_START(kprobes_target_in_insn6_hi) + .word 0xe310 // ly 1,0 + .word 0x0000 +1: .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_hi) + +KPROBES_TARGET_START(kprobes_target_bp) + nop + .word 0x0000 + nop +1: br %r14 +KPROBES_TARGET_END(kprobes_target_bp) + +KPROBES_TARGET_START(kprobes_target_odd) + .byte 0x07 +1: .byte 0x07 + br %r14 +KPROBES_TARGET_END(kprobes_target_odd) From bca2d0428e3d83b1a39ec46033e69fba8624280f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Sep 2021 14:56:47 -0700 Subject: [PATCH 029/512] s390/sclp_vt220: fix unused function warning When CONFIG_SCLP_VT220_TTY=y and CONFIG_SCLP_VT220_CONSOLE is not set: ../drivers/s390/char/sclp_vt220.c:771:13: warning: '__sclp_vt220_flush_buffer' defined but not used [-Wunused-function] 771 | static void __sclp_vt220_flush_buffer(void) so move this function inside the #ifdef block where it is used. Signed-off-by: Randy Dunlap Cc: Vasily Gorbik Cc: Christian Borntraeger Link: https://lore.kernel.org/r/20210927215647.11506-1-rdunlap@infradead.org Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- drivers/s390/char/sclp_vt220.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 29a6a0099f83..7bc4e4a10937 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -768,6 +768,8 @@ out_driver: } __initcall(sclp_vt220_tty_init); +#ifdef CONFIG_SCLP_VT220_CONSOLE + static void __sclp_vt220_flush_buffer(void) { unsigned long flags; @@ -784,8 +786,6 @@ static void __sclp_vt220_flush_buffer(void) spin_unlock_irqrestore(&sclp_vt220_lock, flags); } -#ifdef CONFIG_SCLP_VT220_CONSOLE - static void sclp_vt220_con_write(struct console *con, const char *buf, unsigned int count) { From 584315ed87a7dce663ef3f07956b5f363f83c7bd Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 3 Aug 2021 19:42:32 +0200 Subject: [PATCH 030/512] s390/boot: initialize control registers in decompressor Partially revert commit 4555b9f34296 ("s390/boot: move dma sections from decompressor to decompressed kernel"). This is a prerequisite to allow initialization of virtual memory in decompressor and avoid overwriting of ASCEs in the decompressed kernel otherwise. Since the control registers 2, 5 and 15 are reinitialized in the decompressed kernel again, this change does not prevent relocating of amode31 section in any way. 
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/boot/head.S | 17 +++++++++++++++++ arch/s390/kernel/head64.S | 18 ------------------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 40f4cff538b8..7e843d8e794e 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -317,6 +317,7 @@ SYM_CODE_START_LOCAL(startup_normal) xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 + lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) @@ -335,6 +336,22 @@ SYM_CODE_END(startup_normal) .quad 0x0000000180000000,startup_pgm_check_handler .Lio_new_psw: .quad 0x0002000180000000,0x1f0 # disabled wait +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad 0 # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad 0 # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0x0000000000008000 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad 0 # cr15: linkage stack operations #include "head_kdump.S" diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 114b5490ad8e..42f9a325a257 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -20,8 +20,6 @@ __HEAD ENTRY(startup_continue) larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK - larl %r13,.LPG1 # get base - lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers # # Setup stack # @@ -42,19 +40,3 @@ ENTRY(startup_continue) .align 16 .LPG1: .Ldw: .quad 0x0002000180000000,0x0000000000000000 -.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space - .quad 0 # cr1: primary space segment table - .quad 0 # cr2: dispatchable unit control table - .quad 0 # cr3: instruction authorization - .quad 0xffff # cr4: instruction authorization - .quad 0 # cr5: primary-aste origin - .quad 0 # cr6: I/O interrupts - .quad 0 # cr7: secondary space segment table - .quad 0x0000000000008000 # cr8: access registers translation - .quad 0 # cr9: tracing off - .quad 0 # cr10: tracing off - .quad 0 # cr11: tracing off - .quad 0 # cr12: tracing off - .quad 0 # cr13: home space segment table - .quad 0xc0000000 # cr14: machine check handling off - .quad 0 # cr15: linkage stack operations From e3ec8e0f5711d73f7e5d5c3cffdf4fad4f1487b9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 27 Sep 2021 14:18:26 +0200 Subject: [PATCH 031/512] s390/boot: allocate amode31 section in decompressor The memory for amode31 section is allocated from the decompressed kernel. Instead, allocate that memory from the decompressor. This is a prerequisite to allow initialization of the virtual memory before the decompressed kernel takes over. 
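The allocation follows the decompressor's usual pattern of advancing a safe-address cursor; condensed from the hunks below:

	static unsigned long reserve_amode31(unsigned long safe_addr)
	{
		__amode31_base = PAGE_ALIGN(safe_addr);
		return safe_addr + vmlinux.amode31_size;
	}

	/* in startup_kernel(), before the IPL report is read: */
	safe_addr = mem_safe_offset();
	safe_addr = reserve_amode31(safe_addr);
	safe_addr = read_ipl_report(safe_addr);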
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/boot/compressed/decompressor.h | 1 + arch/s390/boot/startup.c | 8 ++++++++ arch/s390/kernel/entry.h | 1 + arch/s390/kernel/setup.c | 22 +++++++++------------- arch/s390/kernel/vmlinux.lds.S | 1 + 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h index a59f75c5b049..f75cc31a77dd 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/compressed/decompressor.h @@ -24,6 +24,7 @@ struct vmlinux_info { unsigned long dynsym_start; unsigned long rela_dyn_start; unsigned long rela_dyn_end; + unsigned long amode31_size; }; /* Symbols defined by linker scripts */ diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 6dc8d0a53864..7571dee72a0c 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -15,6 +15,7 @@ #include "uv.h" unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata(__amode31_base); unsigned long __bootdata_preserved(VMALLOC_START); unsigned long __bootdata_preserved(VMALLOC_END); struct page *__bootdata_preserved(vmemmap); @@ -259,6 +260,12 @@ static void offset_vmlinux_info(unsigned long offset) vmlinux.dynsym_start += offset; } +static unsigned long reserve_amode31(unsigned long safe_addr) +{ + __amode31_base = PAGE_ALIGN(safe_addr); + return safe_addr + vmlinux.amode31_size; +} + void startup_kernel(void) { unsigned long random_lma; @@ -273,6 +280,7 @@ void startup_kernel(void) setup_lpp(); store_ipl_parmblock(); safe_addr = mem_safe_offset(); + safe_addr = reserve_amode31(safe_addr); safe_addr = read_ipl_report(safe_addr); uv_query_info(); rescue_initrd(safe_addr); diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 7f2696e8d511..6083090be1f4 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -70,5 +70,6 @@ extern struct exception_table_entry _stop_amode31_ex_table[]; #define __amode31_data __section(".amode31.data") #define __amode31_ref __section(".amode31.refs") extern long _start_amode31_refs[], _end_amode31_refs[]; +extern unsigned long __amode31_base; #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 67e5fff96ee0..191fc96a41b2 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -95,10 +95,10 @@ EXPORT_SYMBOL(console_irq); * relocated above 2 GB, because it has to use 31 bit addresses. * Such code and data is part of the .amode31 section. 
*/ -unsigned long __amode31_ref __samode31 = __pa(&_samode31); -unsigned long __amode31_ref __eamode31 = __pa(&_eamode31); -unsigned long __amode31_ref __stext_amode31 = __pa(&_stext_amode31); -unsigned long __amode31_ref __etext_amode31 = __pa(&_etext_amode31); +unsigned long __amode31_ref __samode31 = (unsigned long)&_samode31; +unsigned long __amode31_ref __eamode31 = (unsigned long)&_eamode31; +unsigned long __amode31_ref __stext_amode31 = (unsigned long)&_stext_amode31; +unsigned long __amode31_ref __etext_amode31 = (unsigned long)&_etext_amode31; struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; @@ -149,6 +149,7 @@ struct mem_detect_info __bootdata(mem_detect); struct initrd_data __bootdata(initrd_data); unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata(__amode31_base); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); u64 __bootdata_preserved(stfle_fac_list[16]); @@ -808,6 +809,7 @@ static void __init reserve_kernel(void) memblock_reserve(0, STARTUP_NORMAL_OFFSET); memblock_reserve((unsigned long)sclp_early_sccb, EXT_SCCB_READ_SCP); + memblock_reserve(__amode31_base, __eamode31 - __samode31); memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - (unsigned long)_stext); } @@ -831,20 +833,14 @@ static void __init setup_memory(void) static void __init relocate_amode31_section(void) { - unsigned long amode31_addr, amode31_size; - long amode31_offset; + unsigned long amode31_size = __eamode31 - __samode31; + long amode31_offset = __amode31_base - __samode31; long *ptr; - /* Allocate a new AMODE31 capable memory region */ - amode31_size = __eamode31 - __samode31; pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); - amode31_addr = (unsigned long)memblock_alloc_low(amode31_size, PAGE_SIZE); - if (!amode31_addr) - panic("Failed to allocate memory for AMODE31 section\n"); - amode31_offset = amode31_addr - __samode31; /* Move original AMODE31 section to the new one */ - memmove((void *)amode31_addr, (void *)__samode31, amode31_size); + memmove((void *)__amode31_base, (void *)__samode31, amode31_size); /* Zero out the old AMODE31 section to catch invalid accesses within it */ memset((void *)__samode31, 0, amode31_size); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 63bdb9e1bfc1..42c43521878f 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -212,6 +212,7 @@ SECTIONS QUAD(__dynsym_start) /* dynsym_start */ QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(_eamode31 - _samode31) /* amode31_size */ } :NONE /* Debugging sections. */ From 11dfe199eb31079a6f2517a59c380ad55f156696 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Mon, 27 Sep 2021 15:02:30 -0700 Subject: [PATCH 032/512] s390/block/dasd_genhd: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. Be sure to call dasd_gendisk_free() on error. 
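The same error-handling pattern is applied here and in the two follow-up patches; condensed from the hunk below:

	rc = device_add_disk(&base->cdev->dev, block->gdp, NULL);
	if (rc) {
		/* undo the gendisk setup done earlier in this function */
		dasd_gendisk_free(block);
		return rc;
	}
	return 0;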
Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20210927220232.1071926-5-mcgrof@kernel.org Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- drivers/s390/block/dasd_genhd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index fa966e0db6ca..80673dbfb1f9 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -33,7 +33,7 @@ int dasd_gendisk_alloc(struct dasd_block *block) { struct gendisk *gdp; struct dasd_device *base; - int len; + int len, rc; /* Make sure the minor for this device exists. */ base = block->base; @@ -79,7 +79,13 @@ int dasd_gendisk_alloc(struct dasd_block *block) dasd_add_link_to_gendisk(gdp, base); block->gdp = gdp; set_capacity(block->gdp, 0); - device_add_disk(&base->cdev->dev, block->gdp, NULL); + + rc = device_add_disk(&base->cdev->dev, block->gdp, NULL); + if (rc) { + dasd_gendisk_free(block); + return rc; + } + return 0; } From 1a5db707c859a4f63c1066c5b88864d3f1c21c12 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 27 Sep 2021 15:02:31 -0700 Subject: [PATCH 033/512] s390/block/dcssblk: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. Signed-off-by: Luis Chamberlain Signed-off-by: Gerald Schaefer Link: https://lore.kernel.org/r/20210927220232.1071926-6-mcgrof@kernel.org Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- drivers/s390/block/dcssblk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5be3d1c39a78..0741a9321712 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -696,7 +696,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } get_device(&dev_info->dev); - device_add_disk(&dev_info->dev, dev_info->gd, NULL); + rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL); + if (rc) + goto out_dax; switch (dev_info->segment_type) { case SEG_TYPE_SR: @@ -712,6 +714,10 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char rc = count; goto out; +out_dax: + put_device(&dev_info->dev); + kill_dax(dev_info->dax_dev); + put_dax(dev_info->dax_dev); put_dev: list_del(&dev_info->lh); blk_cleanup_disk(dev_info->gd); From f367c7d9fb326996862f6b5cf2aff7a2df64692d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Mon, 27 Sep 2021 15:02:32 -0700 Subject: [PATCH 034/512] s390/block/scm_blk: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. 
Acked-by: Heiko Carstens Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20210927220232.1071926-7-mcgrof@kernel.org Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- drivers/s390/block/scm_blk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 88cba6212ee2..61ecdcb2cc6a 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -495,9 +495,14 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) /* 512 byte sectors */ set_capacity(bdev->gendisk, scmdev->size >> 9); - device_add_disk(&scmdev->dev, bdev->gendisk, NULL); + ret = device_add_disk(&scmdev->dev, bdev->gendisk, NULL); + if (ret) + goto out_cleanup_disk; + return 0; +out_cleanup_disk: + blk_cleanup_disk(bdev->gendisk); out_tag: blk_mq_free_tag_set(&bdev->tag_set); out: From 65315ec52c9bd518e22fde1c9ce1e787b9122b4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Sun, 3 Oct 2021 02:54:39 +0000 Subject: [PATCH 035/512] PCI: imx6: Remove unused assignment to variable ret MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the maximum link speed was set following an "fsl,max-link-speed" property read, and should the read failed, then the PCIe generation was manually set to PCIe Gen1 and thus limiting the link speed to 2.5 GT/s. Code refactoring completed in the commit 39bc5006501c ("PCI: dwc: Centralize link gen setting") changed to the logic that was previously used to limit the maximum link speed leaving behind an unused assignment to a variable "ret". Since the value returned from the of_property_read_u32() and stored in the variable "ret" is never used in any meaningful way, and it's also immediately reassigned in the code that follows, the assignment can be removed. Link: https://lore.kernel.org/r/20211003025439.84783-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pci-imx6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 80fc98acf097..26f49f797b0f 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -1132,7 +1132,7 @@ static int imx6_pcie_probe(struct platform_device *pdev) /* Limit link speed */ pci->link_gen = 1; - ret = of_property_read_u32(node, "fsl,max-link-speed", &pci->link_gen); + of_property_read_u32(node, "fsl,max-link-speed", &pci->link_gen); imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie"); if (IS_ERR(imx6_pcie->vpcie)) { From 460275f124fb072dca218a6b43b6370eebbab20d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:40 +0200 Subject: [PATCH 036/512] PCI: Add PCI_EXP_DEVCTL_PAYLOAD_* macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a macro PCI_EXP_DEVCTL_PAYLOAD_* for every possible Max Payload Size in linux/pci_regs.h, in the same style as PCI_EXP_DEVCTL_READRQ_*. 
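For a driver that goes through the standard config space accessors, programming Max_Payload_Size with the new macros would look roughly like this (an illustrative sketch only, not part of this patch; "pdev" stands for the struct pci_dev being configured, and the aardvark driver in the next patch writes through its own memory-mapped view of the config space instead):

	/* pdev is the device whose Max_Payload_Size is being set (illustrative) */
	pcie_capability_clear_and_set_word(pdev, PCI_EXP_DEVCTL,
					   PCI_EXP_DEVCTL_PAYLOAD,
					   PCI_EXP_DEVCTL_PAYLOAD_512B);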
Link: https://lore.kernel.org/r/20211005180952.6812-2-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Reviewed-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index e709ae8235e7..ff6ccbc6efe9 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -504,6 +504,12 @@ #define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */ #define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */ #define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */ +#define PCI_EXP_DEVCTL_PAYLOAD_128B 0x0000 /* 128 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_256B 0x0020 /* 256 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_512B 0x0040 /* 512 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_1024B 0x0060 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_2048B 0x0080 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_4096B 0x00a0 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ #define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */ #define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ From a4e17d65dafdd3513042d8f00404c9b6068a825c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:41 +0200 Subject: [PATCH 037/512] PCI: aardvark: Fix PCIe Max Payload Size setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change PCIe Max Payload Size setting in PCIe Device Control register to 512 bytes to align with PCIe Link Initialization sequence as defined in Marvell Armada 3700 Functional Specification. According to the specification, maximal Max Payload Size supported by this device is 512 bytes. 
Without this kernel prints suspicious line: pci 0000:01:00.0: Upstream bridge's Max Payload Size set to 256 (was 16384, max 512) With this change it changes to: pci 0000:01:00.0: Upstream bridge's Max Payload Size set to 256 (was 512, max 512) Link: https://lore.kernel.org/r/20211005180952.6812-3-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 596ebcfcc82d..884510630bae 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -488,8 +488,9 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); reg &= ~PCI_EXP_DEVCTL_RELAX_EN; reg &= ~PCI_EXP_DEVCTL_NOSNOOP_EN; + reg &= ~PCI_EXP_DEVCTL_PAYLOAD; reg &= ~PCI_EXP_DEVCTL_READRQ; - reg |= PCI_EXP_DEVCTL_PAYLOAD; /* Set max payload size */ + reg |= PCI_EXP_DEVCTL_PAYLOAD_512B; reg |= PCI_EXP_DEVCTL_READRQ_512B; advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); From 464de7e7fff767e87429cd7be09c4f2cb50a6ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 5 Oct 2021 20:09:42 +0200 Subject: [PATCH 038/512] PCI: aardvark: Don't spam about PIO Response Status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use dev_dbg() instead of dev_err() in advk_pcie_check_pio_status(). For example CRS is not an error status, it just says that the request should be retried. Link: https://lore.kernel.org/r/20211005180952.6812-4-kabel@kernel.org Fixes: 8c39d710363c1 ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-aardvark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 884510630bae..a7903c531aa3 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -683,7 +683,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 else str_posted = "Posted"; - dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n", + dev_dbg(dev, "%s PIO Response Status: %s, %#x @ %#x\n", str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); return -EFAULT; From d419052bc6c60fa4ab2b5a51d5f1e55a66e2b4ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:43 +0200 Subject: [PATCH 039/512] PCI: aardvark: Fix preserving PCI_EXP_RTCTL_CRSSVE flag on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") started using CRSSVE flag for handling CRS responses. PCI_EXP_RTCTL_CRSSVE flag is stored only in emulated config space buffer and there is handler for PCI_EXP_RTCTL register. So every read operation from config space automatically clears CRSSVE flag as it is not defined in PCI_EXP_RTCTL read handler. Fix this by reading current CRSSVE bit flag from emulated space buffer and appending it to PCI_EXP_RTCTL read response. 
Link: https://lore.kernel.org/r/20211005180952.6812-5-kabel@kernel.org Fixes: 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún --- drivers/pci/controller/pci-aardvark.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index a7903c531aa3..bb57ca6aed35 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -724,6 +724,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_EXP_RTCTL: { u32 val = advk_readl(pcie, PCIE_ISR0_MASK_REG); *value = (val & PCIE_MSG_PM_PME_MASK) ? 0 : PCI_EXP_RTCTL_PMEIE; + *value |= le16_to_cpu(bridge->pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE; *value |= PCI_EXP_RTCAP_CRSVIS << 16; return PCI_BRIDGE_EMUL_HANDLED; } From 46ef6090dbf590711cb12680b6eafde5fa21fe87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:44 +0200 Subject: [PATCH 040/512] PCI: aardvark: Fix configuring Reference clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 366697018c9a ("PCI: aardvark: Add PHY support") introduced configuration of PCIe Reference clock via PCIE_CORE_REF_CLK_REG register, but did it incorrectly. PCIe Reference clock differential pair is routed from system board to endpoint card, so on CPU side it has output direction. Therefore it is required to enable transmitting and disable receiving. Default configuration according to Armada 3700 Functional Specifications is enabled receiver part and disabled transmitter. We need this change because otherwise PCIe Reference clock is configured to some undefined state when differential pair is used for both transmitting and receiving. Fix this by disabling receiver part. Link: https://lore.kernel.org/r/20211005180952.6812-6-kabel@kernel.org Fixes: 366697018c9a ("PCI: aardvark: Add PHY support") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index bb57ca6aed35..d5d6f92e5143 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -99,6 +99,7 @@ #define PCIE_CORE_CTRL2_MSI_ENABLE BIT(10) #define PCIE_CORE_REF_CLK_REG (CONTROL_BASE_ADDR + 0x14) #define PCIE_CORE_REF_CLK_TX_ENABLE BIT(1) +#define PCIE_CORE_REF_CLK_RX_ENABLE BIT(2) #define PCIE_MSG_LOG_REG (CONTROL_BASE_ADDR + 0x30) #define PCIE_ISR0_REG (CONTROL_BASE_ADDR + 0x40) #define PCIE_MSG_PM_PME_MASK BIT(7) @@ -451,9 +452,15 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) u32 reg; int i; - /* Enable TX */ + /* + * Configure PCIe Reference clock. Direction is from the PCIe + * controller to the endpoint card, so enable transmitting of + * Reference clock differential signal off-chip and disable + * receiving off-chip differential signal. 
+ */ reg = advk_readl(pcie, PCIE_CORE_REF_CLK_REG); reg |= PCIE_CORE_REF_CLK_TX_ENABLE; + reg &= ~PCIE_CORE_REF_CLK_RX_ENABLE; advk_writel(pcie, reg, PCIE_CORE_REF_CLK_REG); /* Set to Direct mode */ From a7ca6d7fa3c02c032db5440ff392d96c04684c21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:45 +0200 Subject: [PATCH 041/512] PCI: aardvark: Do not clear status bits of masked interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCIE_ISR1_REG says which interrupts are currently set / active, including those which are masked. The driver currently reads this register and looks if some unmasked interrupts are active, and if not, it clears status bits of _all_ interrupts, including the masked ones. This is incorrect, since, for example, some drivers may poll these bits. Remove this clearing, and also remove this early return statement completely, since it does not change functionality in any way. Link: https://lore.kernel.org/r/20211005180952.6812-7-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index d5d6f92e5143..f7eebf453e83 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -1295,12 +1295,6 @@ static void advk_pcie_handle_int(struct advk_pcie *pcie) isr1_mask = advk_readl(pcie, PCIE_ISR1_MASK_REG); isr1_status = isr1_val & ((~isr1_mask) & PCIE_ISR1_ALL_MASK); - if (!isr0_status && !isr1_status) { - advk_writel(pcie, isr0_val, PCIE_ISR0_REG); - advk_writel(pcie, isr1_val, PCIE_ISR1_REG); - return; - } - /* Process MSI interrupts */ if (isr0_status & PCIE_ISR0_MSI_INT_PENDING) advk_pcie_handle_msi(pcie); From 1fb95d7d3c7a926b002fe8a6bd27a1cb428b46dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:46 +0200 Subject: [PATCH 042/512] PCI: aardvark: Do not unmask unused interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are lot of undocumented interrupt bits. To prevent unwanted spurious interrupts, fix all *_ALL_MASK macros to define all interrupt bits, so that driver can properly mask all interrupts, including those which are undocumented. 
Link: https://lore.kernel.org/r/20211005180952.6812-8-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index f7eebf453e83..3448a8c446d4 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -107,13 +107,13 @@ #define PCIE_ISR0_MSI_INT_PENDING BIT(24) #define PCIE_ISR0_INTX_ASSERT(val) BIT(16 + (val)) #define PCIE_ISR0_INTX_DEASSERT(val) BIT(20 + (val)) -#define PCIE_ISR0_ALL_MASK GENMASK(26, 0) +#define PCIE_ISR0_ALL_MASK GENMASK(31, 0) #define PCIE_ISR1_REG (CONTROL_BASE_ADDR + 0x48) #define PCIE_ISR1_MASK_REG (CONTROL_BASE_ADDR + 0x4C) #define PCIE_ISR1_POWER_STATE_CHANGE BIT(4) #define PCIE_ISR1_FLUSH BIT(5) #define PCIE_ISR1_INTX_ASSERT(val) BIT(8 + (val)) -#define PCIE_ISR1_ALL_MASK GENMASK(11, 4) +#define PCIE_ISR1_ALL_MASK GENMASK(31, 0) #define PCIE_MSI_ADDR_LOW_REG (CONTROL_BASE_ADDR + 0x50) #define PCIE_MSI_ADDR_HIGH_REG (CONTROL_BASE_ADDR + 0x54) #define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58) @@ -199,7 +199,7 @@ #define PCIE_IRQ_MSI_INT2_DET BIT(21) #define PCIE_IRQ_RC_DBELL_DET BIT(22) #define PCIE_IRQ_EP_STATUS BIT(23) -#define PCIE_IRQ_ALL_MASK 0xfff0fb +#define PCIE_IRQ_ALL_MASK GENMASK(31, 0) #define PCIE_IRQ_ENABLE_INTS_MASK PCIE_IRQ_CORE_INT /* Transaction types */ From 67cb2a4c93499c2c22704998fd1fd2bc35194d8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 5 Oct 2021 20:09:47 +0200 Subject: [PATCH 043/512] PCI: aardvark: Deduplicate code in advk_pcie_rd_conf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid code repetition in advk_pcie_rd_conf() by handling errors with goto jump, as is customary in kernel. Link: https://lore.kernel.org/r/20211005180952.6812-9-kabel@kernel.org Fixes: 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-aardvark.c | 48 +++++++++++---------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 3448a8c446d4..cd5be284789e 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -920,18 +920,8 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, (le16_to_cpu(pcie->bridge.pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE); - if (advk_pcie_pio_is_running(pcie)) { - /* - * If it is possible return Completion Retry Status so caller - * tries to issue the request again instead of failing. - */ - if (allow_crs) { - *val = CFG_RD_CRS_VAL; - return PCIBIOS_SUCCESSFUL; - } - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (advk_pcie_pio_is_running(pcie)) + goto try_crs; /* Program the control register */ reg = advk_readl(pcie, PIO_CTRL); @@ -955,25 +945,13 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, advk_writel(pcie, 1, PIO_START); ret = advk_pcie_wait_pio(pcie); - if (ret < 0) { - /* - * If it is possible return Completion Retry Status so caller - * tries to issue the request again instead of failing. 
- */ - if (allow_crs) { - *val = CFG_RD_CRS_VAL; - return PCIBIOS_SUCCESSFUL; - } - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (ret < 0) + goto try_crs; /* Check PIO status and get the read result */ ret = advk_pcie_check_pio_status(pcie, allow_crs, val); - if (ret < 0) { - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (ret < 0) + goto fail; if (size == 1) *val = (*val >> (8 * (where & 3))) & 0xff; @@ -981,6 +959,20 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, *val = (*val >> (8 * (where & 3))) & 0xffff; return PCIBIOS_SUCCESSFUL; + +try_crs: + /* + * If it is possible, return Completion Retry Status so that caller + * tries to issue the request again instead of failing. + */ + if (allow_crs) { + *val = CFG_RD_CRS_VAL; + return PCIBIOS_SUCCESSFUL; + } + +fail: + *val = 0xffffffff; + return PCIBIOS_SET_FAILED; } static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, From 223dec14a05337a4155f1deed46d2becce4d00fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:48 +0200 Subject: [PATCH 044/512] PCI: aardvark: Implement re-issuing config requests on CRS response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 43f5c77bcbd2 ("PCI: aardvark: Fix reporting CRS value") fixed handling of CRS response and when CRSSVE flag was not enabled it marked CRS response as failed transaction (due to simplicity). But pci-aardvark.c driver is already waiting up to the PIO_RETRY_CNT count for PIO config response and so we can with a small change implement re-issuing of config requests as described in PCIe base specification. This change implements re-issuing of config requests when response is CRS. Set upper bound of wait cycles to around PIO_RETRY_CNT, afterwards the transaction is marked as failed and an all-ones value is returned as before. We do this by returning appropriate error codes from function advk_pcie_check_pio_status(). On CRS we return -EAGAIN and caller then reissues transaction. 
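Condensed, both the read and the write path now share the same bounded retry loop (excerpted and slightly shortened from the hunks below):

	retry_count = 0;
	do {
		/* Clear PIO DONE ISR and (re)start the transfer */
		advk_writel(pcie, 1, PIO_ISR);
		advk_writel(pcie, 1, PIO_START);

		ret = advk_pcie_wait_pio(pcie);
		if (ret < 0)
			break;	/* the full code bails out to the error/CRS path here */

		retry_count += ret;

		ret = advk_pcie_check_pio_status(pcie, allow_crs, val);
	} while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT);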
Link: https://lore.kernel.org/r/20211005180952.6812-10-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún --- drivers/pci/controller/pci-aardvark.c | 67 +++++++++++++++++---------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index cd5be284789e..efe8b5f7bf8c 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -603,6 +603,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 u32 reg; unsigned int status; char *strcomp_status, *str_posted; + int ret; reg = advk_readl(pcie, PIO_STAT); status = (reg & PIO_COMPLETION_STATUS_MASK) >> @@ -627,6 +628,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 case PIO_COMPLETION_STATUS_OK: if (reg & PIO_ERR_STATUS) { strcomp_status = "COMP_ERR"; + ret = -EFAULT; break; } /* Get the read result */ @@ -634,9 +636,11 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 *val = advk_readl(pcie, PIO_RD_DATA); /* No error */ strcomp_status = NULL; + ret = 0; break; case PIO_COMPLETION_STATUS_UR: strcomp_status = "UR"; + ret = -EOPNOTSUPP; break; case PIO_COMPLETION_STATUS_CRS: if (allow_crs && val) { @@ -654,6 +658,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 */ *val = CFG_RD_CRS_VAL; strcomp_status = NULL; + ret = 0; break; } /* PCIe r4.0, sec 2.3.2, says: @@ -669,21 +674,24 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 * Request and taking appropriate action, e.g., complete the * Request to the host as a failed transaction. * - * To simplify implementation do not re-issue the Configuration - * Request and complete the Request as a failed transaction. + * So return -EAGAIN and caller (pci-aardvark.c driver) will + * re-issue request again up to the PIO_RETRY_CNT retries. 
*/ strcomp_status = "CRS"; + ret = -EAGAIN; break; case PIO_COMPLETION_STATUS_CA: strcomp_status = "CA"; + ret = -ECANCELED; break; default: strcomp_status = "Unknown"; + ret = -EINVAL; break; } if (!strcomp_status) - return 0; + return ret; if (reg & PIO_NON_POSTED_REQ) str_posted = "Non-posted"; @@ -693,7 +701,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 dev_dbg(dev, "%s PIO Response Status: %s, %#x @ %#x\n", str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); - return -EFAULT; + return ret; } static int advk_pcie_wait_pio(struct advk_pcie *pcie) @@ -701,13 +709,13 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie) struct device *dev = &pcie->pdev->dev; int i; - for (i = 0; i < PIO_RETRY_CNT; i++) { + for (i = 1; i <= PIO_RETRY_CNT; i++) { u32 start, isr; start = advk_readl(pcie, PIO_START); isr = advk_readl(pcie, PIO_ISR); if (!start && isr) - return 0; + return i; udelay(PIO_RETRY_DELAY); } @@ -898,6 +906,7 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, int where, int size, u32 *val) { struct advk_pcie *pcie = bus->sysdata; + int retry_count; bool allow_crs; u32 reg; int ret; @@ -940,16 +949,22 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, /* Program the data strobe */ advk_writel(pcie, 0xf, PIO_WR_DATA_STRB); - /* Clear PIO DONE ISR and start the transfer */ - advk_writel(pcie, 1, PIO_ISR); - advk_writel(pcie, 1, PIO_START); + retry_count = 0; + do { + /* Clear PIO DONE ISR and start the transfer */ + advk_writel(pcie, 1, PIO_ISR); + advk_writel(pcie, 1, PIO_START); - ret = advk_pcie_wait_pio(pcie); - if (ret < 0) - goto try_crs; + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + goto try_crs; + + retry_count += ret; + + /* Check PIO status and get the read result */ + ret = advk_pcie_check_pio_status(pcie, allow_crs, val); + } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT); - /* Check PIO status and get the read result */ - ret = advk_pcie_check_pio_status(pcie, allow_crs, val); if (ret < 0) goto fail; @@ -981,6 +996,7 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, struct advk_pcie *pcie = bus->sysdata; u32 reg; u32 data_strobe = 0x0; + int retry_count; int offset; int ret; @@ -1022,19 +1038,22 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, /* Program the data strobe */ advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB); - /* Clear PIO DONE ISR and start the transfer */ - advk_writel(pcie, 1, PIO_ISR); - advk_writel(pcie, 1, PIO_START); + retry_count = 0; + do { + /* Clear PIO DONE ISR and start the transfer */ + advk_writel(pcie, 1, PIO_ISR); + advk_writel(pcie, 1, PIO_START); - ret = advk_pcie_wait_pio(pcie); - if (ret < 0) - return PCIBIOS_SET_FAILED; + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + return PCIBIOS_SET_FAILED; - ret = advk_pcie_check_pio_status(pcie, false, NULL); - if (ret < 0) - return PCIBIOS_SET_FAILED; + retry_count += ret; - return PCIBIOS_SUCCESSFUL; + ret = advk_pcie_check_pio_status(pcie, false, NULL); + } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT); + + return ret < 0 ? 
PCIBIOS_SET_FAILED : PCIBIOS_SUCCESSFUL; } static struct pci_ops advk_pcie_ops = { From 454c53271fc11f3aa5e44e41fd99ca181bd32c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:49 +0200 Subject: [PATCH 045/512] PCI: aardvark: Simplify initialization of rootcap on virtual bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe config space can be initialized also before pci_bridge_emul_init() call, so move rootcap initialization after PCI config space initialization. This simplifies the function a little since it removes one if (ret < 0) check. Link: https://lore.kernel.org/r/20211005180952.6812-11-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún --- drivers/pci/controller/pci-aardvark.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index efe8b5f7bf8c..9c509d5a9afa 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -822,7 +822,6 @@ static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) { struct pci_bridge_emul *bridge = &pcie->bridge; - int ret; bridge->conf.vendor = cpu_to_le16(advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff); @@ -842,19 +841,14 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) /* Support interrupt A for MSI feature */ bridge->conf.intpin = PCIE_CORE_INT_A_ASSERT_ENABLE; + /* Indicates supports for Completion Retry Status */ + bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); + bridge->has_pcie = true; bridge->data = pcie; bridge->ops = &advk_pci_bridge_emul_ops; - /* PCIe config space can be initialized after pci_bridge_emul_init() */ - ret = pci_bridge_emul_init(bridge, 0); - if (ret < 0) - return ret; - - /* Indicates supports for Completion Retry Status */ - bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); - - return 0; + return pci_bridge_emul_init(bridge, 0); } static bool advk_pcie_valid_device(struct advk_pcie *pcie, struct pci_bus *bus, From f76b36d40beee0a13aa8f6aa011df0d7cbbb8a7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:50 +0200 Subject: [PATCH 046/512] PCI: aardvark: Fix link training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix multiple link training issues in aardvark driver. The main reason of these issues was misunderstanding of what certain registers do, since their names and comments were misleading: before commit 96be36dbffac ("PCI: aardvark: Replace custom macros by standard linux/pci_regs.h macros"), the pci-aardvark.c driver used custom macros for accessing standard PCIe Root Bridge registers, and misleading comments did not help to understand what the code was really doing. After doing more tests and experiments I've come to the conclusion that the SPEED_GEN register in aardvark sets the PCIe revision / generation compliance and forces maximal link speed. Both GEN3 and GEN2 values set the read-only PCI_EXP_FLAGS_VERS bits (PCIe capabilities version of Root Bridge) to value 2, while GEN1 value sets PCI_EXP_FLAGS_VERS to 1, which matches with PCI Express specifications revisions 3, 2 and 1 respectively. Changing SPEED_GEN also sets the read-only bits PCI_EXP_LNKCAP_SLS and PCI_EXP_LNKCAP2_SLS to corresponding speed. 
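For illustration only (this helper is not part of the driver or of this patch), the read-only fields described above could be inspected through the existing aardvark accessors roughly like this:

  static void advk_pcie_dump_caps(struct advk_pcie *pcie)
  {
          /* dword 0 of the capability: header in bits 0-15, PCI_EXP_FLAGS in bits 16-31 */
          u32 hdr     = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP);
          u32 lnkcap  = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCAP);
          u32 lnkcap2 = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCAP2);

          /* SPEED_GEN = GEN1 yields capability version 1, GEN2/GEN3 yield version 2 */
          dev_dbg(&pcie->pdev->dev, "cap version %u, LNKCAP SLS %u, LNKCAP2 %#x\n",
                  (hdr >> 16) & PCI_EXP_FLAGS_VERS,
                  lnkcap & PCI_EXP_LNKCAP_SLS,
                  lnkcap2);
  }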
(Note that the PCI Express rev 1 specification does not define the PCI_EXP_LNKCAP2 and PCI_EXP_LNKCTL2 registers, and when SPEED_GEN is set to GEN1 (which also sets PCI_EXP_FLAGS_VERS to 1), lspci cannot access the PCI_EXP_LNKCAP2 and PCI_EXP_LNKCTL2 registers.)

Changing the PCIe link speed can be done via the PCI_EXP_LNKCTL2_TLS bits of the PCI_EXP_LNKCTL2 register. The Armada 3700 Functional Specifications say that the default value of PCI_EXP_LNKCTL2_TLS is based on the SPEED_GEN value, but tests showed that the default value is always 8.0 GT/s, independently of the speed set by SPEED_GEN. So after setting SPEED_GEN, we must also set the value in the PCI_EXP_LNKCTL2 register via the PCI_EXP_LNKCTL2_TLS bits.

Triggering the PCI_EXP_LNKCTL_RL bit immediately after setting the LINK_TRAINING_EN bit actually doesn't do anything. Tests have shown that a delay is needed after enabling the LINK_TRAINING_EN bit. As triggering PCI_EXP_LNKCTL_RL currently does nothing, remove it.

Commit 43fc679ced18 ("PCI: aardvark: Improve link training") introduced code which sets the SPEED_GEN register based on the negotiated link speed taken from the PCI_EXP_LNKSTA_CLS bits of the PCI_EXP_LNKSTA register. This code was added to fix detection of Compex WLE900VX (Atheros QCA9880) WiFi GEN1 PCIe cards, as otherwise these cards were "invisible" on the PCIe bus (probably because they crashed). But apparently more people reported the same issues with these cards on other PCIe controllers as well [1], and I was able to reproduce this issue with other "noname" WiFi cards based on the Atheros QCA9890 chip (with the same PCI vendor/device IDs as the Atheros QCA9880). So this is not an issue in aardvark but rather an issue in the Atheros QCA98xx chips. Also, this issue only exists if the kernel is compiled with PCIe ASPM support, and a generic workaround is to change the PCIe Bridge to 2.5 GT/s link speed via the PCI_EXP_LNKCTL2_TLS_2_5GT bits in the PCI_EXP_LNKCTL2 register [2] before triggering the PCI_EXP_LNKCTL_RL bit. This workaround also works when SPEED_GEN is set to the value GEN2 (5 GT/s). So remove this hack completely from the aardvark driver and always set SPEED_GEN to the value from the 'max-link-speed' DT property. The fix for the Atheros QCA98xx chips is handled separately by patch [2].

These two things (the code triggering the PCI_EXP_LNKCTL_RL bit and the changing of the SPEED_GEN value) also explain why commit 6964494582f5 ("PCI: aardvark: Train link immediately after enabling training") somehow fixed detection of those problematic Compex cards with Atheros chips: if triggering link retraining (via the PCI_EXP_LNKCTL_RL bit) was done immediately after enabling link training (via LINK_TRAINING_EN), it did nothing. If there was a specific delay, the aardvark HW had already initialized the PCIe link, and triggering link retraining then caused the above issue: the Compex cards triggered a link down event and disappeared from the PCIe bus.

Commit f4c7d053d7f7 ("PCI: aardvark: Wait for endpoint to be ready before training link") added a 100ms sleep before calling the 'Start link training' command and explained that it is a requirement of the PCI Express specification. But the code after this 100ms sleep was not doing 'Start link training'; rather, it triggered the PCI_EXP_LNKCTL_RL bit via the PCIe Root Bridge to put the link into the Recovery state. The required delay after fundamental reset is already handled in the function advk_pcie_wait_for_link(), which also checks whether the PCIe link is up. So after removing the code which triggers the PCI_EXP_LNKCTL_RL bit on the PCIe Root Bridge, there is no need to wait 100ms again.
Remove the extra msleep() call and update the comment about the delay required by the PCI Express specification.

According to the Marvell Armada 3700 Functional Specifications, link training should be enabled via the aardvark register LINK_TRAINING_EN after selecting the PCIe generation and x1 lane. There is no need to disable it prior to resetting the card via the PERST# signal. This disabling code was introduced in commit 5169a9851daa ("PCI: aardvark: Issue PERST via GPIO") as a workaround for some Atheros cards. It turns out that this, too, is an Atheros-specific issue which affects any PCIe controller, not only aardvark. Moreover, this Atheros issue was triggered by juggling with the PCI_EXP_LNKCTL_RL, LINK_TRAINING_EN and SPEED_GEN bits interleaved with sleeps. Now, after removing the triggering of PCI_EXP_LNKCTL_RL, there is no need to explicitly disable the LINK_TRAINING_EN bit, so remove this code too.

The problematic Compex cards described in the previous git commits are correctly detected in the advk_pcie_train_link() function even after applying all these changes.

Note that with this patch, and also prior to this patch, some NVMe disks which support PCIe GEN3 at 8 GT/s are negotiated only at the lowest link speed of 2.5 GT/s, independently of the SPEED_GEN value. After manually triggering the PCI_EXP_LNKCTL_RL bit (e.g. from userspace via setpci), these NVMe disks change the link speed to 5 GT/s when SPEED_GEN was configured to GEN2. This issue first needs to be properly investigated; I will send a fix in the future. On the other hand, some other GEN2 PCIe cards with 5 GT/s speed are autonomously negotiated by the HW at the full 5 GT/s speed without the need for any software interaction.

The Armada 3700 Functional Specifications describe the following steps for link training: set SPEED_GEN to GEN2, enable LINK_TRAINING_EN, poll until link training is complete, trigger PCI_EXP_LNKCTL_RL, poll until the signal rate is 5 GT/s, poll until link training is complete, enable ASPM L0s. The requirement for triggering PCI_EXP_LNKCTL_RL can be explained by the need to achieve 5 GT/s speed (changing the link speed is done via a transition to the Recovery state, which triggering PCI_EXP_LNKCTL_RL enters), or maybe as a part of enabling ASPM L0s (but in that case ASPM L0s should have been enabled prior to PCI_EXP_LNKCTL_RL).

It is unknown why the original pci-aardvark.c driver was triggering the PCI_EXP_LNKCTL_RL bit before waiting for the link to be up. This aligns with neither the PCIe base specifications nor the Armada 3700 Functional Specification. (Note that in older versions of aardvark this bit was incorrectly called PCIE_CORE_LINK_TRAINING, so this may be the reason.) It is also unknown why the Armada 3700 Functional Specification says that triggering PCI_EXP_LNKCTL_RL is needed for GEN2 mode, as according to the PCIe base specification 5 GT/s speed negotiation is supposed to be entirely autonomous, even if the initial speed is 2.5 GT/s.
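Purely for illustration, the sequence quoted above from the Functional Specifications maps onto the driver's existing register helpers roughly as follows. This is a sketch, not what the patch implements (the patch deliberately stops after enabling LINK_TRAINING_EN and never triggers PCI_EXP_LNKCTL_RL):

  /* Hypothetical sketch of the documented GEN2 bring-up, not driver code */
  static void advk_pcie_train_link_as_documented(struct advk_pcie *pcie)
  {
          u32 reg;

          reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
          reg &= ~PCIE_GEN_SEL_MSK;
          reg |= SPEED_GEN_2;                     /* 1. select GEN2 */
          reg |= LINK_TRAINING_EN;                /* 2. enable link training */
          advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);

          advk_pcie_wait_for_link(pcie);          /* 3. poll until the link is up */

          /* 4. retrain so the link may move from 2.5 GT/s to 5 GT/s */
          reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL);
          reg |= PCI_EXP_LNKCTL_RL;
          advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL);
  }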
[1] - https://lore.kernel.org/linux-pci/87h7l8axqp.fsf@toke.dk/ [2] - https://lore.kernel.org/linux-pci/20210326124326.21163-1-pali@kernel.org/ Link: https://lore.kernel.org/r/20211005180952.6812-12-kabel@kernel.org Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún --- drivers/pci/controller/pci-aardvark.c | 117 ++++++++------------------ 1 file changed, 34 insertions(+), 83 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 9c509d5a9afa..2c66cdbb8dd6 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -258,11 +258,6 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } -static inline u16 advk_read16(struct advk_pcie *pcie, u64 reg) -{ - return advk_readl(pcie, (reg & ~0x3)) >> ((reg & 0x3) * 8); -} - static int advk_pcie_link_up(struct advk_pcie *pcie) { u32 val, ltssm_state; @@ -300,23 +295,9 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) static void advk_pcie_issue_perst(struct advk_pcie *pcie) { - u32 reg; - if (!pcie->reset_gpio) return; - /* - * As required by PCI Express spec (PCI Express Base Specification, REV. - * 4.0 PCI Express, February 19 2014, 6.6.1 Conventional Reset) a delay - * for at least 100ms after de-asserting PERST# signal is needed before - * link training is enabled. So ensure that link training is disabled - * prior de-asserting PERST# signal to fulfill that PCI Express spec - * requirement. - */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg &= ~LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* 10ms delay is needed for some cards */ dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); gpiod_set_value_cansleep(pcie->reset_gpio, 1); @@ -324,53 +305,46 @@ static void advk_pcie_issue_perst(struct advk_pcie *pcie) gpiod_set_value_cansleep(pcie->reset_gpio, 0); } -static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) +static void advk_pcie_train_link(struct advk_pcie *pcie) { - int ret, neg_gen; + struct device *dev = &pcie->pdev->dev; u32 reg; + int ret; - /* Setup link speed */ + /* + * Setup PCIe rev / gen compliance based on device tree property + * 'max-link-speed' which also forces maximal link speed. + */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg &= ~PCIE_GEN_SEL_MSK; - if (gen == 3) + if (pcie->link_gen == 3) reg |= SPEED_GEN_3; - else if (gen == 2) + else if (pcie->link_gen == 2) reg |= SPEED_GEN_2; else reg |= SPEED_GEN_1; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); /* - * Enable link training. This is not needed in every call to this - * function, just once suffices, but it does not break anything either. + * Set maximal link speed value also into PCIe Link Control 2 register. + * Armada 3700 Functional Specification says that default value is based + * on SPEED_GEN but tests showed that default value is always 8.0 GT/s. 
*/ + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2); + reg &= ~PCI_EXP_LNKCTL2_TLS; + if (pcie->link_gen == 3) + reg |= PCI_EXP_LNKCTL2_TLS_8_0GT; + else if (pcie->link_gen == 2) + reg |= PCI_EXP_LNKCTL2_TLS_5_0GT; + else + reg |= PCI_EXP_LNKCTL2_TLS_2_5GT; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2); + + /* Enable link training after selecting PCIe generation */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg |= LINK_TRAINING_EN; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* - * Start link training immediately after enabling it. - * This solves problems for some buggy cards. - */ - reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); - reg |= PCI_EXP_LNKCTL_RL; - advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); - - ret = advk_pcie_wait_for_link(pcie); - if (ret) - return ret; - - reg = advk_read16(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKSTA); - neg_gen = reg & PCI_EXP_LNKSTA_CLS; - - return neg_gen; -} - -static void advk_pcie_train_link(struct advk_pcie *pcie) -{ - struct device *dev = &pcie->pdev->dev; - int neg_gen = -1, gen; - /* * Reset PCIe card via PERST# signal. Some cards are not detected * during link training when they are in some non-initial state. @@ -381,41 +355,18 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) * PERST# signal could have been asserted by pinctrl subsystem before * probe() callback has been called or issued explicitly by reset gpio * function advk_pcie_issue_perst(), making the endpoint going into - * fundamental reset. As required by PCI Express spec a delay for at - * least 100ms after such a reset before link training is needed. + * fundamental reset. As required by PCI Express spec (PCI Express + * Base Specification, REV. 4.0 PCI Express, February 19 2014, 6.6.1 + * Conventional Reset) a delay for at least 100ms after such a reset + * before sending a Configuration Request to the device is needed. + * So wait until PCIe link is up. Function advk_pcie_wait_for_link() + * waits for link at least 900ms. */ - msleep(PCI_PM_D3COLD_WAIT); - - /* - * Try link training at link gen specified by device tree property - * 'max-link-speed'. If this fails, iteratively train at lower gen. - */ - for (gen = pcie->link_gen; gen > 0; --gen) { - neg_gen = advk_pcie_train_at_gen(pcie, gen); - if (neg_gen > 0) - break; - } - - if (neg_gen < 0) - goto err; - - /* - * After successful training if negotiated gen is lower than requested, - * train again on negotiated gen. This solves some stability issues for - * some buggy gen1 cards. - */ - if (neg_gen < gen) { - gen = neg_gen; - neg_gen = advk_pcie_train_at_gen(pcie, gen); - } - - if (neg_gen == gen) { - dev_info(dev, "link up at gen %i\n", gen); - return; - } - -err: - dev_err(dev, "link never came up\n"); + ret = advk_pcie_wait_for_link(pcie); + if (ret < 0) + dev_err(dev, "link never came up\n"); + else + dev_info(dev, "link up\n"); } /* From 661c399a651c11aaf83c45cbfe0b4a1fb7bc3179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:51 +0200 Subject: [PATCH 047/512] PCI: aardvark: Fix checking for link up via LTSSM state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current implementation of advk_pcie_link_up() is wrong as it marks also link disabled or hot reset states as link up. Fix it by marking link up only to those states which are defined in PCIe Base specification 3.0, Table 4-14: Link Status Mapped to the LTSSM. 
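Condensed to its essence (the real change is in the hunk below, together with the new LTSSM state macros it relies on), the corrected check is:

  static bool advk_pcie_link_up(struct advk_pcie *pcie)
  {
          u8 ltssm_state = advk_pcie_ltssm_state(pcie);

          /* only the L* states count as link up; Disabled and Hot Reset do not */
          return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED;
  }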
To simplify implementation, Define macros for every LTSSM state which aardvark hardware can return in CFG_REG register. Fix also checking for link training according to the same Table 4-14. Define a new function advk_pcie_link_training() for this purpose. Link: https://lore.kernel.org/r/20211005180952.6812-13-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org Cc: Remi Pommarel --- drivers/pci/controller/pci-aardvark.c | 76 ++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 2c66cdbb8dd6..f831f7d197bd 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -165,8 +165,50 @@ #define CFG_REG (LMI_BASE_ADDR + 0x0) #define LTSSM_SHIFT 24 #define LTSSM_MASK 0x3f -#define LTSSM_L0 0x10 #define RC_BAR_CONFIG 0x300 + +/* LTSSM values in CFG_REG */ +enum { + LTSSM_DETECT_QUIET = 0x0, + LTSSM_DETECT_ACTIVE = 0x1, + LTSSM_POLLING_ACTIVE = 0x2, + LTSSM_POLLING_COMPLIANCE = 0x3, + LTSSM_POLLING_CONFIGURATION = 0x4, + LTSSM_CONFIG_LINKWIDTH_START = 0x5, + LTSSM_CONFIG_LINKWIDTH_ACCEPT = 0x6, + LTSSM_CONFIG_LANENUM_ACCEPT = 0x7, + LTSSM_CONFIG_LANENUM_WAIT = 0x8, + LTSSM_CONFIG_COMPLETE = 0x9, + LTSSM_CONFIG_IDLE = 0xa, + LTSSM_RECOVERY_RCVR_LOCK = 0xb, + LTSSM_RECOVERY_SPEED = 0xc, + LTSSM_RECOVERY_RCVR_CFG = 0xd, + LTSSM_RECOVERY_IDLE = 0xe, + LTSSM_L0 = 0x10, + LTSSM_RX_L0S_ENTRY = 0x11, + LTSSM_RX_L0S_IDLE = 0x12, + LTSSM_RX_L0S_FTS = 0x13, + LTSSM_TX_L0S_ENTRY = 0x14, + LTSSM_TX_L0S_IDLE = 0x15, + LTSSM_TX_L0S_FTS = 0x16, + LTSSM_L1_ENTRY = 0x17, + LTSSM_L1_IDLE = 0x18, + LTSSM_L2_IDLE = 0x19, + LTSSM_L2_TRANSMIT_WAKE = 0x1a, + LTSSM_DISABLED = 0x20, + LTSSM_LOOPBACK_ENTRY_MASTER = 0x21, + LTSSM_LOOPBACK_ACTIVE_MASTER = 0x22, + LTSSM_LOOPBACK_EXIT_MASTER = 0x23, + LTSSM_LOOPBACK_ENTRY_SLAVE = 0x24, + LTSSM_LOOPBACK_ACTIVE_SLAVE = 0x25, + LTSSM_LOOPBACK_EXIT_SLAVE = 0x26, + LTSSM_HOT_RESET = 0x27, + LTSSM_RECOVERY_EQUALIZATION_PHASE0 = 0x28, + LTSSM_RECOVERY_EQUALIZATION_PHASE1 = 0x29, + LTSSM_RECOVERY_EQUALIZATION_PHASE2 = 0x2a, + LTSSM_RECOVERY_EQUALIZATION_PHASE3 = 0x2b, +}; + #define VENDOR_ID_REG (LMI_BASE_ADDR + 0x44) /* PCIe core controller registers */ @@ -258,13 +300,35 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } -static int advk_pcie_link_up(struct advk_pcie *pcie) +static u8 advk_pcie_ltssm_state(struct advk_pcie *pcie) { - u32 val, ltssm_state; + u32 val; + u8 ltssm_state; val = advk_readl(pcie, CFG_REG); ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK; - return ltssm_state >= LTSSM_L0; + return ltssm_state; +} + +static inline bool advk_pcie_link_up(struct advk_pcie *pcie) +{ + /* check if LTSSM is in normal operation - some L* state */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED; +} + +static inline bool advk_pcie_link_training(struct advk_pcie *pcie) +{ + /* + * According to PCIe Base specification 3.0, Table 4-14: Link + * Status Mapped to the LTSSM is Link Training mapped to LTSSM + * Configuration and Recovery states. 
+ */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ((ltssm_state >= LTSSM_CONFIG_LINKWIDTH_START && + ltssm_state < LTSSM_L0) || + (ltssm_state >= LTSSM_RECOVERY_EQUALIZATION_PHASE0 && + ltssm_state <= LTSSM_RECOVERY_EQUALIZATION_PHASE3)); } static int advk_pcie_wait_for_link(struct advk_pcie *pcie) @@ -287,7 +351,7 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) size_t retries; for (retries = 0; retries < RETRAIN_WAIT_MAX_RETRIES; ++retries) { - if (!advk_pcie_link_up(pcie)) + if (advk_pcie_link_training(pcie)) break; udelay(RETRAIN_WAIT_USLEEP_US); } @@ -706,7 +770,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, /* u32 contains both PCI_EXP_LNKCTL and PCI_EXP_LNKSTA */ u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg) & ~(PCI_EXP_LNKSTA_LT << 16); - if (!advk_pcie_link_up(pcie)) + if (advk_pcie_link_training(pcie)) val |= (PCI_EXP_LNKSTA_LT << 16); *value = val; return PCI_BRIDGE_EMUL_HANDLED; From 2b650b7ff20eb7ea8ef9031d20fb657286ab90cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 5 Oct 2021 20:09:52 +0200 Subject: [PATCH 048/512] PCI: aardvark: Fix reporting Data Link Layer Link Active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for reporting PCI_EXP_LNKSTA_DLLLA bit in Link Control register on emulated bridge via current LTSSM state. Also correctly indicate DLLLA capability via PCI_EXP_LNKCAP_DLLLARC bit in Link Control Capability register. Link: https://lore.kernel.org/r/20211005180952.6812-14-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marek Behún Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 29 ++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index f831f7d197bd..10476c00b312 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -317,6 +317,20 @@ static inline bool advk_pcie_link_up(struct advk_pcie *pcie) return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED; } +static inline bool advk_pcie_link_active(struct advk_pcie *pcie) +{ + /* + * According to PCIe Base specification 3.0, Table 4-14: Link + * Status Mapped to the LTSSM, and 4.2.6.3.6 Configuration.Idle + * is Link Up mapped to LTSSM Configuration.Idle, Recovery, L0, + * L0s, L1 and L2 states. And according to 3.2.1. Data Link + * Control and Management State Machine Rules is DL Up status + * reported in DL Active state. + */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ltssm_state >= LTSSM_CONFIG_IDLE && ltssm_state < LTSSM_DISABLED; +} + static inline bool advk_pcie_link_training(struct advk_pcie *pcie) { /* @@ -766,12 +780,26 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, return PCI_BRIDGE_EMUL_HANDLED; } + case PCI_EXP_LNKCAP: { + u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg); + /* + * PCI_EXP_LNKCAP_DLLLARC bit is hardwired in aardvark HW to 0. + * But support for PCI_EXP_LNKSTA_DLLLA is emulated via ltssm + * state so explicitly enable PCI_EXP_LNKCAP_DLLLARC flag. 
+ */ + val |= PCI_EXP_LNKCAP_DLLLARC; + *value = val; + return PCI_BRIDGE_EMUL_HANDLED; + } + case PCI_EXP_LNKCTL: { /* u32 contains both PCI_EXP_LNKCTL and PCI_EXP_LNKSTA */ u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg) & ~(PCI_EXP_LNKSTA_LT << 16); if (advk_pcie_link_training(pcie)) val |= (PCI_EXP_LNKSTA_LT << 16); + if (advk_pcie_link_active(pcie)) + val |= (PCI_EXP_LNKSTA_DLLLA << 16); *value = val; return PCI_BRIDGE_EMUL_HANDLED; } @@ -779,7 +807,6 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_CAP_LIST_ID: case PCI_EXP_DEVCAP: case PCI_EXP_DEVCTL: - case PCI_EXP_LNKCAP: *value = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg); return PCI_BRIDGE_EMUL_HANDLED; default: From 2908a0d81f5b24081e95219b8bdc5b93a310f537 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Jun 2021 17:01:02 +0300 Subject: [PATCH 049/512] PCI: dwc: Clean up Kconfig dependencies (PCIE_DW_HOST) The "depends on" Kconfig construct is a no-op in options that are selected and therefore has no effect. Remove it. Furthermore, there is no need to repeat menu dependencies (PCI). Clean up the users of PCIE_DW_HOST and introduce idiom depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST for all of them. Link: https://lore.kernel.org/r/20210623140103.47818-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/Kconfig | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 9892a1135678..8c2028e2eca6 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -8,7 +8,6 @@ config PCIE_DW config PCIE_DW_HOST bool - depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW config PCIE_DW_EP @@ -22,8 +21,8 @@ config PCI_DRA7XX config PCI_DRA7XX_HOST tristate "TI DRA7xx PCIe controller Host Mode" depends on SOC_DRA7XX || COMPILE_TEST - depends on PCI_MSI_IRQ_DOMAIN depends on OF && HAS_IOMEM && TI_PIPE3 + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST select PCI_DRA7XX default y if SOC_DRA7XX @@ -55,7 +54,7 @@ config PCIE_DW_PLAT config PCIE_DW_PLAT_HOST bool "Platform bus based DesignWare PCIe Controller - Host mode" - depends on PCI && PCI_MSI_IRQ_DOMAIN + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST select PCIE_DW_PLAT help @@ -138,8 +137,8 @@ config PCI_LAYERSCAPE bool "Freescale Layerscape PCIe controller - Host mode" depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) depends on PCI_MSI_IRQ_DOMAIN - select MFD_SYSCON select PCIE_DW_HOST + select MFD_SYSCON help Say Y here if you want to enable PCIe controller support on Layerscape SoCs to work in Host mode. @@ -283,8 +282,8 @@ config PCIE_HISI_STB config PCI_MESON tristate "MESON PCIe controller" - depends on PCI_MSI_IRQ_DOMAIN default m if ARCH_MESON + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST help Say Y here if you want to enable PCI controller support on Amlogic From 8faa1d2defb795806cc46c21d16ad01bb37d23c4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 23 Jun 2021 17:01:03 +0300 Subject: [PATCH 050/512] PCI: dwc: Clean up Kconfig dependencies (PCIE_DW_EP) The "depends on" Kconfig construct is a no-op in options that are selected and therefore has no effect. Remove it. Clean up the users of PCIE_DW_EP and introduce idiom depends on PCI_ENDPOINT select PCIE_DW_EP for all of them. 
Link: https://lore.kernel.org/r/20210623140103.47818-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 8c2028e2eca6..6c4ee84053b5 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -12,7 +12,6 @@ config PCIE_DW_HOST config PCIE_DW_EP bool - depends on PCI_ENDPOINT select PCIE_DW config PCI_DRA7XX @@ -37,8 +36,8 @@ config PCI_DRA7XX_HOST config PCI_DRA7XX_EP tristate "TI DRA7xx PCIe controller Endpoint Mode" depends on SOC_DRA7XX || COMPILE_TEST - depends on PCI_ENDPOINT depends on OF && HAS_IOMEM && TI_PIPE3 + depends on PCI_ENDPOINT select PCIE_DW_EP select PCI_DRA7XX help From 5b8402562e553983fc941bb955e2c14d1a69215f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 7 Oct 2021 12:28:48 +0000 Subject: [PATCH 051/512] PCI: visconti: Remove surplus dev_err() when using platform_get_irq_byname() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to call the dev_err() function directly to print a custom message when handling an error from either the platform_get_irq() or platform_get_irq_byname() functions as both are going to display an appropriate error message in case of a failure. This change is as per suggestions from Coccinelle, e.g., drivers/pci/controller/dwc/pcie-visconti.c:286:2-9: line 286 is redundant because platform_get_irq() already prints an error Related: https://lore.kernel.org/all/20210310131913.2802385-1-kw@linux.com/ https://lore.kernel.org/all/20200802142601.1635926-1-kw@linux.com/ Link: https://lore.kernel.org/r/20211007122848.3366-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-visconti.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-visconti.c b/drivers/pci/controller/dwc/pcie-visconti.c index a88eab6829bb..50f80f07e4db 100644 --- a/drivers/pci/controller/dwc/pcie-visconti.c +++ b/drivers/pci/controller/dwc/pcie-visconti.c @@ -279,13 +279,10 @@ static int visconti_add_pcie_port(struct visconti_pcie *pcie, { struct dw_pcie *pci = &pcie->pci; struct pcie_port *pp = &pci->pp; - struct device *dev = &pdev->dev; pp->irq = platform_get_irq_byname(pdev, "intr"); - if (pp->irq < 0) { - dev_err(dev, "Interrupt intr is missing"); + if (pp->irq < 0) return pp->irq; - } pp->ops = &visconti_pcie_host_ops; From 42da7911b83a462373c2d093a587d052f02211b0 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 17 Sep 2021 21:13:24 +0800 Subject: [PATCH 052/512] PCI: vmd: Assign a number to each VMD controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the system has multiple VMD controllers, the driver does not assign a number to each controller, so when analyzing the interrupt through /proc/interrupts, the names of all controllers are the same, which is not very convenient for problem analysis. Here, try to assign a number to each VMD controller. 
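Condensed (error handling and cleanup ordering trimmed; see the full hunk below for the real code), the naming scheme boils down to allocating a per-controller instance number and deriving the IRQ name from it:

  static DEFINE_IDA(vmd_instance_ida);

  /* probe */
  vmd->instance = ida_simple_get(&vmd_instance_ida, 0, 0, GFP_KERNEL);
  vmd->name = kasprintf(GFP_KERNEL, "vmd%d", vmd->instance);
  err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), vmd_irq,
                         IRQF_NO_THREAD, vmd->name, &vmd->irqs[i]);

  /* remove (and probe error paths) */
  ida_simple_remove(&vmd_instance_ida, vmd->instance);
  kfree(vmd->name);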
Link: https://lore.kernel.org/r/1631884404-24141-1-git-send-email-brookxu.cn@gmail.com Signed-off-by: Chunguang Xu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Jon Derrick Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/vmd.c | 41 +++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a5987e52700e..1b3e94a96414 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -70,6 +70,8 @@ enum vmd_features { VMD_FEAT_CAN_BYPASS_MSI_REMAP = (1 << 4), }; +static DEFINE_IDA(vmd_instance_ida); + /* * Lock for manipulating VMD IRQ lists. */ @@ -120,6 +122,8 @@ struct vmd_dev { struct pci_bus *bus; u8 busn_start; u8 first_vec; + char *name; + int instance; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -650,7 +654,7 @@ static int vmd_alloc_irqs(struct vmd_dev *vmd) INIT_LIST_HEAD(&vmd->irqs[i].irq_list); err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), vmd_irq, IRQF_NO_THREAD, - "vmd", &vmd->irqs[i]); + vmd->name, &vmd->irqs[i]); if (err) return err; } @@ -834,18 +838,32 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) return -ENOMEM; vmd->dev = dev; + vmd->instance = ida_simple_get(&vmd_instance_ida, 0, 0, GFP_KERNEL); + if (vmd->instance < 0) + return vmd->instance; + + vmd->name = kasprintf(GFP_KERNEL, "vmd%d", vmd->instance); + if (!vmd->name) { + err = -ENOMEM; + goto out_release_instance; + } + err = pcim_enable_device(dev); if (err < 0) - return err; + goto out_release_instance; vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0); - if (!vmd->cfgbar) - return -ENOMEM; + if (!vmd->cfgbar) { + err = -ENOMEM; + goto out_release_instance; + } pci_set_master(dev); if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) - return -ENODEV; + dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) { + err = -ENODEV; + goto out_release_instance; + } if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) vmd->first_vec = 1; @@ -854,11 +872,16 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, features); if (err) - return err; + goto out_release_instance; dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n", vmd->sysdata.domain); return 0; + + out_release_instance: + ida_simple_remove(&vmd_instance_ida, vmd->instance); + kfree(vmd->name); + return err; } static void vmd_cleanup_srcu(struct vmd_dev *vmd) @@ -879,6 +902,8 @@ static void vmd_remove(struct pci_dev *dev) vmd_cleanup_srcu(vmd); vmd_detach_resources(vmd); vmd_remove_irq_domain(vmd); + ida_simple_remove(&vmd_instance_ida, vmd->instance); + kfree(vmd->name); } #ifdef CONFIG_PM_SLEEP @@ -903,7 +928,7 @@ static int vmd_resume(struct device *dev) for (i = 0; i < vmd->msix_count; i++) { err = devm_request_irq(dev, pci_irq_vector(pdev, i), vmd_irq, IRQF_NO_THREAD, - "vmd", &vmd->irqs[i]); + vmd->name, &vmd->irqs[i]); if (err) return err; } From c65bd90dc93e1b2f8e776f619ffbb0335a7a16ec Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 1 Oct 2021 14:16:09 +0200 Subject: [PATCH 053/512] PCI: rcar-ep: Remove unneeded includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove includes that are not needed, to speed up (re)compilation. Include , which is needed, and was included implicitly through before. 
Most of these are relics from splitting the driver in a host and a common part and adding endpoint support. [bhelgaas: use driver tag consistent with cadence-ep, designware-ep] Link: https://lore.kernel.org/r/7c708841a2bf84f85b14a963271c3e99c8ba38a5.1633090444.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Söderlund --- drivers/pci/controller/pcie-rcar-ep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c index aa1cf24a5a72..f9682df1da61 100644 --- a/drivers/pci/controller/pcie-rcar-ep.c +++ b/drivers/pci/controller/pcie-rcar-ep.c @@ -6,16 +6,13 @@ * Author: Lad Prabhakar */ -#include #include #include -#include -#include #include #include #include -#include #include +#include #include "pcie-rcar.h" From 861e133ba268bef4765ea62c9602d4116b9c63ec Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 1 Oct 2021 14:16:43 +0200 Subject: [PATCH 054/512] PCI: rcar-host: Remove unneeded includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove includes that are not needed, to speed up (re)compilation. Most of these are relics from splitting the driver in a host and a common part. [bhelgaas: use driver tag analogous to rcar-ep] Link: https://lore.kernel.org/r/54bed9a0e6991490ddb2b07e5abfaf40a7a62928.1633090577.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Söderlund --- drivers/pci/controller/pcie-rcar-host.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/controller/pcie-rcar-host.c b/drivers/pci/controller/pcie-rcar-host.c index 8f3131844e77..e12c2d8be05a 100644 --- a/drivers/pci/controller/pcie-rcar-host.c +++ b/drivers/pci/controller/pcie-rcar-host.c @@ -24,13 +24,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include "pcie-rcar.h" From 31c9ef00258075df2f1fe5ef812cd7e9b6a1dc55 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 20 Sep 2021 12:29:44 +0530 Subject: [PATCH 055/512] dt-bindings: PCI: Add Qualcomm PCIe Endpoint controller Add devicetree binding for Qualcomm PCIe Endpoint controller used in platforms like SDX55. The Endpoint controller is based on the DesignWare core with Qualcomm-specific wrappers. 
Link: https://lore.kernel.org/r/20210920065946.15090-2-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/qcom,pcie-ep.yaml | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml new file mode 100644 index 000000000000..3d23599e5e91 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/qcom,pcie-ep.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm PCIe Endpoint Controller binding + +maintainers: + - Manivannan Sadhasivam + +allOf: + - $ref: "pci-ep.yaml#" + +properties: + compatible: + const: qcom,sdx55-pcie-ep + + reg: + items: + - description: Qualcomm-specific PARF configuration registers + - description: DesignWare PCIe registers + - description: External local bus interface registers + - description: Address Translation Unit (ATU) registers + - description: Memory region used to map remote RC address space + - description: BAR memory region + + reg-names: + items: + - const: parf + - const: dbi + - const: elbi + - const: atu + - const: addr_space + - const: mmio + + clocks: + items: + - description: PCIe Auxiliary clock + - description: PCIe CFG AHB clock + - description: PCIe Master AXI clock + - description: PCIe Slave AXI clock + - description: PCIe Slave Q2A AXI clock + - description: PCIe Sleep clock + - description: PCIe Reference clock + + clock-names: + items: + - const: aux + - const: cfg + - const: bus_master + - const: bus_slave + - const: slave_q2a + - const: sleep + - const: ref + + qcom,perst-regs: + description: Reference to a syscon representing TCSR followed by the two + offsets within syscon for Perst enable and Perst separation + enable registers + $ref: "/schemas/types.yaml#/definitions/phandle-array" + items: + minItems: 3 + maxItems: 3 + + interrupts: + items: + - description: PCIe Global interrupt + - description: PCIe Doorbell interrupt + + interrupt-names: + items: + - const: global + - const: doorbell + + reset-gpios: + description: GPIO used as PERST# input signal + maxItems: 1 + + wake-gpios: + description: GPIO used as WAKE# output signal + maxItems: 1 + + resets: + maxItems: 1 + + reset-names: + const: core + + power-domains: + maxItems: 1 + + phys: + maxItems: 1 + + phy-names: + const: pciephy + + num-lanes: + default: 2 + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - qcom,perst-regs + - interrupts + - interrupt-names + - reset-gpios + - resets + - reset-names + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + #include + pcie_ep: pcie-ep@40000000 { + compatible = "qcom,sdx55-pcie-ep"; + reg = <0x01c00000 0x3000>, + <0x40000000 0xf1d>, + <0x40000f20 0xc8>, + <0x40001000 0x1000>, + <0x40002000 0x1000>, + <0x01c03000 0x3000>; + reg-names = "parf", "dbi", "elbi", "atu", "addr_space", + "mmio"; + + clocks = <&gcc GCC_PCIE_AUX_CLK>, + <&gcc GCC_PCIE_CFG_AHB_CLK>, + <&gcc GCC_PCIE_MSTR_AXI_CLK>, + <&gcc GCC_PCIE_SLV_AXI_CLK>, + <&gcc GCC_PCIE_SLV_Q2A_AXI_CLK>, + <&gcc GCC_PCIE_SLEEP_CLK>, + <&gcc GCC_PCIE_0_CLKREF_CLK>; + clock-names = "aux", "cfg", "bus_master", 
"bus_slave", + "slave_q2a", "sleep", "ref"; + + qcom,perst-regs = <&tcsr 0xb258 0xb270>; + + interrupts = , + ; + interrupt-names = "global", "doorbell"; + reset-gpios = <&tlmm 57 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 53 GPIO_ACTIVE_LOW>; + resets = <&gcc GCC_PCIE_BCR>; + reset-names = "core"; + power-domains = <&gcc PCIE_GDSC>; + phys = <&pcie0_lane>; + phy-names = "pciephy"; + max-link-speed = <3>; + num-lanes = <2>; + }; From b2105b9f39b57ac80fb909b7ae4da3d343af9f7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 6 Oct 2021 23:38:27 +0000 Subject: [PATCH 056/512] PCI: Correct misspelled and remove duplicated words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct a number of misspelled words and remove any words that were duplicated in the PCI tree. No change to functionality intended. Link: https://lore.kernel.org/r/20211006233827.147328-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-xgene-msi.c | 2 +- drivers/pci/controller/pcie-brcmstb.c | 2 +- drivers/pci/controller/pcie-iproc.c | 2 +- drivers/pci/endpoint/pci-epc-core.c | 2 +- drivers/pci/endpoint/pci-epf-core.c | 4 ++-- drivers/pci/hotplug/acpiphp_glue.c | 2 +- drivers/pci/hotplug/cpqphp_ctrl.c | 4 ++-- drivers/pci/hotplug/ibmphp.h | 4 ++-- drivers/pci/hotplug/shpchp_hpc.c | 2 +- drivers/pci/pci-driver.c | 2 +- drivers/pci/pcie/aer.c | 2 +- drivers/pci/quirks.c | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/pci/controller/pci-xgene-msi.c b/drivers/pci/controller/pci-xgene-msi.c index b7a8e062fcc5..c50ff279903c 100644 --- a/drivers/pci/controller/pci-xgene-msi.c +++ b/drivers/pci/controller/pci-xgene-msi.c @@ -302,7 +302,7 @@ static void xgene_msi_isr(struct irq_desc *desc) /* * MSIINTn (n is 0..F) indicates if there is a pending MSI interrupt - * If bit x of this register is set (x is 0..7), one or more interupts + * If bit x of this register is set (x is 0..7), one or more interrupts * corresponding to MSInIRx is set. */ grp_select = xgene_msi_int_read(xgene_msi, msi_grp); diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index cc30215f5a43..1fc7bd49a7ad 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -145,7 +145,7 @@ #define BRCM_INT_PCI_MSI_LEGACY_NR 8 #define BRCM_INT_PCI_MSI_SHIFT 0 -/* MSI target adresses */ +/* MSI target addresses */ #define BRCM_MSI_TARGET_ADDR_LT_4GB 0x0fffffffcULL #define BRCM_MSI_TARGET_ADDR_GT_4GB 0xffffffffcULL diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c index 30ac5fbefbbf..36b9d2c46cfa 100644 --- a/drivers/pci/controller/pcie-iproc.c +++ b/drivers/pci/controller/pcie-iproc.c @@ -249,7 +249,7 @@ enum iproc_pcie_reg { /* * To hold the address of the register where the MSI writes are - * programed. When ARM GICv3 ITS is used, this should be programmed + * programmed. When ARM GICv3 ITS is used, this should be programmed * with the address of the GITS_TRANSLATER register. */ IPROC_PCIE_MSI_ADDR_LO, diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index ecbb0fb3b653..38621558d397 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(pci_epc_linkup); /** * pci_epc_init_notify() - Notify the EPF device that EPC device's core * initialization is completed. 
- * @epc: the EPC device whose core initialization is completeds + * @epc: the EPC device whose core initialization is completed * * Invoke to Notify the EPF device that the EPC device's initialization * is completed. diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index 8aea16380870..9ed556936f48 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(pci_epf_add_vepf); * be removed * @epf_vf: the virtual EP function to be removed * - * Invoke to remove a virtual endpoint function from the physcial endpoint + * Invoke to remove a virtual endpoint function from the physical endpoint * function. */ void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf) @@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(pci_epf_destroy); /** * pci_epf_create() - create a new PCI EPF device * @name: the name of the PCI EPF device. This name will be used to bind the - * the EPF device to a EPF driver + * EPF device to a EPF driver * * Invoke to create a new PCI EPF device by providing the name of the function * device. diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index f031302ad401..12f4b351be67 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -22,7 +22,7 @@ * when the bridge is scanned and it loses a refcount when the bridge * is removed. * - When a P2P bridge is present, we elevate the refcount on the subordinate - * bus. It loses the refcount when the the driver unloads. + * bus. It loses the refcount when the driver unloads. */ #define pr_fmt(fmt) "acpiphp_glue: " fmt diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index 1b26ca0b3701..ed7b58eb64d2 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -519,7 +519,7 @@ error: * @head: list to search * @size: size of node to find, must be a power of two. * - * Description: This function sorts the resource list by size and then returns + * Description: This function sorts the resource list by size and then * returns the first node of "size" length that is not in the ISA aliasing * window. If it finds a node larger than "size" it will split it up. 
*/ @@ -1202,7 +1202,7 @@ static u8 set_controller_speed(struct controller *ctrl, u8 adapter_speed, u8 hp_ mdelay(5); - /* Reenable interrupts */ + /* Re-enable interrupts */ writel(0, ctrl->hpc_reg + INT_MASK); pci_write_config_byte(ctrl->pci_dev, 0x41, reg); diff --git a/drivers/pci/hotplug/ibmphp.h b/drivers/pci/hotplug/ibmphp.h index e90a4ebf6550..0399c60d2ec1 100644 --- a/drivers/pci/hotplug/ibmphp.h +++ b/drivers/pci/hotplug/ibmphp.h @@ -352,7 +352,7 @@ struct resource_node { u32 len; int type; /* MEM, IO, PFMEM */ u8 fromMem; /* this is to indicate that the range is from - * from the Memory bucket rather than from PFMem */ + * the Memory bucket rather than from PFMem */ struct resource_node *next; struct resource_node *nextRange; /* for the other mem range on bus */ }; @@ -736,7 +736,7 @@ struct controller { int ibmphp_init_devno(struct slot **); /* This function is called from EBDA, so we need it not be static */ int ibmphp_do_disable_slot(struct slot *slot_cur); -int ibmphp_update_slot_info(struct slot *); /* This function is called from HPC, so we need it to not be be static */ +int ibmphp_update_slot_info(struct slot *); /* This function is called from HPC, so we need it to not be static */ int ibmphp_configure_card(struct pci_func *, u8); int ibmphp_unconfigure_card(struct slot **, int); extern const struct hotplug_slot_ops ibmphp_hotplug_slot_ops; diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c index 9e3b27744305..bd7557ca4910 100644 --- a/drivers/pci/hotplug/shpchp_hpc.c +++ b/drivers/pci/hotplug/shpchp_hpc.c @@ -295,7 +295,7 @@ static int shpc_write_cmd(struct slot *slot, u8 t_slot, u8 cmd) mutex_lock(&slot->ctrl->cmd_lock); if (!shpc_poll_ctrl_busy(ctrl)) { - /* After 1 sec and and the controller is still busy */ + /* After 1 sec and the controller is still busy */ ctrl_err(ctrl, "Controller is still busy after 1 sec\n"); retval = -EBUSY; goto out; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 2761ab86490d..df3bd7b40542 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -576,7 +576,7 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev) { int retval; - /* if the device was enabled before suspend, reenable */ + /* if the device was enabled before suspend, re-enable */ retval = pci_reenable_device(pci_dev); /* * if the device was busmaster before the suspend, make it busmaster diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 9784fdcf3006..9fa1f97e5b27 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -57,7 +57,7 @@ struct aer_stats { * "as seen by this device". Note that this may mean that if an * end point is causing problems, the AER counters may increment * at its link partner (e.g. root port) because the errors will be - * "seen" by the link partner and not the the problematic end point + * "seen" by the link partner and not the problematic end point * itself (which may report all counters as 0 as it never saw any * problems). */ diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 4537d1ea14fd..881c5d7c3d02 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2700,7 +2700,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, * then the device can't use INTx interrupts. Tegra's PCIe root ports don't * generate MSI interrupts for PME and AER events instead only INTx interrupts * are generated. 
Though Tegra's PCIe root ports can generate MSI interrupts - * for other events, since PCIe specificiation doesn't support using a mix of + * for other events, since PCIe specification doesn't support using a mix of * INTx and MSI/MSI-X, it is required to disable MSI interrupts to avoid port * service drivers registering their respective ISRs for MSIs. */ From 9bf3d20331295b1ecb81f4ed9ef358c51699a050 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 8 Oct 2021 17:38:20 +0800 Subject: [PATCH 057/512] quota: check block number when reading the block in quota file The block number in the quota tree on disk should be smaller than the v2_disk_dqinfo.dqi_blocks. If the quota file was corrupted, we may be allocating an 'allocated' block and that would lead to a loop in a tree, which will probably trigger oops later. This patch adds a check for the block number in the quota tree to prevent such potential issue. Link: https://lore.kernel.org/r/20211008093821.1001186-2-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara --- fs/quota/quota_tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index d3e995e1046f..583b7f7715f9 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -479,6 +479,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + newblk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; @@ -578,6 +585,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; + if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + blk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else From d0e36a62bd4c60c09acc40e06ba4831a4d0bc75b Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 8 Oct 2021 17:38:21 +0800 Subject: [PATCH 058/512] quota: correct error number in free_dqentry() Fix the error path in free_dqentry(), pass out the error number if the block to free is not correct. 
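In context (illustrative sketch only; the actual change is the single ret assignment in the hunk below), the corrected error path in free_dqentry() reads:

  if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
          quota_error(dquot->dq_sb, "Quota structure has offset to "
                      "other block (%u) than it should (%u)", blk,
                      (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
          ret = -EIO;     /* propagate the error instead of leaving ret untouched */
          goto out_buf;
  }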
Fixes: 1ccd14b9c271 ("quota: Split off quota tree handling into a separate file") Link: https://lore.kernel.org/r/20211008093821.1001186-3-yi.zhang@huawei.com Signed-off-by: Zhang Yi Cc: stable@kernel.org Signed-off-by: Jan Kara --- fs/quota/quota_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 583b7f7715f9..5f2405994280 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); From 4a667ba873088d23c4a6d7564239160c9af11783 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Oct 2021 22:16:57 -0700 Subject: [PATCH 059/512] s390/debug: fix kernel-doc warnings Fix kernel-doc warning due to incorrect parameter name in kernel-doc function notation: ../arch/s390/include/asm/debug.h:484: warning: Function parameter or member 'pages' not described in 'DEFINE_STATIC_DEBUG_INFO' ../arch/s390/include/asm/debug.h:484: warning: Excess function parameter 'pages_per_area' description in 'DEFINE_STATIC_DEBUG_INFO' Fixes: d72541f94512 ("s390/debug: add early tracing support") Signed-off-by: Randy Dunlap Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: linux-s390@vger.kernel.org Cc: Peter Oberparleiter Link: https://lore.kernel.org/r/20211005051657.16714-1-rdunlap@infradead.org Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 19a55e1e3a0c..77f24262c25c 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -462,7 +462,7 @@ arch_initcall(VNAME(var, reg)) * * @var: Name of debug_info_t variable * @name: Name of debug log (e.g. used for debugfs entry) - * @pages_per_area: Number of pages per area + * @pages: Number of pages per area * @nr_areas: Number of debug areas * @buf_size: Size of data area in each debug entry * @view: Pointer to debug view struct From 25d36a85c61b576401f69e0e205071e6b1befce8 Mon Sep 17 00:00:00 2001 From: Mete Durlu Date: Fri, 1 Oct 2021 15:52:00 +0200 Subject: [PATCH 060/512] s390/test_unwind: convert to KUnit Modified stack unwinder self tests to use kunit framework. The functionality stayed the same but the output format is now in tap13 format. 
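For reference, the KUnit parameterized-test pattern used in this conversion reduces to roughly the following skeleton (all names here are placeholders, and run_one_test() stands in for whatever the suite actually exercises):

  #include <kunit/test.h>

  struct test_params { int flags; char *name; };

  static const struct test_params param_list[] = {
          { .flags = 0, .name = "default" },
  };

  static void get_desc(const struct test_params *p, char *desc)
  {
          strscpy(desc, p->name, KUNIT_PARAM_DESC_SIZE);
  }
  /* generates example_gen_params() */
  KUNIT_ARRAY_PARAM(example, param_list, get_desc);

  static void example_case(struct kunit *test)
  {
          const struct test_params *p = test->param_value;

          KUNIT_EXPECT_EQ(test, 0, run_one_test(p->flags));
  }

  static struct kunit_case example_cases[] = {
          KUNIT_CASE_PARAM(example_case, example_gen_params),
          {}
  };

  static struct kunit_suite example_suite = {
          .name = "example",
          .test_cases = example_cases,
  };
  kunit_test_suites(&example_suite);

Each entry in param_list is run and reported as its own result in the TAP output, named via get_desc().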
Reviewed-by: Vasily Gorbik Signed-off-by: Mete Durlu Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 2 + arch/s390/lib/test_unwind.c | 169 ++++++++++++++++++++---------------- 2 files changed, 98 insertions(+), 73 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4d63662a919d..8ce2ee8ac88e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -938,6 +938,8 @@ menu "Selftests" config S390_UNWIND_SELFTEST def_tristate n + depends on KUNIT + default KUNIT_ALL_TESTS prompt "Test unwind functions" help This option enables s390 specific stack unwinder testing kernel diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index ecf327d743a0..cfc5f5557c06 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -3,7 +3,7 @@ * Test module for unwind_for_each_frame */ -#define pr_fmt(fmt) "test_unwind: " fmt +#include #include #include #include @@ -16,6 +16,8 @@ #include #include +struct kunit *current_test; + #define BT_BUF_SIZE (PAGE_SIZE * 4) /* @@ -29,7 +31,7 @@ static void print_backtrace(char *bt) p = strsep(&bt, "\n"); if (!p) break; - pr_err("%s\n", p); + kunit_err(current_test, "%s\n", p); } } @@ -49,7 +51,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC); if (!bt) { - pr_err("failed to allocate backtrace buffer\n"); + kunit_err(current_test, "failed to allocate backtrace buffer\n"); return -ENOMEM; } /* Unwind. */ @@ -63,7 +65,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, if (frame_count++ == max_frames) break; if (state.reliable && !addr) { - pr_err("unwind state reliable but addr is 0\n"); + kunit_err(current_test, "unwind state reliable but addr is 0\n"); ret = -EINVAL; break; } @@ -75,7 +77,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, stack_type_name(state.stack_info.type), (void *)state.sp, (void *)state.ip); if (bt_pos >= BT_BUF_SIZE) - pr_err("backtrace buffer is too small\n"); + kunit_err(current_test, "backtrace buffer is too small\n"); } frame_count += 1; if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) @@ -85,15 +87,15 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, /* Check the results. */ if (unwind_error(&state)) { - pr_err("unwind error\n"); + kunit_err(current_test, "unwind error\n"); ret = -EINVAL; } if (!seen_func2_func1) { - pr_err("unwindme_func2 and unwindme_func1 not found\n"); + kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n"); ret = -EINVAL; } if (frame_count == max_frames) { - pr_err("Maximum number of frames exceeded\n"); + kunit_err(current_test, "Maximum number of frames exceeded\n"); ret = -EINVAL; } if (ret) @@ -166,7 +168,7 @@ static noinline int unwindme_func4(struct unwindme *u) kp.pre_handler = pgm_pre_handler; ret = register_kprobe(&kp); if (ret < 0) { - pr_err("register_kprobe failed %d\n", ret); + kunit_err(current_test, "register_kprobe failed %d\n", ret); return -EINVAL; } @@ -252,7 +254,7 @@ static int test_unwind_irq(struct unwindme *u) } /* Spawns a task and passes it to test_unwind(). 
*/ -static int test_unwind_task(struct unwindme *u) +static int test_unwind_task(struct kunit *test, struct unwindme *u) { struct task_struct *task; int ret; @@ -267,7 +269,7 @@ static int test_unwind_task(struct unwindme *u) */ task = kthread_run(unwindme_func1, u, "%s", __func__); if (IS_ERR(task)) { - pr_err("kthread_run() failed\n"); + kunit_err(test, "kthread_run() failed\n"); return PTR_ERR(task); } /* @@ -282,77 +284,98 @@ static int test_unwind_task(struct unwindme *u) return ret; } -static int test_unwind_flags(int flags) +struct test_params { + int flags; + char *name; +}; + +/* + * Create required parameter list for tests + */ +static const struct test_params param_list[] = { + {.flags = UWM_DEFAULT, .name = "UWM_DEFAULT"}, + {.flags = UWM_SP, .name = "UWM_SP"}, + {.flags = UWM_REGS, .name = "UWM_REGS"}, + {.flags = UWM_SWITCH_STACK, + .name = "UWM_SWITCH_STACK"}, + {.flags = UWM_SP | UWM_REGS, + .name = "UWM_SP | UWM_REGS"}, + {.flags = UWM_CALLER | UWM_SP, + .name = "WM_CALLER | UWM_SP"}, + {.flags = UWM_CALLER | UWM_SP | UWM_REGS, + .name = "UWM_CALLER | UWM_SP | UWM_REGS"}, + {.flags = UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK, + .name = "UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK"}, + {.flags = UWM_THREAD, .name = "UWM_THREAD"}, + {.flags = UWM_THREAD | UWM_SP, + .name = "UWM_THREAD | UWM_SP"}, + {.flags = UWM_THREAD | UWM_CALLER | UWM_SP, + .name = "UWM_THREAD | UWM_CALLER | UWM_SP"}, + {.flags = UWM_IRQ, .name = "UWM_IRQ"}, + {.flags = UWM_IRQ | UWM_SWITCH_STACK, + .name = "UWM_IRQ | UWM_SWITCH_STACK"}, + {.flags = UWM_IRQ | UWM_SP, + .name = "UWM_IRQ | UWM_SP"}, + {.flags = UWM_IRQ | UWM_REGS, + .name = "UWM_IRQ | UWM_REGS"}, + {.flags = UWM_IRQ | UWM_SP | UWM_REGS, + .name = "UWM_IRQ | UWM_SP | UWM_REGS"}, + {.flags = UWM_IRQ | UWM_CALLER | UWM_SP, + .name = "UWM_IRQ | UWM_CALLER | UWM_SP"}, + {.flags = UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS, + .name = "UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS"}, + {.flags = UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK, + .name = "UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK"}, + #ifdef CONFIG_KPROBES + {.flags = UWM_PGM, .name = "UWM_PGM"}, + {.flags = UWM_PGM | UWM_SP, + .name = "UWM_PGM | UWM_SP"}, + {.flags = UWM_PGM | UWM_REGS, + .name = "UWM_PGM | UWM_REGS"}, + {.flags = UWM_PGM | UWM_SP | UWM_REGS, + .name = "UWM_PGM | UWM_SP | UWM_REGS"}, + #endif +}; + +/* + * Parameter description generator: required for KUNIT_ARRAY_PARAM() + */ +static void get_desc(const struct test_params *params, char *desc) +{ + strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE); +} + +/* + * Create test_unwind_gen_params + */ +KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc); + +static void test_unwind_flags(struct kunit *test) { struct unwindme u; + const struct test_params *params; - u.flags = flags; + current_test = test; + params = (const struct test_params *)test->param_value; + u.flags = params->flags; if (u.flags & UWM_THREAD) - return test_unwind_task(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_task(test, &u)); else if (u.flags & UWM_IRQ) - return test_unwind_irq(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u)); else - return unwindme_func1(&u); + KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u)); } -static int test_unwind_init(void) -{ - int failed = 0; - int total = 0; +static struct kunit_case unwind_test_cases[] = { + KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params), + {} +}; -#define TEST(flags) \ -do { \ - pr_info("[ RUN ] " #flags "\n"); \ - total++; \ - if 
(!test_unwind_flags((flags))) { \ - pr_info("[ OK ] " #flags "\n"); \ - } else { \ - pr_err("[ FAILED ] " #flags "\n"); \ - failed++; \ - } \ -} while (0) +static struct kunit_suite test_unwind_suite = { + .name = "test_unwind", + .test_cases = unwind_test_cases, +}; - pr_info("running stack unwinder tests"); - TEST(UWM_DEFAULT); - TEST(UWM_SP); - TEST(UWM_REGS); - TEST(UWM_SWITCH_STACK); - TEST(UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP); - TEST(UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); - TEST(UWM_THREAD); - TEST(UWM_THREAD | UWM_SP); - TEST(UWM_THREAD | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ); - TEST(UWM_IRQ | UWM_SWITCH_STACK); - TEST(UWM_IRQ | UWM_SP); - TEST(UWM_IRQ | UWM_REGS); - TEST(UWM_IRQ | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); -#ifdef CONFIG_KPROBES - TEST(UWM_PGM); - TEST(UWM_PGM | UWM_SP); - TEST(UWM_PGM | UWM_REGS); - TEST(UWM_PGM | UWM_SP | UWM_REGS); -#endif -#undef TEST - if (failed) { - pr_err("%d of %d stack unwinder tests failed", failed, total); - WARN(1, "%d of %d stack unwinder tests failed", failed, total); - } else { - pr_info("all %d stack unwinder tests passed", total); - } +kunit_test_suites(&test_unwind_suite); - return failed ? -EINVAL : 0; -} - -static void test_unwind_exit(void) -{ -} - -module_init(test_unwind_init); -module_exit(test_unwind_exit); MODULE_LICENSE("GPL"); From fbbd140737121637b98aef53440c7a2676e412cf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 1 Oct 2021 12:56:55 +0200 Subject: [PATCH 061/512] s390/barrier: factor out bcr_serialize() Factor out bcr_serialize() inline assembly function which describes what the bcr instruction is used for. Use bcr_serialize() like before in mb(), but also in upcoming changes. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/barrier.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index f9eddbca79d2..2c057e1f3200 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -16,20 +16,24 @@ #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES /* Fast-BCR without checkpoint synchronization */ -#define __ASM_BARRIER "bcr 14,0\n" +#define __ASM_BCR_SERIALIZE "bcr 14,0\n" #else -#define __ASM_BARRIER "bcr 15,0\n" +#define __ASM_BCR_SERIALIZE "bcr 15,0\n" #endif -#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0) +static __always_inline void bcr_serialize(void) +{ + asm volatile(__ASM_BCR_SERIALIZE : : : "memory"); +} -#define rmb() barrier() -#define wmb() barrier() -#define dma_rmb() mb() -#define dma_wmb() mb() -#define __smp_mb() mb() -#define __smp_rmb() rmb() -#define __smp_wmb() wmb() +#define mb() bcr_serialize() +#define rmb() barrier() +#define wmb() barrier() +#define dma_rmb() mb() +#define dma_wmb() mb() +#define __smp_mb() mb() +#define __smp_rmb() rmb() +#define __smp_wmb() wmb() #define __smp_store_release(p, v) \ do { \ From e16d02ee3f348900a0a2cf41931b204e2042f5e3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 1 Oct 2021 12:47:01 +0200 Subject: [PATCH 062/512] s390: introduce text_poke_sync() Introduce a text_poke_sync() similar to what x86 has. This can be used to execute a serializing instruction on all CPUs (including the current one). 
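As an illustration only (not part of this patch; "code" and "new_insn" are made-up names), the intended usage pattern at a code patching site is to modify the instruction first and serialize afterwards, for example with the existing s390_kernel_write() helper:

        /* illustrative sketch: patch kernel text, then serialize all CPUs */
        s390_kernel_write(code, &new_insn, sizeof(new_insn));
        text_poke_sync();
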
Note: according to the Principles of Operation an IPI (= interrupt) already serializes a CPU; however, it is better to be explicit. In addition, on_each_cpu() makes sure that the current CPU is also serialized, so that possible preemption cannot lead to a theoretical case where a CPU is left unserialized. Therefore text_poke_sync() has to be used whenever code has been modified, to avoid relying on implicit serialization. Also introduce text_poke_sync_lock(), which additionally disables CPU hotplug, to prevent a CPU from coming online with a prefetched, stale version of a modified instruction. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/text-patching.h | 16 ++++++++++++++++ arch/s390/kernel/alternative.c | 20 ++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 arch/s390/include/asm/text-patching.h diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h new file mode 100644 index 000000000000..b219056a8817 --- /dev/null +++ b/arch/s390/include/asm/text-patching.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_TEXT_PATCHING_H +#define _ASM_S390_TEXT_PATCHING_H + +#include + +static __always_inline void sync_core(void) +{ + bcr_serialize(); +} + +void text_poke_sync(void); +void text_poke_sync_lock(void); + +#endif /* _ASM_S390_TEXT_PATCHING_H */ diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index c22ea1c3ef84..cce0ddee2d02 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include +#include #include #include #include @@ -110,3 +113,20 @@ void __init apply_alternative_instructions(void) { apply_alternatives(__alt_instructions, __alt_instructions_end); } + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} From 1c27dfb24e3b2a026488aa51e9991e27a1d94164 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 1 Oct 2021 14:15:56 +0200 Subject: [PATCH 063/512] s390/jump_label: use text_poke_sync() Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/jump_label.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 9156653b56f6..1e245da82197 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -6,8 +6,8 @@ * Author(s): Jan Glauber */ #include -#include #include +#include #include struct insn { @@ -72,15 +72,11 @@ static void __jump_label_transform(struct jump_entry *entry, s390_kernel_write(code, &new, sizeof(new)); } -static void __jump_label_sync(void *dummy) -{ -} - void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { __jump_label_transform(entry, type, 0); - smp_call_function(__jump_label_sync, NULL, 1); + text_poke_sync(); } void arch_jump_label_transform_static(struct jump_entry *entry, From ae2b9a11b494a9ec59d7dc11d42654d659f859c6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 1 Oct 2021 14:21:43 +0200 Subject: [PATCH 064/512] s390/ftrace: use text_poke_sync_lock() Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ftrace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5d0c45c13b5f..04a3e88c58d7 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -207,14 +208,13 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -static void __ftrace_sync(void *dummy) -{ -} - int ftrace_arch_code_modify_post_process(void) { - /* Send SIGP to the other CPUs, so they see the new code. */ - smp_call_function(__ftrace_sync, NULL, 1); + /* + * Flush any pre-fetched instructions on all + * CPUs to make the new code visible. + */ + text_poke_sync_lock(); return 0; } From e5873d6f7a7aa5f9de73ca829034fcaaf7b1d25e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 1 Oct 2021 14:24:42 +0200 Subject: [PATCH 065/512] s390/ftrace: add missing serialization for graph caller patching CPUs must be serialized also when ftrace_graph_caller gets patched. This is missing since ftrace function graph support was added on s390. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 04a3e88c58d7..e416aaaaf524 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -268,12 +268,14 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); int ftrace_enable_ftrace_graph_caller(void) { brcl_disable(ftrace_graph_caller); + text_poke_sync_lock(); return 0; } int ftrace_disable_ftrace_graph_caller(void) { brcl_enable(ftrace_graph_caller); + text_poke_sync_lock(); return 0; } From 4e0502b8b31032abbc0424cff222139288a3891a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Oct 2021 12:02:35 +0200 Subject: [PATCH 066/512] s390/jump_label: make use of HAVE_JUMP_LABEL_BATCH Specify HAVE_JUMP_LABEL_BATCH in header file. This allows to make use of the arch_jump_label_transform_queue()/arch_jump_label_transform_apply() mechanism. However unlike on x86, which currently is the only user of this mechanism, the to be patched instructions are still directly modified. The only difference to before is that serialization is only done after all instructions have been modified. This way the number of serialization/synchronization events is reduced. 
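As a rough sketch (illustrative only; this is how the generic jump label core drives the hooks, not code added by this patch), batching means the arch callbacks end up being invoked like this:

        /* illustrative sketch: serialize once per batch instead of once per entry */
        for (entry = start; entry < stop; entry++)
                arch_jump_label_transform_queue(entry, type);
        arch_jump_label_transform_apply();      /* single text_poke_sync() */

so the cost of serializing all CPUs is paid once per update batch.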
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/jump_label.h | 2 ++ arch/s390/kernel/jump_label.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index dcb1bba4f406..916cfcb36d8a 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -2,6 +2,8 @@ #ifndef _ASM_S390_JUMP_LABEL_H #define _ASM_S390_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #ifndef __ASSEMBLY__ #include diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 1e245da82197..edefb40f172b 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -79,8 +79,21 @@ void arch_jump_label_transform(struct jump_entry *entry, text_poke_sync(); } +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + __jump_label_transform(entry, type, 0); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + text_poke_sync(); +} + void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { __jump_label_transform(entry, type, 1); + text_poke_sync(); } From acd6c9afc63cbb571d5312e7f7583bf266e61f69 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Oct 2021 12:03:42 +0200 Subject: [PATCH 067/512] s390/jump_label: rename __jump_label_transform() Trivial patch just to get rid of the leading underscores. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/jump_label.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index edefb40f172b..0546c67da99b 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -48,9 +48,9 @@ static struct insn orignop = { .offset = JUMP_LABEL_NOP_OFFSET >> 1, }; -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { void *code = (void *)jump_entry_code(entry); struct insn old, new; @@ -75,14 +75,14 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, 0); + jump_label_transform(entry, type, 0); text_poke_sync(); } bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, 0); + jump_label_transform(entry, type, 0); return true; } @@ -94,6 +94,6 @@ void arch_jump_label_transform_apply(void) void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, 1); + jump_label_transform(entry, type, 1); text_poke_sync(); } From 0c14c037952c0e29c3c66ccad6301648e02e11e1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Oct 2021 12:07:35 +0200 Subject: [PATCH 068/512] s390/jump_label: add __init_or_module annotation Add missing __init_or_module to arch_jump_label_transform_static(). 
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/jump_label.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 0546c67da99b..6bec000c6c1c 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include @@ -91,8 +92,8 @@ void arch_jump_label_transform_apply(void) text_poke_sync(); } -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +void __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) { jump_label_transform(entry, type, 1); text_poke_sync(); From 5740a7c71ab6721863f7dcfc8ab96be00b0a4b58 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Oct 2021 16:10:24 +0200 Subject: [PATCH 069/512] s390/ftrace: add HAVE_DYNAMIC_FTRACE_WITH_ARGS support Add HAVE_DYNAMIC_FTRACE_WITH_ARGS support similar to commit 02a474ca266a ("ftrace/x86: Allow for arguments to be passed in to ftrace_regs by default"). s390's ftrace implementation always provides all registers with pt_regs, therefore this is trivial. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ftrace.h | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8ce2ee8ac88e..03eb3ec08b07 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -153,6 +153,7 @@ config S390 select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_ARGS select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index d1841f4176be..98c066cb347f 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -42,6 +42,15 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &fregs->regs; +} + /* * Even though the system call numbers are identical for s390/s390x a * different system call table is used for compat tasks. This may lead From 176510ebecd1ca201a8808dbbee9991eca005207 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Oct 2021 16:16:45 +0200 Subject: [PATCH 070/512] s390/ftrace: add ftrace_instruction_pointer_set() helper function Add ftrace_instruction_pointer_set() helper function to match x86. See commit 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of REGS when ARGS is available"). 
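For illustration only (a hypothetical callback, not part of this patch), an ftrace ops handler that redirects execution would use the helper like this:

        /* hypothetical example: redirect the traced function to my_replacement() */
        static void my_handler(unsigned long ip, unsigned long parent_ip,
                               struct ftrace_ops *op, struct ftrace_regs *fregs)
        {
                ftrace_instruction_pointer_set(fregs, (unsigned long)my_replacement);
        }

which is essentially what klp_arch_set_pc() does for livepatch after this change.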
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ftrace.h | 8 ++++++++ arch/s390/include/asm/livepatch.h | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 98c066cb347f..3baceb4a7d3c 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -51,6 +51,14 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs * return &fregs->regs; } +static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, + unsigned long ip) +{ + struct pt_regs *regs = arch_ftrace_get_regs(fregs); + + regs->psw.addr = ip; +} + /* * Even though the system call numbers are identical for s390/s390x a * different system call table is used for compat tasks. This may lead diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index d578a8c76676..5209f223331a 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -16,9 +16,7 @@ static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - struct pt_regs *regs = ftrace_get_regs(fregs); - - regs->psw.addr = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif From 894979689d3ac4780601b4506a03b4722398caf5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Oct 2021 20:24:12 +0200 Subject: [PATCH 071/512] s390/ftrace: provide separate ftrace_caller/ftrace_regs_caller implementations ftrace_regs_caller is an alias to ftrace_caller - making ftrace_caller quite heavyweight. Split the function and provide an ftrace_caller implementation which comes with fewer instructions. Especially getting rid of 'stosm' on each function entry should help here, e.g. to have less performance impact on live patched functions. 
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ftrace.h | 1 - arch/s390/kernel/mcount.S | 41 ++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 3baceb4a7d3c..4a721b44d4f4 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -17,7 +17,6 @@ void ftrace_caller(void); -extern char ftrace_graph_caller_end; extern void *ftrace_func; struct dyn_arch_ftrace { }; diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 6b13797143a7..cc25fbd75ea9 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -33,13 +33,15 @@ ENDPROC(ftrace_stub) #define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD #endif -ENTRY(ftrace_caller) - .globl ftrace_regs_caller - .set ftrace_regs_caller,ftrace_caller + .macro ftrace_regs_entry, allregs=0 stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller + + .if \allregs == 1 lghi %r14,0 # save condition code ipm %r14 # don't put any instructions sllg %r14,%r14,16 # clobbering CC before this point + .endif + lgr %r1,%r15 # allocate stack frame for ftrace_caller to contain traced function aghi %r15,-TRACED_FUNC_FRAME_SIZE @@ -49,13 +51,30 @@ ENTRY(ftrace_caller) # allocate pt_regs and stack frame for ftrace_trace_function aghi %r15,-STACK_FRAME_SIZE stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) + + .if \allregs == 1 stg %r14,(STACK_PTREGS_PSW)(%r15) - lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address stosm (STACK_PTREGS_PSW)(%r15),0 + .endif + + lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address aghi %r1,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) stg %r0,(STACK_PTREGS_PSW+8)(%r15) stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) + .endm + +SYM_CODE_START(ftrace_regs_caller) + ftrace_regs_entry 1 + j ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + ftrace_regs_entry 0 + j ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op @@ -74,24 +93,22 @@ ENTRY(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. 
- .globl ftrace_graph_caller -ftrace_graph_caller: - j ftrace_graph_caller_end +SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) + j .Lftrace_graph_caller_end lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) lg %r4,(STACK_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) -ftrace_graph_caller_end: - .globl ftrace_graph_caller_end +.Lftrace_graph_caller_end: #endif lg %r1,(STACK_PTREGS_PSW+8)(%r15) lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 -ENDPROC(ftrace_caller) +SYM_CODE_END(ftrace_common) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(return_to_handler) +SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 aghi %r15,-STACK_FRAME_OVERHEAD @@ -101,6 +118,6 @@ ENTRY(return_to_handler) lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 -ENDPROC(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif From 885359c42942566c91bbc93eb1acfaea86d95bcf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 7 Oct 2021 18:12:50 +0200 Subject: [PATCH 072/512] s390/ptrace: fix coding style Reported-by: Steffen Maier Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ptrace.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 61b22aa990e7..662ee212ba51 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -76,8 +76,7 @@ enum { * The pt_regs struct defines the way the registers are stored on * the stack during a system call. */ -struct pt_regs -{ +struct pt_regs { union { user_pt_regs user_regs; struct { From 3990b5baf225a3a96e70a784747d31002686c300 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 6 Oct 2021 14:38:14 +0200 Subject: [PATCH 073/512] selftests/ftrace: add s390 support for kprobe args tests This is the s390 variant of commit 9855c4626c67 ("selftests/ftrace: Add ppc support for kprobe args tests"). Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- .../selftests/ftrace/test.d/kprobe/kprobe_args_string.tc | 3 +++ .../selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index 84285a6f60b0..dc7ade196798 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -22,6 +22,9 @@ ppc64*) ppc*) ARG1=%r3 ;; +s390*) + ARG1=%r2 +;; *) echo "Please implement other architecture here" exit_untested diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc index 474ca1a9a088..47d84b5cb6ca 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc @@ -32,6 +32,10 @@ ppc*) GOODREG=%r3 BADREG=%msr ;; +s390*) + GOODREG=%r2 + BADREG=%s2 +;; *) echo "Please implement other architecture here" exit_untested From a30b5b03047602be56c5b8ffc3a0e4cfed17561c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 6 Oct 2021 11:59:29 +0200 Subject: [PATCH 074/512] s390/ptrace: add function argument access API Add regs_get_kernel_argument() which returns Nth argument of a function call. This enables ftrace kprobe events to access kernel function arguments via $argN syntax. 
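For illustration only (a hypothetical kprobe pre-handler, not part of this patch), the new helper can be used like this:

        /* hypothetical example: dump the first two arguments of a probed function */
        static int handler_pre(struct kprobe *p, struct pt_regs *regs)
        {
                unsigned long arg1 = regs_get_kernel_argument(regs, 0);
                unsigned long arg2 = regs_get_kernel_argument(regs, 1);

                pr_info("arg1=%lx arg2=%lx\n", arg1, arg2);
                return 0;
        }

The same accessor is what allows kprobe trace events to resolve $arg1, $arg2, ... on s390.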
This is the s390 variant of commit a823c35ff2ed ("arm64: ptrace: Add function argument access API"). Acked-by: Ilya Leoshkevich Reviewed-by: Steffen Maier Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ptrace.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 03eb3ec08b07..c35c98e515a2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -160,6 +160,7 @@ config S390 select HAVE_FAST_GUP select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 662ee212ba51..82fc11907451 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -196,6 +196,25 @@ const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_kernel_argument() returns @n th argument of the function call. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ + unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long); + +#define NR_REG_ARGUMENTS 5 + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, 2 + n); + n -= NR_REG_ARGUMENTS; + return regs_get_kernel_stack_nth(regs, argoffset + n); +} + static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->gprs[15]; From b2f583937aad0e43cdf18186070e5502983f60fd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Oct 2021 16:02:01 +0300 Subject: [PATCH 075/512] s390/cmm: use string_upper() instead of open coded variant Use string_upper() from string helper module instead of open coded variant. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211001130201.72545-1-andriy.shevchenko@linux.intel.com [hca@linux.ibm.com: removed hunk which converts extmem.c] Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/cmm.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 1141c8d5c0d0..2203164b39da 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -394,13 +394,10 @@ static int __init cmm_init(void) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) From bf2928c7a284e31f9f91a004b5dca09c522157c0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 10 Sep 2021 08:22:06 +0200 Subject: [PATCH 076/512] PCI/VPD: Add pci_read/write_vpd_any() In certain cases we need a variant of pci_read_vpd()/pci_write_vpd() that does not check against dev->vpd.len. 
Such cases are: - Reading VPD if dev->vpd.len isn't set yet (in pci_vpd_size()) - Devices that map non-VPD information to arbitrary places in VPD address space (example: Chelsio T3 EEPROM write-protect flag) Therefore add pci_read_vpd_any() and pci_write_vpd_any() that check against PCI_VPD_MAX_SIZE only. Link: https://lore.kernel.org/r/93ecce28-a158-f02a-d134-8afcaced8efe@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Acked-by: Jakub Kicinski --- drivers/pci/vpd.c | 72 +++++++++++++++++++++++++++++++-------------- include/linux/pci.h | 2 ++ 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 4be24890132e..0a804715d98e 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -156,9 +156,10 @@ static int pci_vpd_wait(struct pci_dev *dev, bool set) } static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, - void *arg) + void *arg, bool check_size) { struct pci_vpd *vpd = &dev->vpd; + unsigned int max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE; int ret = 0; loff_t end = pos + count; u8 *buf = arg; @@ -169,11 +170,11 @@ static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, if (pos < 0) return -EINVAL; - if (pos > vpd->len) + if (pos >= max_len) return 0; - if (end > vpd->len) { - end = vpd->len; + if (end > max_len) { + end = max_len; count = end - pos; } @@ -217,9 +218,10 @@ static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, } static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count, - const void *arg) + const void *arg, bool check_size) { struct pci_vpd *vpd = &dev->vpd; + unsigned int max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE; const u8 *buf = arg; loff_t end = pos + count; int ret = 0; @@ -230,7 +232,7 @@ static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count, if (pos < 0 || (pos & 3) || (count & 3)) return -EINVAL; - if (end > vpd->len) + if (end > max_len) return -EINVAL; if (mutex_lock_killable(&vpd->lock)) @@ -381,6 +383,24 @@ static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, return -ENOENT; } +static ssize_t __pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf, + bool check_size) +{ + ssize_t ret; + + if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) { + dev = pci_get_func0_dev(dev); + if (!dev) + return -ENODEV; + + ret = pci_vpd_read(dev, pos, count, buf, check_size); + pci_dev_put(dev); + return ret; + } + + return pci_vpd_read(dev, pos, count, buf, check_size); +} + /** * pci_read_vpd - Read one entry from Vital Product Data * @dev: PCI device struct @@ -389,6 +409,20 @@ static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, * @buf: pointer to where to store result */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf) +{ + return __pci_read_vpd(dev, pos, count, buf, true); +} +EXPORT_SYMBOL(pci_read_vpd); + +/* Same, but allow to access any address */ +ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf) +{ + return __pci_read_vpd(dev, pos, count, buf, false); +} +EXPORT_SYMBOL(pci_read_vpd_any); + +static ssize_t __pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, + const void *buf, bool check_size) { ssize_t ret; @@ -397,14 +431,13 @@ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf) if (!dev) return -ENODEV; - ret = pci_vpd_read(dev, pos, count, buf); + ret = pci_vpd_write(dev, pos, count, buf, check_size); pci_dev_put(dev); return ret; } - 
return pci_vpd_read(dev, pos, count, buf); + return pci_vpd_write(dev, pos, count, buf, check_size); } -EXPORT_SYMBOL(pci_read_vpd); /** * pci_write_vpd - Write entry to Vital Product Data @@ -415,22 +448,17 @@ EXPORT_SYMBOL(pci_read_vpd); */ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf) { - ssize_t ret; - - if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) { - dev = pci_get_func0_dev(dev); - if (!dev) - return -ENODEV; - - ret = pci_vpd_write(dev, pos, count, buf); - pci_dev_put(dev); - return ret; - } - - return pci_vpd_write(dev, pos, count, buf); + return __pci_write_vpd(dev, pos, count, buf, true); } EXPORT_SYMBOL(pci_write_vpd); +/* Same, but allow to access any address */ +ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf) +{ + return __pci_write_vpd(dev, pos, count, buf, false); +} +EXPORT_SYMBOL(pci_write_vpd_any); + int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, const char *kw, unsigned int *size) { diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..9649bd9e468d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1350,6 +1350,8 @@ void pci_unlock_rescan_remove(void); /* Vital Product Data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); +ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf); +ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); From f55fee56a631032969480e4b0ee5d79734fe3c69 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 20 Sep 2021 12:29:45 +0530 Subject: [PATCH 077/512] PCI: qcom-ep: Add Qualcomm PCIe Endpoint controller driver Add driver for Qualcomm PCIe Endpoint controller based on the DesignWare core with added Qualcomm-specific wrapper around the core. The driver support is very basic such that it supports only enumeration, PCIe read/write, and MSI. There is no ASPM and PM support for now but these will be added later. The driver is capable of using the PERST# and WAKE# side-band GPIOs for operation and written on top of the DWC PCI framework. [bhelgaas: wrap a few long lines] Co-developed-by: Siddartha Mohanadoss [mani: restructured the driver and fixed several bugs for upstream] Link: https://lore.kernel.org/r/20210920065946.15090-3-manivannan.sadhasivam@linaro.org Signed-off-by: Siddartha Mohanadoss Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring --- drivers/pci/controller/dwc/Kconfig | 10 + drivers/pci/controller/dwc/Makefile | 1 + .../pci/controller/dwc/pcie-designware-ep.c | 3 + drivers/pci/controller/dwc/pcie-qcom-ep.c | 721 ++++++++++++++++++ 4 files changed, 735 insertions(+) create mode 100644 drivers/pci/controller/dwc/pcie-qcom-ep.c diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 76c0a63a3f64..49bcbe18a575 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -180,6 +180,16 @@ config PCIE_QCOM PCIe controller uses the DesignWare core plus Qualcomm-specific hardware wrappers. 
+config PCIE_QCOM_EP + tristate "Qualcomm PCIe controller - Endpoint mode" + depends on OF && (ARCH_QCOM || COMPILE_TEST) + depends on PCI_ENDPOINT + select PCIE_DW_EP + help + Say Y here to enable support for the PCIe controllers on Qualcomm SoCs + to work in endpoint mode. The PCIe controller uses the DesignWare core + plus Qualcomm-specific hardware wrappers. + config PCIE_ARMADA_8K bool "Marvell Armada-8K PCIe controller" depends on ARCH_MVEBU || COMPILE_TEST diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index 73244409792c..8ba7b67f5e50 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o +obj-$(CONFIG_PCIE_QCOM_EP) += pcie-qcom-ep.o obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o obj-$(CONFIG_PCIE_ROCKCHIP_DW_HOST) += pcie-dw-rockchip.o diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 998b698f4085..0eda8236c125 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -83,6 +83,7 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) for (func_no = 0; func_no < funcs; func_no++) __dw_pcie_ep_reset_bar(pci, func_no, bar, 0); } +EXPORT_SYMBOL_GPL(dw_pcie_ep_reset_bar); static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8 func_no, u8 cap_ptr, u8 cap) @@ -485,6 +486,7 @@ int dw_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep, u8 func_no) return -EINVAL; } +EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_legacy_irq); int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, u8 interrupt_num) @@ -536,6 +538,7 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, return 0; } +EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_msi_irq); int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no, u16 interrupt_num) diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c new file mode 100644 index 000000000000..7b17da2f9b3f --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -0,0 +1,721 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Qualcomm PCIe Endpoint controller driver + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. 
+ * Author: Siddartha Mohanadoss +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-designware.h" + +/* PARF registers */ +#define PARF_SYS_CTRL 0x00 +#define PARF_DB_CTRL 0x10 +#define PARF_PM_CTRL 0x20 +#define PARF_MHI_BASE_ADDR_LOWER 0x178 +#define PARF_MHI_BASE_ADDR_UPPER 0x17c +#define PARF_DEBUG_INT_EN 0x190 +#define PARF_AXI_MSTR_RD_HALT_NO_WRITES 0x1a4 +#define PARF_AXI_MSTR_WR_ADDR_HALT 0x1a8 +#define PARF_Q2A_FLUSH 0x1ac +#define PARF_LTSSM 0x1b0 +#define PARF_CFG_BITS 0x210 +#define PARF_INT_ALL_STATUS 0x224 +#define PARF_INT_ALL_CLEAR 0x228 +#define PARF_INT_ALL_MASK 0x22c +#define PARF_SLV_ADDR_MSB_CTRL 0x2c0 +#define PARF_DBI_BASE_ADDR 0x350 +#define PARF_DBI_BASE_ADDR_HI 0x354 +#define PARF_SLV_ADDR_SPACE_SIZE 0x358 +#define PARF_SLV_ADDR_SPACE_SIZE_HI 0x35c +#define PARF_ATU_BASE_ADDR 0x634 +#define PARF_ATU_BASE_ADDR_HI 0x638 +#define PARF_SRIS_MODE 0x644 +#define PARF_DEVICE_TYPE 0x1000 +#define PARF_BDF_TO_SID_CFG 0x2c00 + +/* PARF_INT_ALL_{STATUS/CLEAR/MASK} register fields */ +#define PARF_INT_ALL_LINK_DOWN BIT(1) +#define PARF_INT_ALL_BME BIT(2) +#define PARF_INT_ALL_PM_TURNOFF BIT(3) +#define PARF_INT_ALL_DEBUG BIT(4) +#define PARF_INT_ALL_LTR BIT(5) +#define PARF_INT_ALL_MHI_Q6 BIT(6) +#define PARF_INT_ALL_MHI_A7 BIT(7) +#define PARF_INT_ALL_DSTATE_CHANGE BIT(8) +#define PARF_INT_ALL_L1SUB_TIMEOUT BIT(9) +#define PARF_INT_ALL_MMIO_WRITE BIT(10) +#define PARF_INT_ALL_CFG_WRITE BIT(11) +#define PARF_INT_ALL_BRIDGE_FLUSH_N BIT(12) +#define PARF_INT_ALL_LINK_UP BIT(13) +#define PARF_INT_ALL_AER_LEGACY BIT(14) +#define PARF_INT_ALL_PLS_ERR BIT(15) +#define PARF_INT_ALL_PME_LEGACY BIT(16) +#define PARF_INT_ALL_PLS_PME BIT(17) + +/* PARF_BDF_TO_SID_CFG register fields */ +#define PARF_BDF_TO_SID_BYPASS BIT(0) + +/* PARF_DEBUG_INT_EN register fields */ +#define PARF_DEBUG_INT_PM_DSTATE_CHANGE BIT(1) +#define PARF_DEBUG_INT_CFG_BUS_MASTER_EN BIT(2) +#define PARF_DEBUG_INT_RADM_PM_TURNOFF BIT(3) + +/* PARF_DEVICE_TYPE register fields */ +#define PARF_DEVICE_TYPE_EP 0x0 + +/* PARF_PM_CTRL register fields */ +#define PARF_PM_CTRL_REQ_EXIT_L1 BIT(1) +#define PARF_PM_CTRL_READY_ENTR_L23 BIT(2) +#define PARF_PM_CTRL_REQ_NOT_ENTR_L1 BIT(5) + +/* PARF_AXI_MSTR_RD_HALT_NO_WRITES register fields */ +#define PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN BIT(0) + +/* PARF_AXI_MSTR_WR_ADDR_HALT register fields */ +#define PARF_AXI_MSTR_WR_ADDR_HALT_EN BIT(31) + +/* PARF_Q2A_FLUSH register fields */ +#define PARF_Q2A_FLUSH_EN BIT(16) + +/* PARF_SYS_CTRL register fields */ +#define PARF_SYS_CTRL_AUX_PWR_DET BIT(4) +#define PARF_SYS_CTRL_CORE_CLK_CGC_DIS BIT(6) +#define PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE BIT(11) + +/* PARF_DB_CTRL register fields */ +#define PARF_DB_CTRL_INSR_DBNCR_BLOCK BIT(0) +#define PARF_DB_CTRL_RMVL_DBNCR_BLOCK BIT(1) +#define PARF_DB_CTRL_DBI_WKP_BLOCK BIT(4) +#define PARF_DB_CTRL_SLV_WKP_BLOCK BIT(5) +#define PARF_DB_CTRL_MST_WKP_BLOCK BIT(6) + +/* PARF_CFG_BITS register fields */ +#define PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN BIT(1) + +/* ELBI registers */ +#define ELBI_SYS_STTS 0x08 + +/* DBI registers */ +#define DBI_CON_STATUS 0x44 + +/* DBI register fields */ +#define DBI_CON_STATUS_POWER_STATE_MASK GENMASK(1, 0) + +#define XMLH_LINK_UP 0x400 +#define CORE_RESET_TIME_US_MIN 1000 +#define CORE_RESET_TIME_US_MAX 1005 +#define WAKE_DELAY_US 2000 /* 2 ms */ + +#define to_pcie_ep(x) dev_get_drvdata((x)->dev) + +enum qcom_pcie_ep_link_status { + QCOM_PCIE_EP_LINK_DISABLED, + QCOM_PCIE_EP_LINK_ENABLED, + QCOM_PCIE_EP_LINK_UP, + 
QCOM_PCIE_EP_LINK_DOWN, +}; + +static struct clk_bulk_data qcom_pcie_ep_clks[] = { + { .id = "cfg" }, + { .id = "aux" }, + { .id = "bus_master" }, + { .id = "bus_slave" }, + { .id = "ref" }, + { .id = "sleep" }, + { .id = "slave_q2a" }, +}; + +struct qcom_pcie_ep { + struct dw_pcie pci; + + void __iomem *parf; + void __iomem *elbi; + struct regmap *perst_map; + struct resource *mmio_res; + + struct reset_control *core_reset; + struct gpio_desc *reset; + struct gpio_desc *wake; + struct phy *phy; + + u32 perst_en; + u32 perst_sep_en; + + enum qcom_pcie_ep_link_status link_status; + int global_irq; + int perst_irq; +}; + +static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep) +{ + struct dw_pcie *pci = &pcie_ep->pci; + struct device *dev = pci->dev; + int ret; + + ret = reset_control_assert(pcie_ep->core_reset); + if (ret) { + dev_err(dev, "Cannot assert core reset\n"); + return ret; + } + + usleep_range(CORE_RESET_TIME_US_MIN, CORE_RESET_TIME_US_MAX); + + ret = reset_control_deassert(pcie_ep->core_reset); + if (ret) { + dev_err(dev, "Cannot de-assert core reset\n"); + return ret; + } + + usleep_range(CORE_RESET_TIME_US_MIN, CORE_RESET_TIME_US_MAX); + + return 0; +} + +/* + * Delatch PERST_EN and PERST_SEPARATION_ENABLE with TCSR to avoid + * device reset during host reboot and hibernation. The driver is + * expected to handle this situation. + */ +static void qcom_pcie_ep_configure_tcsr(struct qcom_pcie_ep *pcie_ep) +{ + regmap_write(pcie_ep->perst_map, pcie_ep->perst_en, 0); + regmap_write(pcie_ep->perst_map, pcie_ep->perst_sep_en, 0); +} + +static int qcom_pcie_dw_link_up(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + u32 reg; + + reg = readl_relaxed(pcie_ep->elbi + ELBI_SYS_STTS); + + return reg & XMLH_LINK_UP; +} + +static int qcom_pcie_dw_start_link(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + + enable_irq(pcie_ep->perst_irq); + + return 0; +} + +static void qcom_pcie_dw_stop_link(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + + disable_irq(pcie_ep->perst_irq); +} + +static int qcom_pcie_perst_deassert(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + struct device *dev = pci->dev; + u32 val, offset; + int ret; + + ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + if (ret) + return ret; + + ret = qcom_pcie_ep_core_reset(pcie_ep); + if (ret) + goto err_disable_clk; + + ret = phy_init(pcie_ep->phy); + if (ret) + goto err_disable_clk; + + ret = phy_power_on(pcie_ep->phy); + if (ret) + goto err_phy_exit; + + /* Assert WAKE# to RC to indicate device is ready */ + gpiod_set_value_cansleep(pcie_ep->wake, 1); + usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500); + gpiod_set_value_cansleep(pcie_ep->wake, 0); + + qcom_pcie_ep_configure_tcsr(pcie_ep); + + /* Disable BDF to SID mapping */ + val = readl_relaxed(pcie_ep->parf + PARF_BDF_TO_SID_CFG); + val |= PARF_BDF_TO_SID_BYPASS; + writel_relaxed(val, pcie_ep->parf + PARF_BDF_TO_SID_CFG); + + /* Enable debug IRQ */ + val = readl_relaxed(pcie_ep->parf + PARF_DEBUG_INT_EN); + val |= PARF_DEBUG_INT_RADM_PM_TURNOFF | + PARF_DEBUG_INT_CFG_BUS_MASTER_EN | + PARF_DEBUG_INT_PM_DSTATE_CHANGE; + writel_relaxed(val, pcie_ep->parf + PARF_DEBUG_INT_EN); + + /* Configure PCIe to endpoint mode */ + writel_relaxed(PARF_DEVICE_TYPE_EP, pcie_ep->parf + PARF_DEVICE_TYPE); + + /* Allow entering L1 state */ + val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL); + val &= ~PARF_PM_CTRL_REQ_NOT_ENTR_L1; + 
writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL); + + /* Read halts write */ + val = readl_relaxed(pcie_ep->parf + PARF_AXI_MSTR_RD_HALT_NO_WRITES); + val &= ~PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN; + writel_relaxed(val, pcie_ep->parf + PARF_AXI_MSTR_RD_HALT_NO_WRITES); + + /* Write after write halt */ + val = readl_relaxed(pcie_ep->parf + PARF_AXI_MSTR_WR_ADDR_HALT); + val |= PARF_AXI_MSTR_WR_ADDR_HALT_EN; + writel_relaxed(val, pcie_ep->parf + PARF_AXI_MSTR_WR_ADDR_HALT); + + /* Q2A flush disable */ + val = readl_relaxed(pcie_ep->parf + PARF_Q2A_FLUSH); + val &= ~PARF_Q2A_FLUSH_EN; + writel_relaxed(val, pcie_ep->parf + PARF_Q2A_FLUSH); + + /* Disable DBI Wakeup, core clock CGC and enable AUX power */ + val = readl_relaxed(pcie_ep->parf + PARF_SYS_CTRL); + val |= PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE | + PARF_SYS_CTRL_CORE_CLK_CGC_DIS | + PARF_SYS_CTRL_AUX_PWR_DET; + writel_relaxed(val, pcie_ep->parf + PARF_SYS_CTRL); + + /* Disable the debouncers */ + val = readl_relaxed(pcie_ep->parf + PARF_DB_CTRL); + val |= PARF_DB_CTRL_INSR_DBNCR_BLOCK | PARF_DB_CTRL_RMVL_DBNCR_BLOCK | + PARF_DB_CTRL_DBI_WKP_BLOCK | PARF_DB_CTRL_SLV_WKP_BLOCK | + PARF_DB_CTRL_MST_WKP_BLOCK; + writel_relaxed(val, pcie_ep->parf + PARF_DB_CTRL); + + /* Request to exit from L1SS for MSI and LTR MSG */ + val = readl_relaxed(pcie_ep->parf + PARF_CFG_BITS); + val |= PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN; + writel_relaxed(val, pcie_ep->parf + PARF_CFG_BITS); + + dw_pcie_dbi_ro_wr_en(pci); + + /* Set the L0s Exit Latency to 2us-4us = 0x6 */ + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + val = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_L0SEL; + val |= FIELD_PREP(PCI_EXP_LNKCAP_L0SEL, 0x6); + dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, val); + + /* Set the L1 Exit Latency to be 32us-64 us = 0x6 */ + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + val = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_L1EL; + val |= FIELD_PREP(PCI_EXP_LNKCAP_L1EL, 0x6); + dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, val); + + dw_pcie_dbi_ro_wr_dis(pci); + + writel_relaxed(0, pcie_ep->parf + PARF_INT_ALL_MASK); + val = PARF_INT_ALL_LINK_DOWN | PARF_INT_ALL_BME | + PARF_INT_ALL_PM_TURNOFF | PARF_INT_ALL_DSTATE_CHANGE | + PARF_INT_ALL_LINK_UP; + writel_relaxed(val, pcie_ep->parf + PARF_INT_ALL_MASK); + + ret = dw_pcie_ep_init_complete(&pcie_ep->pci.ep); + if (ret) { + dev_err(dev, "Failed to complete initialization: %d\n", ret); + goto err_phy_power_off; + } + + /* + * The physical address of the MMIO region which is exposed as the BAR + * should be written to MHI BASE registers. 
+ */ + writel_relaxed(pcie_ep->mmio_res->start, + pcie_ep->parf + PARF_MHI_BASE_ADDR_LOWER); + writel_relaxed(0, pcie_ep->parf + PARF_MHI_BASE_ADDR_UPPER); + + dw_pcie_ep_init_notify(&pcie_ep->pci.ep); + + /* Enable LTSSM */ + val = readl_relaxed(pcie_ep->parf + PARF_LTSSM); + val |= BIT(8); + writel_relaxed(val, pcie_ep->parf + PARF_LTSSM); + + return 0; + +err_phy_power_off: + phy_power_off(pcie_ep->phy); +err_phy_exit: + phy_exit(pcie_ep->phy); +err_disable_clk: + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + + return ret; +} + +static void qcom_pcie_perst_assert(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + struct device *dev = pci->dev; + + if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) { + dev_dbg(dev, "Link is already disabled\n"); + return; + } + + phy_power_off(pcie_ep->phy); + phy_exit(pcie_ep->phy); + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED; +} + +/* Common DWC controller ops */ +static const struct dw_pcie_ops pci_ops = { + .link_up = qcom_pcie_dw_link_up, + .start_link = qcom_pcie_dw_start_link, + .stop_link = qcom_pcie_dw_stop_link, +}; + +static int qcom_pcie_ep_get_io_resources(struct platform_device *pdev, + struct qcom_pcie_ep *pcie_ep) +{ + struct device *dev = &pdev->dev; + struct dw_pcie *pci = &pcie_ep->pci; + struct device_node *syscon; + struct resource *res; + int ret; + + pcie_ep->parf = devm_platform_ioremap_resource_byname(pdev, "parf"); + if (IS_ERR(pcie_ep->parf)) + return PTR_ERR(pcie_ep->parf); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pci->dbi_base = devm_pci_remap_cfg_resource(dev, res); + if (IS_ERR(pci->dbi_base)) + return PTR_ERR(pci->dbi_base); + pci->dbi_base2 = pci->dbi_base; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi"); + pcie_ep->elbi = devm_pci_remap_cfg_resource(dev, res); + if (IS_ERR(pcie_ep->elbi)) + return PTR_ERR(pcie_ep->elbi); + + pcie_ep->mmio_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "mmio"); + + syscon = of_parse_phandle(dev->of_node, "qcom,perst-regs", 0); + if (!syscon) { + dev_err(dev, "Failed to parse qcom,perst-regs\n"); + return -EINVAL; + } + + pcie_ep->perst_map = syscon_node_to_regmap(syscon); + of_node_put(syscon); + if (IS_ERR(pcie_ep->perst_map)) + return PTR_ERR(pcie_ep->perst_map); + + ret = of_property_read_u32_index(dev->of_node, "qcom,perst-regs", + 1, &pcie_ep->perst_en); + if (ret < 0) { + dev_err(dev, "No Perst Enable offset in syscon\n"); + return ret; + } + + ret = of_property_read_u32_index(dev->of_node, "qcom,perst-regs", + 2, &pcie_ep->perst_sep_en); + if (ret < 0) { + dev_err(dev, "No Perst Separation Enable offset in syscon\n"); + return ret; + } + + return 0; +} + +static int qcom_pcie_ep_get_resources(struct platform_device *pdev, + struct qcom_pcie_ep *pcie_ep) +{ + struct device *dev = &pdev->dev; + int ret; + + ret = qcom_pcie_ep_get_io_resources(pdev, pcie_ep); + if (ret) { + dev_err(&pdev->dev, "Failed to get io resources %d\n", ret); + return ret; + } + + ret = devm_clk_bulk_get(dev, ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + if (ret) + return ret; + + pcie_ep->core_reset = devm_reset_control_get_exclusive(dev, "core"); + if (IS_ERR(pcie_ep->core_reset)) + return PTR_ERR(pcie_ep->core_reset); + + pcie_ep->reset = devm_gpiod_get(dev, "reset", GPIOD_IN); + if (IS_ERR(pcie_ep->reset)) + return PTR_ERR(pcie_ep->reset); + + pcie_ep->wake = 
devm_gpiod_get_optional(dev, "wake", GPIOD_OUT_LOW); + if (IS_ERR(pcie_ep->wake)) + return PTR_ERR(pcie_ep->wake); + + pcie_ep->phy = devm_phy_optional_get(&pdev->dev, "pciephy"); + if (IS_ERR(pcie_ep->phy)) + ret = PTR_ERR(pcie_ep->phy); + + return ret; +} + +/* TODO: Notify clients about PCIe state change */ +static irqreturn_t qcom_pcie_ep_global_irq_thread(int irq, void *data) +{ + struct qcom_pcie_ep *pcie_ep = data; + struct dw_pcie *pci = &pcie_ep->pci; + struct device *dev = pci->dev; + u32 status = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_STATUS); + u32 mask = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_MASK); + u32 dstate, val; + + writel_relaxed(status, pcie_ep->parf + PARF_INT_ALL_CLEAR); + status &= mask; + + if (FIELD_GET(PARF_INT_ALL_LINK_DOWN, status)) { + dev_dbg(dev, "Received Linkdown event\n"); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_DOWN; + } else if (FIELD_GET(PARF_INT_ALL_BME, status)) { + dev_dbg(dev, "Received BME event. Link is enabled!\n"); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_ENABLED; + } else if (FIELD_GET(PARF_INT_ALL_PM_TURNOFF, status)) { + dev_dbg(dev, "Received PM Turn-off event! Entering L23\n"); + val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL); + val |= PARF_PM_CTRL_READY_ENTR_L23; + writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL); + } else if (FIELD_GET(PARF_INT_ALL_DSTATE_CHANGE, status)) { + dstate = dw_pcie_readl_dbi(pci, DBI_CON_STATUS) & + DBI_CON_STATUS_POWER_STATE_MASK; + dev_dbg(dev, "Received D%d state event\n", dstate); + if (dstate == 3) { + val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL); + val |= PARF_PM_CTRL_REQ_EXIT_L1; + writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL); + } + } else if (FIELD_GET(PARF_INT_ALL_LINK_UP, status)) { + dev_dbg(dev, "Received Linkup event. Enumeration complete!\n"); + dw_pcie_ep_linkup(&pci->ep); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_UP; + } else { + dev_dbg(dev, "Received unknown event: %d\n", status); + } + + return IRQ_HANDLED; +} + +static irqreturn_t qcom_pcie_ep_perst_irq_thread(int irq, void *data) +{ + struct qcom_pcie_ep *pcie_ep = data; + struct dw_pcie *pci = &pcie_ep->pci; + struct device *dev = pci->dev; + u32 perst; + + perst = gpiod_get_value(pcie_ep->reset); + if (perst) { + dev_dbg(dev, "PERST asserted by host. Shutting down the PCIe link!\n"); + qcom_pcie_perst_assert(pci); + } else { + dev_dbg(dev, "PERST de-asserted by host. Starting link training!\n"); + qcom_pcie_perst_deassert(pci); + } + + irq_set_irq_type(gpiod_to_irq(pcie_ep->reset), + (perst ? 
IRQF_TRIGGER_HIGH : IRQF_TRIGGER_LOW)); + + return IRQ_HANDLED; +} + +static int qcom_pcie_ep_enable_irq_resources(struct platform_device *pdev, + struct qcom_pcie_ep *pcie_ep) +{ + int irq, ret; + + irq = platform_get_irq_byname(pdev, "global"); + if (irq < 0) { + dev_err(&pdev->dev, "Failed to get Global IRQ\n"); + return irq; + } + + ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + qcom_pcie_ep_global_irq_thread, + IRQF_ONESHOT, + "global_irq", pcie_ep); + if (ret) { + dev_err(&pdev->dev, "Failed to request Global IRQ\n"); + return ret; + } + + pcie_ep->perst_irq = gpiod_to_irq(pcie_ep->reset); + irq_set_status_flags(pcie_ep->perst_irq, IRQ_NOAUTOEN); + ret = devm_request_threaded_irq(&pdev->dev, pcie_ep->perst_irq, NULL, + qcom_pcie_ep_perst_irq_thread, + IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + "perst_irq", pcie_ep); + if (ret) { + dev_err(&pdev->dev, "Failed to request PERST IRQ\n"); + disable_irq(irq); + return ret; + } + + return 0; +} + +static int qcom_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no, + enum pci_epc_irq_type type, u16 interrupt_num) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + + switch (type) { + case PCI_EPC_IRQ_LEGACY: + return dw_pcie_ep_raise_legacy_irq(ep, func_no); + case PCI_EPC_IRQ_MSI: + return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num); + default: + dev_err(pci->dev, "Unknown IRQ type\n"); + return -EINVAL; + } +} + +static const struct pci_epc_features qcom_pcie_epc_features = { + .linkup_notifier = true, + .core_init_notifier = true, + .msi_capable = true, + .msix_capable = false, +}; + +static const struct pci_epc_features * +qcom_pcie_epc_get_features(struct dw_pcie_ep *pci_ep) +{ + return &qcom_pcie_epc_features; +} + +static void qcom_pcie_ep_init(struct dw_pcie_ep *ep) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + enum pci_barno bar; + + for (bar = BAR_0; bar <= BAR_5; bar++) + dw_pcie_ep_reset_bar(pci, bar); +} + +static struct dw_pcie_ep_ops pci_ep_ops = { + .ep_init = qcom_pcie_ep_init, + .raise_irq = qcom_pcie_ep_raise_irq, + .get_features = qcom_pcie_epc_get_features, +}; + +static int qcom_pcie_ep_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct qcom_pcie_ep *pcie_ep; + int ret; + + pcie_ep = devm_kzalloc(dev, sizeof(*pcie_ep), GFP_KERNEL); + if (!pcie_ep) + return -ENOMEM; + + pcie_ep->pci.dev = dev; + pcie_ep->pci.ops = &pci_ops; + pcie_ep->pci.ep.ops = &pci_ep_ops; + platform_set_drvdata(pdev, pcie_ep); + + ret = qcom_pcie_ep_get_resources(pdev, pcie_ep); + if (ret) + return ret; + + ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + if (ret) + return ret; + + ret = qcom_pcie_ep_core_reset(pcie_ep); + if (ret) + goto err_disable_clk; + + ret = phy_init(pcie_ep->phy); + if (ret) + goto err_disable_clk; + + /* PHY needs to be powered on for dw_pcie_ep_init() */ + ret = phy_power_on(pcie_ep->phy); + if (ret) + goto err_phy_exit; + + ret = dw_pcie_ep_init(&pcie_ep->pci.ep); + if (ret) { + dev_err(dev, "Failed to initialize endpoint: %d\n", ret); + goto err_phy_power_off; + } + + ret = qcom_pcie_ep_enable_irq_resources(pdev, pcie_ep); + if (ret) + goto err_phy_power_off; + + return 0; + +err_phy_power_off: + phy_power_off(pcie_ep->phy); +err_phy_exit: + phy_exit(pcie_ep->phy); +err_disable_clk: + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + + return ret; +} + +static int qcom_pcie_ep_remove(struct platform_device *pdev) +{ + struct qcom_pcie_ep *pcie_ep = platform_get_drvdata(pdev); + + if 
(pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) + return 0; + + phy_power_off(pcie_ep->phy); + phy_exit(pcie_ep->phy); + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + + return 0; +} + +static const struct of_device_id qcom_pcie_ep_match[] = { + { .compatible = "qcom,sdx55-pcie-ep", }, + { } +}; + +static struct platform_driver qcom_pcie_ep_driver = { + .probe = qcom_pcie_ep_probe, + .remove = qcom_pcie_ep_remove, + .driver = { + .name = "qcom-pcie-ep", + .of_match_table = qcom_pcie_ep_match, + }, +}; +builtin_platform_driver(qcom_pcie_ep_driver); + +MODULE_AUTHOR("Siddartha Mohanadoss "); +MODULE_AUTHOR("Manivannan Sadhasivam "); +MODULE_DESCRIPTION("Qualcomm PCIe Endpoint controller driver"); +MODULE_LICENSE("GPL v2"); From 79352928a6666a5093dda37db4c909b6a37edf98 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 20 Sep 2021 12:29:46 +0530 Subject: [PATCH 078/512] MAINTAINERS: Add entry for Qualcomm PCIe Endpoint driver and binding Add MAINTAINERS entry for Qualcomm PCIe Endpoint driver and its devicetree binding. Also fix the PCIe RC entry to cover only the RC driver. Link: https://lore.kernel.org/r/20210920065946.15090-4-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- MAINTAINERS | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ca6d6fde85cf..4b3ca1155908 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14596,7 +14596,15 @@ M: Stanimir Varbanov L: linux-pci@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained -F: drivers/pci/controller/dwc/*qcom* +F: drivers/pci/controller/dwc/pcie-qcom.c + +PCIE ENDPOINT DRIVER FOR QUALCOMM +M: Manivannan Sadhasivam +L: linux-pci@vger.kernel.org +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml +F: drivers/pci/controller/dwc/pcie-qcom-ep.c PCIE DRIVER FOR ROCKCHIP M: Shawn Lin From 7e919677bb392c765e0f86fda63191e517c0bd1f Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 23 Aug 2021 08:49:57 -0700 Subject: [PATCH 079/512] PCI: dwc: Perform host_init() before registering msi On the Qualcomm sc8180x platform the bootloader does something related to PCI that leaves a pending "msi" interrupt, which with the current ordering often fires before init has a chance to enable the clocks that are necessary for the interrupt handler to access the hardware. Move the host_init() call before the registration of the "msi" interrupt handler to ensure the host driver has a chance to enable the clocks. The assignment of the bridge's ops and child_ops is moved along, because at least the TI Keystone driver overwrites these in its host_init callback. 
Link: https://lore.kernel.org/r/20210823154958.305677-1-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../pci/controller/dwc/pcie-designware-host.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index d1d9b8344ec9..f4755f3a03be 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -335,6 +335,16 @@ int dw_pcie_host_init(struct pcie_port *pp) if (pci->link_gen < 1) pci->link_gen = of_pci_get_max_link_speed(np); + /* Set default bus ops */ + bridge->ops = &dw_pcie_ops; + bridge->child_ops = &dw_child_pcie_ops; + + if (pp->ops->host_init) { + ret = pp->ops->host_init(pp); + if (ret) + return ret; + } + if (pci_msi_enabled()) { pp->has_msi_ctrl = !(pp->ops->msi_host_init || of_property_read_bool(np, "msi-parent") || @@ -388,15 +398,6 @@ int dw_pcie_host_init(struct pcie_port *pp) } } - /* Set default bus ops */ - bridge->ops = &dw_pcie_ops; - bridge->child_ops = &dw_child_pcie_ops; - - if (pp->ops->host_init) { - ret = pp->ops->host_init(pp); - if (ret) - goto err_free_msi; - } dw_pcie_iatu_detect(pci); dw_pcie_setup_rc(pp); From 2565e5b69c44b4e42469afea3cc5a97e74d1ed45 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Wed, 1 Sep 2021 20:40:47 +0800 Subject: [PATCH 080/512] PCI: vmd: Do not disable MSI-X remapping if interrupt remapping is enabled by IOMMU When enabling VMD in BIOS setup (Ice Lake Processor: Whitley platform), the host OS cannot boot successfully with the following error message: nvme nvme0: I/O 12 QID 0 timeout, completion polled nvme nvme0: Shutdown timeout set to 6 seconds DMAR: DRHD: handling fault status reg 2 DMAR: [INTR-REMAP] Request device [0x00:0x00.5] fault index 0xa00 [fault reason 0x25] Blocked a compatibility format interrupt request The request device is the VMD controller: # lspci -s 0000:00.5 -nn 0000:00:00.5 RAID bus controller [0104]: Intel Corporation Volume Management Device NVMe RAID Controller [8086:28c0] (rev 04) `git bisect` points to this offending commit ee81ee84f873 ("PCI: vmd: Disable MSI-X remapping when possible"), which disables VMD MSI remapping. The IOMMU hardware blocks the compatibility format interrupt request because Interrupt Remapping Enable Status (IRES) and Extended Interrupt Mode Enable (EIME) are enabled. Please refer to section "5.1.4 Interrupt-Remapping Hardware Operation" in Intel VT-d spec. To fix the issue, VMD driver still enables the interrupt remapping irrespective of VMD_FEAT_CAN_BYPASS_MSI_REMAP if the IOMMU subsystem enables the interrupt remapping. Test configuration is shown as follows: * Two VMD controllers 1. 8086:28c0 (Whitley's VMD) 2. 8086:201d (Purley's VMD: The issue does not appear in this controller. Just make sure if any side effect occurs.) 
* w/wo intremap=off Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214219 Link: https://lore.kernel.org/r/20210901124047.1615-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Jon Derrick Cc: Jon Derrick Cc: Nirmal Patel Cc: Joerg Roedel --- drivers/pci/controller/vmd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 1b3e94a96414..1979fc5eac27 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -765,7 +766,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * acceptable because the guest is usually CPU-limited and MSI * remapping doesn't become a performance bottleneck. */ - if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + if (iommu_capable(vmd->dev->dev.bus, IOMMU_CAP_INTR_REMAP) || + !(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || offset[0] || offset[1]) { ret = vmd_alloc_irqs(vmd); if (ret) From f18312084300598544529510bcfab5f3e795e36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 8 Oct 2021 22:27:30 +0000 Subject: [PATCH 081/512] PCI: hv: Remove unnecessary use of %hx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "dom_req" is a u16 but varargs automatically promotes it to int, so there's no point in using the %h modifier. Drop it. See cbacb5ab0aa0 ("docs: printk-formats: Stop encouraging use of unnecessary %h[xudi] and %hh[xudi]") and 70eb2275ff8e ("checkpatch: add warning for unnecessary use of %h[xudi] and %hh[xudi]"). Link: https://lore.kernel.org/r/20211008222732.2868493-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index eaec915ffe62..8459f857ad9e 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3126,14 +3126,14 @@ static int hv_pci_probe(struct hv_device *hdev, if (dom == HVPCI_DOM_INVALID) { dev_err(&hdev->device, - "Unable to use dom# 0x%hx or other numbers", dom_req); + "Unable to use dom# 0x%x or other numbers", dom_req); ret = -EINVAL; goto free_bus; } if (dom != dom_req) dev_info(&hdev->device, - "PCI dom# 0x%hx has collision, using 0x%hx", + "PCI dom# 0x%x has collision, using 0x%x", dom_req, dom); hbus->bridge->domain_nr = dom; From 357df2fc00661e0993e0e40d1a56bdd967f45c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 8 Oct 2021 22:27:32 +0000 Subject: [PATCH 082/512] PCI: Use unsigned to match sscanf("%x") in pci_dev_str_match_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cppcheck warns that pci_dev_str_match_path() passes pointers to signed ints to sscanf("%x"), which expects pointers to *unsigned* ints: invalidScanfArgType_int drivers/pci/pci.c:312 %x in format string (no. 2) requires 'unsigned int *' but the argument type is 'signed int *'. Declare the variables as unsigned to avoid this issue. 
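As a small illustration (sketch only; the actual change is the one-line
declaration shown in the diff below), the parse in pci_dev_str_match_path()
is of this general shape:

    unsigned int seg, bus, slot, func;
    ...
    ret = sscanf(wpath, "%x:%x:%x.%x%c", &seg, &bus, &slot, &func, &end);

With the variables declared unsigned, each "%x" conversion writes through an
unsigned int pointer, which is what sscanf() expects.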
[bhelgaas: commit log] Link: https://lore.kernel.org/r/20211008222732.2868493-3-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ce2ab62b64cf..7998b65e9ae5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -269,7 +269,7 @@ static int pci_dev_str_match_path(struct pci_dev *dev, const char *path, const char **endptr) { int ret; - int seg, bus, slot, func; + unsigned int seg, bus, slot, func; char *wpath, *p; char end; From 8e9028b3790ddb02c407e1a4da0ab7cf82790e7e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 12 Oct 2021 15:42:59 -0500 Subject: [PATCH 083/512] PCI: Return NULL for to_pci_driver(NULL) to_pci_driver() takes a pointer to a struct device_driver and uses container_of() to find the struct pci_driver that contains it. If given a NULL pointer to a struct device_driver, return a NULL pci_driver pointer instead of applying container_of() to NULL. This simplifies callers that would otherwise have to check for a NULL pointer first. Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..a2d62165a7f7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -900,7 +900,10 @@ struct pci_driver { struct pci_dynids dynids; }; -#define to_pci_driver(drv) container_of(drv, struct pci_driver, driver) +static inline struct pci_driver *to_pci_driver(struct device_driver *drv) +{ + return drv ? container_of(drv, struct pci_driver, driver) : NULL; +} /** * PCI_DEVICE - macro used to describe a specific PCI device From 097d9d414433315122f759ee6c2d8a7417a8ff0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:25 +0200 Subject: [PATCH 084/512] PCI: Drop pci_device_remove() test of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the driver core calls pci_device_remove(), there is a driver bound to the device, so pci_dev->driver is never NULL. Remove the unnecessary test of pci_dev->driver. 
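The guarantee comes from the driver core; an abridged sketch of the unbind
path (not part of this patch, shown only for context):

    /* drivers/base/dd.c, __device_release_driver(), abridged */
    drv = dev->driver;
    if (!drv)
        return;
    ...
    if (dev->bus && dev->bus->remove)
        dev->bus->remove(dev);      /* i.e. pci_device_remove() */

so the PCI bus remove callback only runs while a driver is still bound to the
device.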
Link: https://lore.kernel.org/r/20211004125935.2300113-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/pci-driver.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 2761ab86490d..8fb6418c93e8 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -459,16 +459,14 @@ static void pci_device_remove(struct device *dev) struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; - if (drv) { - if (drv->remove) { - pm_runtime_get_sync(dev); - drv->remove(pci_dev); - pm_runtime_put_noidle(dev); - } - pcibios_free_irq(pci_dev); - pci_dev->driver = NULL; - pci_iov_remove(pci_dev); + if (drv->remove) { + pm_runtime_get_sync(dev); + drv->remove(pci_dev); + pm_runtime_put_noidle(dev); } + pcibios_free_irq(pci_dev); + pci_dev->driver = NULL; + pci_iov_remove(pci_dev); /* Undo the runtime PM settings in local_pci_probe() */ pm_runtime_put_sync(dev); From ae232f0970ea0c1d2b1e95922000afbe58af039f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:26 +0200 Subject: [PATCH 085/512] PCI: Drop pci_device_probe() test of !pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the device core calls the .probe() callback for a device, the device is never bound, so pci_dev->driver is always NULL. Remove the unnecessary test of !pci_dev->driver. Link: https://lore.kernel.org/r/20211004125935.2300113-3-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/pci-driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 8fb6418c93e8..50449ec622a3 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -397,7 +397,7 @@ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) const struct pci_device_id *id; int error = 0; - if (!pci_dev->driver && drv->probe) { + if (drv->probe) { error = -ENODEV; id = pci_match_device(drv, pci_dev); From 171d149ce8d11f7fafbd4f338f14d472a153c3d4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 12 Oct 2021 17:20:06 -0500 Subject: [PATCH 086/512] PCI/ERR: Factor out common dev->driver expressions Save the struct pci_driver pointer from pdev->driver instead of repeating it several times. No functional change. 
Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/err.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index b576aa890c76..0c5a143025af 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -49,14 +49,16 @@ static int report_error_detected(struct pci_dev *dev, pci_channel_state_t state, enum pci_ers_result *result) { + struct pci_driver *pdrv; pci_ers_result_t vote; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); + pdrv = dev->driver; if (!pci_dev_set_io_state(dev, state) || - !dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->error_detected) { + !pdrv || + !pdrv->err_handler || + !pdrv->err_handler->error_detected) { /* * If any device in the subtree does not have an error_detected * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent @@ -70,7 +72,7 @@ static int report_error_detected(struct pci_dev *dev, vote = PCI_ERS_RESULT_NONE; } } else { - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; vote = err_handler->error_detected(dev, state); } pci_uevent_ers(dev, vote); @@ -91,16 +93,18 @@ static int report_normal_detected(struct pci_dev *dev, void *data) static int report_mmio_enabled(struct pci_dev *dev, void *data) { + struct pci_driver *pdrv; pci_ers_result_t vote, *result = data; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - if (!dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->mmio_enabled) + pdrv = dev->driver; + if (!pdrv || + !pdrv->err_handler || + !pdrv->err_handler->mmio_enabled) goto out; - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; vote = err_handler->mmio_enabled(dev); *result = merge_result(*result, vote); out: @@ -110,16 +114,18 @@ out: static int report_slot_reset(struct pci_dev *dev, void *data) { + struct pci_driver *pdrv; pci_ers_result_t vote, *result = data; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - if (!dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->slot_reset) + pdrv = dev->driver; + if (!pdrv || + !pdrv->err_handler || + !pdrv->err_handler->slot_reset) goto out; - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; vote = err_handler->slot_reset(dev); *result = merge_result(*result, vote); out: @@ -129,16 +135,18 @@ out: static int report_resume(struct pci_dev *dev, void *data) { + struct pci_driver *pdrv; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); + pdrv = dev->driver; if (!pci_dev_set_io_state(dev, pci_channel_io_normal) || - !dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->resume) + !pdrv || + !pdrv->err_handler || + !pdrv->err_handler->resume) goto out; - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; err_handler->resume(dev); out: pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); From a534ff3f4d6082f5ac228194accb7b414f0d6f0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:32 +0200 Subject: [PATCH 087/512] scsi: message: fusion: Remove unused mpt_pci driver .probe() 'id' parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only two drivers don't make use of the id parameter, so drop it. This is a step toward removing pci_dev->driver. 
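The net effect on the callback type is simply (sketch; full hunks below):

    struct mpt_pci_driver {
        int  (*probe)(struct pci_dev *dev);
        void (*remove)(struct pci_dev *dev);
    };

and mpt_device_driver_register() no longer has to fish the pci_device_id out
of ioc->pcidev->driver->id_table just to pass it along.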
Link: https://lore.kernel.org/r/20211004125935.2300113-9-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Acked-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 7 ++----- drivers/message/fusion/mptbase.h | 2 +- drivers/message/fusion/mptctl.c | 4 ++-- drivers/message/fusion/mptlan.c | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 7f7abc9069f7..b94d5e4fdc23 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -829,7 +829,6 @@ int mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, u8 cb_idx) { MPT_ADAPTER *ioc; - const struct pci_device_id *id; if (!cb_idx || cb_idx >= MPT_MAX_PROTOCOL_DRIVERS) return -EINVAL; @@ -838,10 +837,8 @@ mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, u8 cb_idx) /* call per pci device probe entry point */ list_for_each_entry(ioc, &ioc_list, list) { - id = ioc->pcidev->driver ? - ioc->pcidev->driver->id_table : NULL; if (dd_cbfunc->probe) - dd_cbfunc->probe(ioc->pcidev, id); + dd_cbfunc->probe(ioc->pcidev); } return 0; @@ -2032,7 +2029,7 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id) for(cb_idx = 0; cb_idx < MPT_MAX_PROTOCOL_DRIVERS; cb_idx++) { if(MptDeviceDriverHandlers[cb_idx] && MptDeviceDriverHandlers[cb_idx]->probe) { - MptDeviceDriverHandlers[cb_idx]->probe(pdev,id); + MptDeviceDriverHandlers[cb_idx]->probe(pdev); } } diff --git a/drivers/message/fusion/mptbase.h b/drivers/message/fusion/mptbase.h index b9e0376be723..4bd0682c65d3 100644 --- a/drivers/message/fusion/mptbase.h +++ b/drivers/message/fusion/mptbase.h @@ -257,7 +257,7 @@ typedef enum { } MPT_DRIVER_CLASS; struct mpt_pci_driver{ - int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); + int (*probe) (struct pci_dev *dev); void (*remove) (struct pci_dev *dev); }; diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 72025996cd70..ae433c150b37 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -114,7 +114,7 @@ static int mptctl_do_reset(MPT_ADAPTER *iocp, unsigned long arg); static int mptctl_hp_hostinfo(MPT_ADAPTER *iocp, unsigned long arg, unsigned int cmd); static int mptctl_hp_targetinfo(MPT_ADAPTER *iocp, unsigned long arg); -static int mptctl_probe(struct pci_dev *, const struct pci_device_id *); +static int mptctl_probe(struct pci_dev *); static void mptctl_remove(struct pci_dev *); #ifdef CONFIG_COMPAT @@ -2838,7 +2838,7 @@ static long compat_mpctl_ioctl(struct file *f, unsigned int cmd, unsigned long a */ static int -mptctl_probe(struct pci_dev *pdev, const struct pci_device_id *id) +mptctl_probe(struct pci_dev *pdev) { MPT_ADAPTER *ioc = pci_get_drvdata(pdev); diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index 3261cac762de..7c1af5e6eb0b 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -1377,7 +1377,7 @@ mpt_register_lan_device (MPT_ADAPTER *mpt_dev, int pnum) } static int -mptlan_probe(struct pci_dev *pdev, const struct pci_device_id *id) +mptlan_probe(struct pci_dev *pdev) { MPT_ADAPTER *ioc = pci_get_drvdata(pdev); struct net_device *dev; From 8f5c335e34b5ae2dda6db2af300381fc9fb53fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:33 +0200 Subject: [PATCH 088/512] crypto: qat - simplify adf_enable_aer() MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A struct pci_driver is shared across all device instances, so assigning pci_driver.err_handler once per device isn't really sensible. Set adf_driver.err_handler statically instead of in adf_enable_aer(). This removes a use of pci_dev->driver, which is a step toward removing pci_dev->driver altogether. Since adf_enable_aer() returns zero unconditionally, make it a void function. Link: https://lore.kernel.org/r/20211004125935.2300113-10-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/crypto/qat/qat_4xxx/adf_drv.c | 7 ++----- drivers/crypto/qat/qat_c3xxx/adf_drv.c | 7 ++----- drivers/crypto/qat/qat_c62x/adf_drv.c | 7 ++----- drivers/crypto/qat/qat_common/adf_aer.c | 10 +++------- drivers/crypto/qat/qat_common/adf_common_drv.h | 3 ++- drivers/crypto/qat/qat_dh895xcc/adf_drv.c | 7 ++----- 6 files changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/crypto/qat/qat_4xxx/adf_drv.c b/drivers/crypto/qat/qat_4xxx/adf_drv.c index 359fb7989dfb..71ef065914b2 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_4xxx/adf_drv.c @@ -247,11 +247,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer.\n"); - ret = -EFAULT; - goto out_err; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state.\n"); @@ -304,6 +300,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; module_pci_driver(adf_driver); diff --git a/drivers/crypto/qat/qat_c3xxx/adf_drv.c b/drivers/crypto/qat/qat_c3xxx/adf_drv.c index cc6e75dc60de..2aef0bb791df 100644 --- a/drivers/crypto/qat/qat_c3xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_c3xxx/adf_drv.c @@ -33,6 +33,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev) @@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer\n"); - ret = -EFAULT; - goto out_err_free_reg; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state\n"); diff --git a/drivers/crypto/qat/qat_c62x/adf_drv.c b/drivers/crypto/qat/qat_c62x/adf_drv.c index bf251dfe74b3..56163083f161 100644 --- a/drivers/crypto/qat/qat_c62x/adf_drv.c +++ b/drivers/crypto/qat/qat_c62x/adf_drv.c @@ -33,6 +33,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev) @@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer\n"); - ret = -EFAULT; - goto out_err_free_reg; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state\n"); diff --git a/drivers/crypto/qat/qat_common/adf_aer.c b/drivers/crypto/qat/qat_common/adf_aer.c index ed3e40bc56eb..fe9bb2f3536a 100644 --- 
a/drivers/crypto/qat/qat_common/adf_aer.c +++ b/drivers/crypto/qat/qat_common/adf_aer.c @@ -166,11 +166,12 @@ static void adf_resume(struct pci_dev *pdev) dev_info(&pdev->dev, "Device is up and running\n"); } -static const struct pci_error_handlers adf_err_handler = { +const struct pci_error_handlers adf_err_handler = { .error_detected = adf_error_detected, .slot_reset = adf_slot_reset, .resume = adf_resume, }; +EXPORT_SYMBOL_GPL(adf_err_handler); /** * adf_enable_aer() - Enable Advance Error Reporting for acceleration device @@ -179,17 +180,12 @@ static const struct pci_error_handlers adf_err_handler = { * Function enables PCI Advance Error Reporting for the * QAT acceleration device accel_dev. * To be used by QAT device specific drivers. - * - * Return: 0 on success, error code otherwise. */ -int adf_enable_aer(struct adf_accel_dev *accel_dev) +void adf_enable_aer(struct adf_accel_dev *accel_dev) { struct pci_dev *pdev = accel_to_pci_dev(accel_dev); - struct pci_driver *pdrv = pdev->driver; - pdrv->err_handler = &adf_err_handler; pci_enable_pcie_error_reporting(pdev); - return 0; } EXPORT_SYMBOL_GPL(adf_enable_aer); diff --git a/drivers/crypto/qat/qat_common/adf_common_drv.h b/drivers/crypto/qat/qat_common/adf_common_drv.h index 4261749fae8d..e4c24be212ff 100644 --- a/drivers/crypto/qat/qat_common/adf_common_drv.h +++ b/drivers/crypto/qat/qat_common/adf_common_drv.h @@ -95,7 +95,8 @@ void adf_ae_fw_release(struct adf_accel_dev *accel_dev); int adf_ae_start(struct adf_accel_dev *accel_dev); int adf_ae_stop(struct adf_accel_dev *accel_dev); -int adf_enable_aer(struct adf_accel_dev *accel_dev); +extern const struct pci_error_handlers adf_err_handler; +void adf_enable_aer(struct adf_accel_dev *accel_dev); void adf_disable_aer(struct adf_accel_dev *accel_dev); void adf_reset_sbr(struct adf_accel_dev *accel_dev); void adf_reset_flr(struct adf_accel_dev *accel_dev); diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c index 3976a81bd99b..acca56752aa0 100644 --- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c +++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c @@ -33,6 +33,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev) @@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer\n"); - ret = -EFAULT; - goto out_err_free_reg; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state\n"); From 823c523eb2e4f2fe37f898861c28b3d8ef22f5ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:28 +0200 Subject: [PATCH 089/512] bcma: simplify reference to driver name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When bcma_host_pci_probe() is called, the PCI driver core has already assigned the device's driver in local_pci_probe(). So dev->driver is always true, and here it points to bcma_pci_bridge_driver, which has .name set to "bcma-pci-bridge". Simplify accordingly. 
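For reference, the driver this probe function belongs to (same file,
abbreviated here) is:

    static struct pci_driver bcma_pci_bridge_driver = {
        .name  = "bcma-pci-bridge",
        .probe = bcma_host_pci_probe,
        ...
    };

so passing the literal string to pci_request_regions() is equivalent to the
old dev->driver->name lookup.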
Link: https://lore.kernel.org/r/20211004125935.2300113-5-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/bcma/host_pci.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index 69c10a7b7c61..960632197b05 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -162,7 +162,6 @@ static int bcma_host_pci_probe(struct pci_dev *dev, { struct bcma_bus *bus; int err = -ENOMEM; - const char *name; u32 val; /* Alloc */ @@ -175,10 +174,7 @@ static int bcma_host_pci_probe(struct pci_dev *dev, if (err) goto err_kfree_bus; - name = dev_name(&dev->dev); - if (dev->driver && dev->driver->name) - name = dev->driver->name; - err = pci_request_regions(dev, name); + err = pci_request_regions(dev, "bcma-pci-bridge"); if (err) goto err_pci_disable; pci_set_master(dev); From 7c3b2c933a916e32e9c8b56074a47c1b3f3e7cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:30 +0200 Subject: [PATCH 090/512] ssb: Use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All drivers that use ssb_pcihost_probe(), i.e., b43_pci_bridge_driver and b44_pci_driver, set the pci_driver.name, and __pci_register_driver() sets the struct driver.name member to the same value. Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. Link: https://lore.kernel.org/r/20211004125935.2300113-7-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Acked-by: Michael Büsch --- drivers/ssb/pcihost_wrapper.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/ssb/pcihost_wrapper.c b/drivers/ssb/pcihost_wrapper.c index 410215c16920..dd70fd41c77d 100644 --- a/drivers/ssb/pcihost_wrapper.c +++ b/drivers/ssb/pcihost_wrapper.c @@ -69,7 +69,6 @@ static int ssb_pcihost_probe(struct pci_dev *dev, { struct ssb_bus *ssb; int err = -ENOMEM; - const char *name; u32 val; ssb = kzalloc(sizeof(*ssb), GFP_KERNEL); @@ -78,10 +77,7 @@ static int ssb_pcihost_probe(struct pci_dev *dev, err = pci_enable_device(dev); if (err) goto err_kfree_ssb; - name = dev_name(&dev->dev); - if (dev->driver && dev->driver->name) - name = dev->driver->name; - err = pci_request_regions(dev, name); + err = pci_request_regions(dev, dev_driver_string(&dev->dev)); if (err) goto err_pci_disable; pci_set_master(dev); From 5a72431ec318ed38b02a413b1e4ab934e5d3451e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:29 +0200 Subject: [PATCH 091/512] powerpc/eeh: Use dev_driver_string() instead of struct pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace pdev->driver->name by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. Move the function nearer its only user and instead of the ?: operator use a normal "if" which is more readable. 
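For context, dev_driver_string() (drivers/base/core.c, lightly abridged)
already handles the unbound case that the old ?: expression was open-coding:

    const char *dev_driver_string(const struct device *dev)
    {
        struct device_driver *drv = READ_ONCE(dev->driver);

        return drv ? drv->name :
                (dev->bus ? dev->bus->name :
                (dev->class ? dev->class->name : ""));
    }

so eeh_driver_name() keeps returning a usable string even when no driver is
bound to the device.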
Link: https://lore.kernel.org/r/20211004125935.2300113-6-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- arch/powerpc/include/asm/ppc-pci.h | 5 ----- arch/powerpc/kernel/eeh.c | 8 ++++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 2b9edbf6e929..f6cf0159024e 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -55,11 +55,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); void eeh_sysfs_add_device(struct pci_dev *pdev); void eeh_sysfs_remove_device(struct pci_dev *pdev); -static inline const char *eeh_driver_name(struct pci_dev *pdev) -{ - return (pdev && pdev->driver) ? pdev->driver->name : ""; -} - #endif /* CONFIG_EEH */ #define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index e9b597ed423c..4b08881c4a1e 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -399,6 +399,14 @@ out: return ret; } +static inline const char *eeh_driver_name(struct pci_dev *pdev) +{ + if (pdev) + return dev_driver_string(&pdev->dev); + + return ""; +} + /** * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze * @edev: eeh device From 1fbbcffd0ee1f9fc0a10be311e078c4f37f6c733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:51:53 -0500 Subject: [PATCH 092/512] crypto: hisilicon - use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/crypto/hisilicon/qm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 369562d34d66..8f361e54e524 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3085,7 +3085,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) }; int ret; - ret = strscpy(interface.name, pdev->driver->name, + ret = strscpy(interface.name, dev_driver_string(&pdev->dev), sizeof(interface.name)); if (ret < 0) return -ENAMETOOLONG; From e519d9ea62e821659d68c2f1f4d8a5c33ea70db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:52:59 -0500 Subject: [PATCH 093/512] net: hns3: use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 7ea511d59e91..f279edfce3f1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -606,7 +606,7 @@ static void hns3_get_drvinfo(struct net_device *netdev, return; } - strncpy(drvinfo->driver, h->pdev->driver->name, + strncpy(drvinfo->driver, dev_driver_string(&h->pdev->dev), sizeof(drvinfo->driver)); drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0'; From e14dc26013141d5e962c31c8ddfc488ef09c3c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:53:45 -0500 Subject: [PATCH 094/512] net: marvell: prestera: use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/net/ethernet/marvell/prestera/prestera_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c index a250d394da38..a8f007f6dad2 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c @@ -720,7 +720,7 @@ out_release: static int prestera_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - const char *driver_name = pdev->driver->name; + const char *driver_name = dev_driver_string(&pdev->dev); struct prestera_fw *fw; int err; From 40dbd5ffc27843582f98d2bc498409da0ba02359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:54:37 -0500 Subject: [PATCH 095/512] mlxsw: pci: Use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de Tested-by: Ido Schimmel Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Reviewed-by: Ido Schimmel --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 13b0259f7ea6..8f306364f7bf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1876,7 +1876,7 @@ static void mlxsw_pci_cmd_fini(struct mlxsw_pci *mlxsw_pci) static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - const char *driver_name = pdev->driver->name; + const char *driver_name = dev_driver_string(&pdev->dev); struct mlxsw_pci *mlxsw_pci; int err; From 230b1e54bd1476578e4ae336b7ac3bca13d81459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:55:03 -0500 Subject: [PATCH 096/512] nfp: use dev_driver_string() instead of pci_dev->driver->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dev->driver_name() by dev_driver_string() for the corresponding struct device. This is a step toward removing pci_dev->driver. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-8-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Acked-by: Simon Horman --- drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index 0685ece1f155..1de076f55740 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -202,7 +202,8 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev *pdev, { char nsp_version[ETHTOOL_FWVERS_LEN] = {}; - strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->driver, dev_driver_string(&pdev->dev), + sizeof(drvinfo->driver)); nfp_net_get_nspinfo(app, nsp_version); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%s %s %s %s", vnic_version, nsp_version, From e98754233c58bfe7cad67ed4b7f3980d7e0d731d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 13 Oct 2021 01:14:12 +0000 Subject: [PATCH 097/512] PCI: cpqphp: Format if-statement code block correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code block related to the if-statement in cpqhp_set_irq() is somewhat awkwardly formatted making the code hard to read. Make it readable. No change to functionality intended. 
Link: https://lore.kernel.org/r/20211013011412.1110829-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/cpqphp_pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/cpqphp_pci.c b/drivers/pci/hotplug/cpqphp_pci.c index 1b2b3f3b648b..9038039ad6db 100644 --- a/drivers/pci/hotplug/cpqphp_pci.c +++ b/drivers/pci/hotplug/cpqphp_pci.c @@ -189,8 +189,10 @@ int cpqhp_set_irq(u8 bus_num, u8 dev_num, u8 int_pin, u8 irq_num) /* This should only be for x86 as it sets the Edge Level * Control Register */ - outb((u8) (temp_word & 0xFF), 0x4d0); outb((u8) ((temp_word & - 0xFF00) >> 8), 0x4d1); rc = 0; } + outb((u8)(temp_word & 0xFF), 0x4d0); + outb((u8)((temp_word & 0xFF00) >> 8), 0x4d1); + rc = 0; + } return rc; } From af7cda832f8a16c5fd4dffdff4ca43790f94d4e8 Mon Sep 17 00:00:00 2001 From: Simon Xue Date: Wed, 18 Aug 2021 17:34:06 +0800 Subject: [PATCH 098/512] dt-bindings: rockchip: Add DesignWare based PCIe controller Document DT bindings for PCIe controller found on Rockchip SoC. Link: https://lore.kernel.org/r/20210818093406.157788-1-xxm@rock-chips.com Signed-off-by: Simon Xue Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Kever Yang --- .../bindings/pci/rockchip-dw-pcie.yaml | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml diff --git a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml new file mode 100644 index 000000000000..142bbe577763 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/rockchip-dw-pcie.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: DesignWare based PCIe controller on Rockchip SoCs + +maintainers: + - Shawn Lin + - Simon Xue + - Heiko Stuebner + +description: |+ + RK3568 SoC PCIe host controller is based on the Synopsys DesignWare + PCIe IP and thus inherits all the common properties defined in + designware-pcie.txt. 
+ +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + +# We need a select here so we don't match all nodes with 'snps,dw-pcie' +select: + properties: + compatible: + contains: + const: rockchip,rk3568-pcie + required: + - compatible + +properties: + compatible: + items: + - const: rockchip,rk3568-pcie + - const: snps,dw-pcie + + reg: + items: + - description: Data Bus Interface (DBI) registers + - description: Rockchip designed configuration registers + - description: Config registers + + reg-names: + items: + - const: dbi + - const: apb + - const: config + + clocks: + items: + - description: AHB clock for PCIe master + - description: AHB clock for PCIe slave + - description: AHB clock for PCIe dbi + - description: APB clock for PCIe + - description: Auxiliary clock for PCIe + + clock-names: + items: + - const: aclk_mst + - const: aclk_slv + - const: aclk_dbi + - const: pclk + - const: aux + + msi-map: true + + num-lanes: true + + phys: + maxItems: 1 + + phy-names: + const: pcie-phy + + power-domains: + maxItems: 1 + + ranges: + maxItems: 2 + + resets: + maxItems: 1 + + reset-names: + const: pipe + + vpcie3v3-supply: true + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - msi-map + - num-lanes + - phys + - phy-names + - power-domains + - resets + - reset-names + +unevaluatedProperties: false + +examples: + - | + + bus { + #address-cells = <2>; + #size-cells = <2>; + + pcie3x2: pcie@fe280000 { + compatible = "rockchip,rk3568-pcie", "snps,dw-pcie"; + reg = <0x3 0xc0800000 0x0 0x390000>, + <0x0 0xfe280000 0x0 0x10000>, + <0x3 0x80000000 0x0 0x100000>; + reg-names = "dbi", "apb", "config"; + bus-range = <0x20 0x2f>; + clocks = <&cru 143>, <&cru 144>, + <&cru 145>, <&cru 146>, + <&cru 147>; + clock-names = "aclk_mst", "aclk_slv", + "aclk_dbi", "pclk", + "aux"; + device_type = "pci"; + linux,pci-domain = <2>; + max-link-speed = <2>; + msi-map = <0x2000 &its 0x2000 0x1000>; + num-lanes = <2>; + phys = <&pcie30phy>; + phy-names = "pcie-phy"; + power-domains = <&power 15>; + ranges = <0x81000000 0x0 0x80800000 0x3 0x80800000 0x0 0x100000>, + <0x83000000 0x0 0x80900000 0x3 0x80900000 0x0 0x3f700000>; + resets = <&cru 193>; + reset-names = "pipe"; + #address-cells = <3>; + #size-cells = <2>; + }; + }; +... From 42cf2a633d5dc2b5abc440d61b6c1c5fb7dddbbb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 11 Aug 2021 16:25:30 +0200 Subject: [PATCH 099/512] PCI: vmd: depend on !UML With UML having enabled (simulated) PCI on UML, VMD breaks allyesconfig/allmodconfig compilation because it assumes it's running on X86_64 bare metal, and has hardcoded API use of ARCH=x86. Make it depend on !UML to fix this. Link: https://lore.kernel.org/r/20210811162530.affe26231bc3.I131b3c1e67e3d2ead6e98addd256c835fbef9a3e@changeid Signed-off-by: Johannes Berg Signed-off-by: Lorenzo Pieralisi Reviewed-by: Jon Derrick Acked-by: Randy Dunlap # build-tested --- drivers/pci/controller/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 326f7d13024f..9ae1ff198e57 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -254,7 +254,7 @@ config PCIE_MEDIATEK_GEN3 MediaTek SoCs. config VMD - depends on PCI_MSI && X86_64 && SRCU + depends on PCI_MSI && X86_64 && SRCU && !UML tristate "Intel Volume Management Device Driver" help Adds support for the Intel Volume Management Device (VMD). 
VMD is a From 1a323bd071dd81c9c136c06fad9e02c403b48aca Mon Sep 17 00:00:00 2001 From: Kelvin Cao Date: Thu, 14 Oct 2021 14:18:55 +0000 Subject: [PATCH 100/512] PCI/switchtec: Error out MRPC execution when MMIO reads fail A firmware hard reset may be initiated by various mechanisms including a UART interface, TWI sideband interface from BMC, MRPC command from userspace, etc. The switchtec management driver is unaware of these resets. The reset clears PCI state including the BARs and Memory Space Enable bits, so the device no longer responds to the MMIO accesses the driver uses to operate it. MMIO reads to the device will fail with a PCIe error. When the root complex handles that error, it typically fabricates ~0 data to complete the CPU read. Check for this sort of error by reading the device ID from MMIO space. This ID can never be ~0, so if we see that value, it probably means the PCIe Memory Read failed and we should return an error indication to the application using the switchtec driver. Link: https://lore.kernel.org/r/20211014141859.11444-2-kelvin.cao@microchip.com Signed-off-by: Kelvin Cao Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 67 ++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 0b301f8be9ed..e5bb2ac0e7bb 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -45,6 +45,7 @@ enum mrpc_state { MRPC_QUEUED, MRPC_RUNNING, MRPC_DONE, + MRPC_IO_ERROR, }; struct switchtec_user { @@ -66,6 +67,19 @@ struct switchtec_user { int event_cnt; }; +/* + * The MMIO reads to the device_id register should always return the device ID + * of the device, otherwise the firmware is probably stuck or unreachable + * due to a firmware reset which clears PCI state including the BARs and Memory + * Space Enable bits. 
+ */ +static int is_firmware_running(struct switchtec_dev *stdev) +{ + u32 device = ioread32(&stdev->mmio_sys_info->device_id); + + return stdev->pdev->device == device; +} + static struct switchtec_user *stuser_create(struct switchtec_dev *stdev) { struct switchtec_user *stuser; @@ -113,6 +127,7 @@ static void stuser_set_state(struct switchtec_user *stuser, [MRPC_QUEUED] = "QUEUED", [MRPC_RUNNING] = "RUNNING", [MRPC_DONE] = "DONE", + [MRPC_IO_ERROR] = "IO_ERROR", }; stuser->state = state; @@ -184,9 +199,26 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser) return 0; } +static void mrpc_cleanup_cmd(struct switchtec_dev *stdev) +{ + /* requires the mrpc_mutex to already be held when called */ + + struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next, + struct switchtec_user, list); + + stuser->cmd_done = true; + wake_up_interruptible(&stuser->cmd_comp); + list_del_init(&stuser->list); + stuser_put(stuser); + stdev->mrpc_busy = 0; + + mrpc_cmd_submit(stdev); +} + static void mrpc_complete_cmd(struct switchtec_dev *stdev) { /* requires the mrpc_mutex to already be held when called */ + struct switchtec_user *stuser; if (list_empty(&stdev->mrpc_queue)) @@ -223,13 +255,7 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev) memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data, stuser->read_len); out: - stuser->cmd_done = true; - wake_up_interruptible(&stuser->cmd_comp); - list_del_init(&stuser->list); - stuser_put(stuser); - stdev->mrpc_busy = 0; - - mrpc_cmd_submit(stdev); + mrpc_cleanup_cmd(stdev); } static void mrpc_event_work(struct work_struct *work) @@ -246,6 +272,23 @@ static void mrpc_event_work(struct work_struct *work) mutex_unlock(&stdev->mrpc_mutex); } +static void mrpc_error_complete_cmd(struct switchtec_dev *stdev) +{ + /* requires the mrpc_mutex to already be held when called */ + + struct switchtec_user *stuser; + + if (list_empty(&stdev->mrpc_queue)) + return; + + stuser = list_entry(stdev->mrpc_queue.next, + struct switchtec_user, list); + + stuser_set_state(stuser, MRPC_IO_ERROR); + + mrpc_cleanup_cmd(stdev); +} + static void mrpc_timeout_work(struct work_struct *work) { struct switchtec_dev *stdev; @@ -257,6 +300,11 @@ static void mrpc_timeout_work(struct work_struct *work) mutex_lock(&stdev->mrpc_mutex); + if (!is_firmware_running(stdev)) { + mrpc_error_complete_cmd(stdev); + goto out; + } + if (stdev->dma_mrpc) status = stdev->dma_mrpc->status; else @@ -544,6 +592,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data, if (rc) return rc; + if (stuser->state == MRPC_IO_ERROR) { + mutex_unlock(&stdev->mrpc_mutex); + return -EIO; + } + if (stuser->state != MRPC_DONE) { mutex_unlock(&stdev->mrpc_mutex); return -EBADE; From 551ec658b69807318aeef5506da886cf9587b251 Mon Sep 17 00:00:00 2001 From: Kelvin Cao Date: Thu, 14 Oct 2021 14:18:56 +0000 Subject: [PATCH 101/512] PCI/switchtec: Fix a MRPC error status handling issue If an error is encountered when executing a MRPC command, the firmware will set the status register to SWITCHTEC_MRPC_STATUS_ERROR and return the error code in the return value register. Add handling of SWITCHTEC_MRPC_STATUS_ERROR on status register when completing a MRPC command. 
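The resulting completion logic is a one-line widening of the status check
(sketch; the actual hunks follow):

    if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE &&
        stuser->status != SWITCHTEC_MRPC_STATUS_ERROR)
        goto out;       /* command still running, leave it queued */

after which the existing code reads back the MRPC return value register and
completes the command, so a failure inside the firmware reaches user space as
the firmware's error code instead of looking like a command that never
finished.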
Link: https://lore.kernel.org/r/20211014141859.11444-3-kelvin.cao@microchip.com Signed-off-by: Kelvin Cao Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index e5bb2ac0e7bb..5c300ff3921d 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -238,7 +238,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev) stuser_set_state(stuser, MRPC_DONE); stuser->return_code = 0; - if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE) + if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE && + stuser->status != SWITCHTEC_MRPC_STATUS_ERROR) goto out; if (stdev->dma_mrpc) @@ -622,7 +623,8 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data, out: mutex_unlock(&stdev->mrpc_mutex); - if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE) + if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE || + stuser->status == SWITCHTEC_MRPC_STATUS_ERROR) return size; else if (stuser->status == SWITCHTEC_MRPC_STATUS_INTERRUPTED) return -ENXIO; From 1420ac218abcc1df809eedfb0f7351941b01c785 Mon Sep 17 00:00:00 2001 From: Kelvin Cao Date: Thu, 14 Oct 2021 14:18:57 +0000 Subject: [PATCH 102/512] PCI/switchtec: Update the way of getting management VEP instance ID Gen4 firmware adds DMA VEP and NVMe VEP support in VEP (virtual EP) instance ID register in addtion to management EP, update the way of getting management VEP instance ID. Link: https://lore.kernel.org/r/20211014141859.11444-4-kelvin.cao@microchip.com Signed-off-by: Kelvin Cao Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 5c300ff3921d..97a93c9b4629 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1133,7 +1133,7 @@ static int ioctl_pff_to_port(struct switchtec_dev *stdev, break; } - reg = ioread32(&pcfg->vep_pff_inst_id); + reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF; if (reg == p.pff) { p.port = SWITCHTEC_IOCTL_PFF_VEP; break; @@ -1179,7 +1179,7 @@ static int ioctl_port_to_pff(struct switchtec_dev *stdev, p.pff = ioread32(&pcfg->usp_pff_inst_id); break; case SWITCHTEC_IOCTL_PFF_VEP: - p.pff = ioread32(&pcfg->vep_pff_inst_id); + p.pff = ioread32(&pcfg->vep_pff_inst_id) & 0xFF; break; default: if (p.port > ARRAY_SIZE(pcfg->dsp_pff_inst_id)) @@ -1553,7 +1553,7 @@ static void init_pff(struct switchtec_dev *stdev) if (reg < stdev->pff_csr_count) stdev->pff_local[reg] = 1; - reg = ioread32(&pcfg->vep_pff_inst_id); + reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF; if (reg < stdev->pff_csr_count) stdev->pff_local[reg] = 1; From 67116444cf55e1d070ee2060ffe4d1c72917ec31 Mon Sep 17 00:00:00 2001 From: Kelvin Cao Date: Thu, 14 Oct 2021 14:18:58 +0000 Subject: [PATCH 103/512] PCI/switchtec: Replace ENOTSUPP with EOPNOTSUPP ENOTSUPP is not a SUSV4 error code, and the following checkpatch.pl warning will be given for new patches which still use ENOTSUPP. WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP See the link below for the discussion. https://lore.kernel.org/netdev/20200511165319.2251678-1-kuba@kernel.org/ Replace ENOTSUPP with EOPNOTSUPP to align with future patches which will be using EOPNOTSUPP. 
Link: https://lore.kernel.org/r/20211014141859.11444-5-kelvin.cao@microchip.com Signed-off-by: Kelvin Cao Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 97a93c9b4629..236cb40cc7c5 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -376,7 +376,7 @@ static ssize_t field ## _show(struct device *dev, \ return io_string_show(buf, &si->gen4.field, \ sizeof(si->gen4.field)); \ else \ - return -ENOTSUPP; \ + return -EOPNOTSUPP; \ } \ \ static DEVICE_ATTR_RO(field) @@ -668,7 +668,7 @@ static int ioctl_flash_info(struct switchtec_dev *stdev, info.flash_length = ioread32(&fi->gen4.flash_length); info.num_partitions = SWITCHTEC_NUM_PARTITIONS_GEN4; } else { - return -ENOTSUPP; + return -EOPNOTSUPP; } if (copy_to_user(uinfo, &info, sizeof(info))) @@ -876,7 +876,7 @@ static int ioctl_flash_part_info(struct switchtec_dev *stdev, if (ret) return ret; } else { - return -ENOTSUPP; + return -EOPNOTSUPP; } if (copy_to_user(uinfo, &info, sizeof(info))) @@ -1611,7 +1611,7 @@ static int switchtec_init_pci(struct switchtec_dev *stdev, else if (stdev->gen == SWITCHTEC_GEN4) part_id = &stdev->mmio_sys_info->gen4.partition_id; else - return -ENOTSUPP; + return -EOPNOTSUPP; stdev->partition = ioread8(part_id); stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count); From 9f37ab0412eba537377c38b1dde1a04fbd7b5264 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 14 Oct 2021 14:18:59 +0000 Subject: [PATCH 104/512] PCI/switchtec: Add check of event support Not all events are supported by every gen/variant of the Switchtec firmware. To solve this, since Gen4, a new bit in each event header is introduced to indicate if an event is supported by the firmware. 
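The check itself is a single header-bit test (sketch; the actual hunks are
below):

    hdr = ioread32(reg);
    if (hdr & SWITCHTEC_EVENT_NOT_SUPP)     /* BIT(31) of the event header */
        return -EOPNOTSUPP;

applied both where user space controls events and where the driver masks
them, so unsupported events are skipped instead of being acted on blindly.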
Link: https://lore.kernel.org/r/20211014141859.11444-6-kelvin.cao@microchip.com Signed-off-by: Logan Gunthorpe Signed-off-by: Kelvin Cao Signed-off-by: Bjorn Helgaas --- drivers/pci/switch/switchtec.c | 8 +++++++- include/linux/switchtec.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 236cb40cc7c5..38c2b036fb8e 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -1024,6 +1024,9 @@ static int event_ctl(struct switchtec_dev *stdev, return PTR_ERR(reg); hdr = ioread32(reg); + if (hdr & SWITCHTEC_EVENT_NOT_SUPP) + return -EOPNOTSUPP; + for (i = 0; i < ARRAY_SIZE(ctl->data); i++) ctl->data[i] = ioread32(®[i + 1]); @@ -1096,7 +1099,7 @@ static int ioctl_event_ctl(struct switchtec_dev *stdev, for (ctl.index = 0; ctl.index < nr_idxs; ctl.index++) { ctl.flags = event_flags; ret = event_ctl(stdev, &ctl); - if (ret < 0) + if (ret < 0 && ret != -EOPNOTSUPP) return ret; } } else { @@ -1403,6 +1406,9 @@ static int mask_event(struct switchtec_dev *stdev, int eid, int idx) hdr_reg = event_regs[eid].map_reg(stdev, off, idx); hdr = ioread32(hdr_reg); + if (hdr & SWITCHTEC_EVENT_NOT_SUPP) + return 0; + if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ)) return 0; diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 082f1d51957a..be24056ac00f 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -19,6 +19,7 @@ #define SWITCHTEC_EVENT_EN_CLI BIT(2) #define SWITCHTEC_EVENT_EN_IRQ BIT(3) #define SWITCHTEC_EVENT_FATAL BIT(4) +#define SWITCHTEC_EVENT_NOT_SUPP BIT(31) #define SWITCHTEC_DMA_MRPC_EN BIT(0) From b89ff410253d7468f84720d2d5c2bb0bafedf3bd Mon Sep 17 00:00:00 2001 From: Prasad Malisetty Date: Thu, 7 Oct 2021 23:18:42 +0530 Subject: [PATCH 105/512] PCI: qcom: Replace ops with struct pcie_cfg in pcie match data Add struct qcom_pcie_cfg as match data for all platforms. Assign appropriate platform ops into struct qcom_pcie_cfg and read using of_device_get_match_data() in qcom_pcie_probe(). 
Link: https://lore.kernel.org/r/1633628923-25047-5-git-send-email-pmaliset@codeaurora.org Signed-off-by: Prasad Malisetty Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Stephen Boyd --- drivers/pci/controller/dwc/pcie-qcom.c | 66 +++++++++++++++++++++----- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 8a7a300163e5..1c3c01ccb831 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -189,6 +189,10 @@ struct qcom_pcie_ops { int (*config_sid)(struct qcom_pcie *pcie); }; +struct qcom_pcie_cfg { + const struct qcom_pcie_ops *ops; +}; + struct qcom_pcie { struct dw_pcie *pci; void __iomem *parf; /* DT parf */ @@ -1456,6 +1460,38 @@ static const struct qcom_pcie_ops ops_1_9_0 = { .config_sid = qcom_pcie_config_sid_sm8250, }; +static const struct qcom_pcie_cfg apq8084_cfg = { + .ops = &ops_1_0_0, +}; + +static const struct qcom_pcie_cfg ipq8064_cfg = { + .ops = &ops_2_1_0, +}; + +static const struct qcom_pcie_cfg msm8996_cfg = { + .ops = &ops_2_3_2, +}; + +static const struct qcom_pcie_cfg ipq8074_cfg = { + .ops = &ops_2_3_3, +}; + +static const struct qcom_pcie_cfg ipq4019_cfg = { + .ops = &ops_2_4_0, +}; + +static const struct qcom_pcie_cfg sdm845_cfg = { + .ops = &ops_2_7_0, +}; + +static const struct qcom_pcie_cfg sm8250_cfg = { + .ops = &ops_1_9_0, +}; + +static const struct qcom_pcie_cfg sc7280_cfg = { + .ops = &ops_1_9_0, +}; + static const struct dw_pcie_ops dw_pcie_ops = { .link_up = qcom_pcie_link_up, .start_link = qcom_pcie_start_link, @@ -1467,6 +1503,7 @@ static int qcom_pcie_probe(struct platform_device *pdev) struct pcie_port *pp; struct dw_pcie *pci; struct qcom_pcie *pcie; + const struct qcom_pcie_cfg *pcie_cfg; int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); @@ -1488,7 +1525,13 @@ static int qcom_pcie_probe(struct platform_device *pdev) pcie->pci = pci; - pcie->ops = of_device_get_match_data(dev); + pcie_cfg = of_device_get_match_data(dev); + if (!pcie_cfg || !pcie_cfg->ops) { + dev_err(dev, "Invalid platform data\n"); + return -EINVAL; + } + + pcie->ops = pcie_cfg->ops; pcie->reset = devm_gpiod_get_optional(dev, "perst", GPIOD_OUT_HIGH); if (IS_ERR(pcie->reset)) { @@ -1545,16 +1588,17 @@ err_pm_runtime_put: } static const struct of_device_id qcom_pcie_match[] = { - { .compatible = "qcom,pcie-apq8084", .data = &ops_1_0_0 }, - { .compatible = "qcom,pcie-ipq8064", .data = &ops_2_1_0 }, - { .compatible = "qcom,pcie-ipq8064-v2", .data = &ops_2_1_0 }, - { .compatible = "qcom,pcie-apq8064", .data = &ops_2_1_0 }, - { .compatible = "qcom,pcie-msm8996", .data = &ops_2_3_2 }, - { .compatible = "qcom,pcie-ipq8074", .data = &ops_2_3_3 }, - { .compatible = "qcom,pcie-ipq4019", .data = &ops_2_4_0 }, - { .compatible = "qcom,pcie-qcs404", .data = &ops_2_4_0 }, - { .compatible = "qcom,pcie-sdm845", .data = &ops_2_7_0 }, - { .compatible = "qcom,pcie-sm8250", .data = &ops_1_9_0 }, + { .compatible = "qcom,pcie-apq8084", .data = &apq8084_cfg }, + { .compatible = "qcom,pcie-ipq8064", .data = &ipq8064_cfg }, + { .compatible = "qcom,pcie-ipq8064-v2", .data = &ipq8064_cfg }, + { .compatible = "qcom,pcie-apq8064", .data = &ipq8064_cfg }, + { .compatible = "qcom,pcie-msm8996", .data = &msm8996_cfg }, + { .compatible = "qcom,pcie-ipq8074", .data = &ipq8074_cfg }, + { .compatible = "qcom,pcie-ipq4019", .data = &ipq4019_cfg }, + { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg }, + { .compatible = 
"qcom,pcie-sdm845", .data = &sdm845_cfg }, + { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, + { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg }, { } }; From aa9c0df98c2920f7176b001737546c6595f594a4 Mon Sep 17 00:00:00 2001 From: Prasad Malisetty Date: Thu, 7 Oct 2021 23:18:43 +0530 Subject: [PATCH 106/512] PCI: qcom: Switch pcie_1_pipe_clk_src after PHY init in SC7280 On the SC7280, the clock source for gcc_pcie_1_pipe_clk_src must be the TCXO while gdsc is enabled. After PHY init successful clock source should switch to pipe clock for gcc_pcie_1_pipe_clk_src. Link: https://lore.kernel.org/r/1633628923-25047-6-git-send-email-pmaliset@codeaurora.org Signed-off-by: Prasad Malisetty Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Stephen Boyd --- drivers/pci/controller/dwc/pcie-qcom.c | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 1c3c01ccb831..c4ff996dc443 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -166,6 +166,9 @@ struct qcom_pcie_resources_2_7_0 { struct regulator_bulk_data supplies[2]; struct reset_control *pci_reset; struct clk *pipe_clk; + struct clk *pipe_clk_src; + struct clk *phy_pipe_clk; + struct clk *ref_clk_src; }; union qcom_pcie_resources { @@ -191,6 +194,7 @@ struct qcom_pcie_ops { struct qcom_pcie_cfg { const struct qcom_pcie_ops *ops; + unsigned int pipe_clk_need_muxing:1; }; struct qcom_pcie { @@ -201,6 +205,7 @@ struct qcom_pcie { struct phy *phy; struct gpio_desc *reset; const struct qcom_pcie_ops *ops; + unsigned int pipe_clk_need_muxing:1; }; #define to_qcom_pcie(x) dev_get_drvdata((x)->dev) @@ -1171,6 +1176,20 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) if (ret < 0) return ret; + if (pcie->pipe_clk_need_muxing) { + res->pipe_clk_src = devm_clk_get(dev, "pipe_mux"); + if (IS_ERR(res->pipe_clk_src)) + return PTR_ERR(res->pipe_clk_src); + + res->phy_pipe_clk = devm_clk_get(dev, "phy_pipe"); + if (IS_ERR(res->phy_pipe_clk)) + return PTR_ERR(res->phy_pipe_clk); + + res->ref_clk_src = devm_clk_get(dev, "ref"); + if (IS_ERR(res->ref_clk_src)) + return PTR_ERR(res->ref_clk_src); + } + res->pipe_clk = devm_clk_get(dev, "pipe"); return PTR_ERR_OR_ZERO(res->pipe_clk); } @@ -1189,6 +1208,10 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie) return ret; } + /* Set TCXO as clock source for pcie_pipe_clk_src */ + if (pcie->pipe_clk_need_muxing) + clk_set_parent(res->pipe_clk_src, res->ref_clk_src); + ret = clk_bulk_prepare_enable(res->num_clks, res->clks); if (ret < 0) goto err_disable_regulators; @@ -1260,6 +1283,10 @@ static int qcom_pcie_post_init_2_7_0(struct qcom_pcie *pcie) { struct qcom_pcie_resources_2_7_0 *res = &pcie->res.v2_7_0; + /* Set pipe clock as clock source for pcie_pipe_clk_src */ + if (pcie->pipe_clk_need_muxing) + clk_set_parent(res->pipe_clk_src, res->phy_pipe_clk); + return clk_prepare_enable(res->pipe_clk); } @@ -1490,6 +1517,7 @@ static const struct qcom_pcie_cfg sm8250_cfg = { static const struct qcom_pcie_cfg sc7280_cfg = { .ops = &ops_1_9_0, + .pipe_clk_need_muxing = true, }; static const struct dw_pcie_ops dw_pcie_ops = { @@ -1532,6 +1560,7 @@ static int qcom_pcie_probe(struct platform_device *pdev) } pcie->ops = pcie_cfg->ops; + pcie->pipe_clk_need_muxing = pcie_cfg->pipe_clk_need_muxing; pcie->reset = devm_gpiod_get_optional(dev, "perst", GPIOD_OUT_HIGH); if (IS_ERR(pcie->reset)) { From 
45a3ec891370bba7984bb4cf57299a6f5744b91e Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 23 Aug 2021 08:49:58 -0700 Subject: [PATCH 107/512] PCI: qcom: Add sc8180x compatible The SC8180x platform comes with 4 PCIe controllers, typically used for things such as NVME storage or connecting a SDX55 5G modem. Add a compatible for this, that just reuses the 1.9.0 ops. Link: https://lore.kernel.org/linux-arm-msm/20210725040038.3966348-4-bjorn.andersson@linaro.org/ Link: https://lore.kernel.org/r/20210823154958.305677-2-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson [lorenzo.pieralisi@arm.com: updated match data structure] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/qcom,pcie.txt | 5 +++-- drivers/pci/controller/dwc/pcie-qcom.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt index 3f646875f8c2..a0ae024c2d0c 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt @@ -12,6 +12,7 @@ - "qcom,pcie-ipq4019" for ipq4019 - "qcom,pcie-ipq8074" for ipq8074 - "qcom,pcie-qcs404" for qcs404 + - "qcom,pcie-sc8180x" for sc8180x - "qcom,pcie-sdm845" for sdm845 - "qcom,pcie-sm8250" for sm8250 - "qcom,pcie-ipq6018" for ipq6018 @@ -156,7 +157,7 @@ - "pipe" PIPE clock - clock-names: - Usage: required for sm8250 + Usage: required for sc8180x and sm8250 Value type: Definition: Should contain the following entries - "aux" Auxiliary clock @@ -245,7 +246,7 @@ - "ahb" AHB reset - reset-names: - Usage: required for sdm845 and sm8250 + Usage: required for sc8180x, sdm845 and sm8250 Value type: Definition: Should contain the following entries - "pci" PCIe core reset diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index c4ff996dc443..1c3d1116bb60 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -1627,6 +1627,7 @@ static const struct of_device_id qcom_pcie_match[] = { { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg }, { .compatible = "qcom,pcie-sdm845", .data = &sdm845_cfg }, { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, + { .compatible = "qcom,pcie-sc8180x", .data = &sm8250_cfg }, { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg }, { } }; From 4caab28a6215da5f3c1b505ff08810bc6acfe365 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Sat, 18 Sep 2021 09:22:59 +0900 Subject: [PATCH 108/512] PCI: uniphier: Serialize INTx masking/unmasking and fix the bit operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The condition register PCI_RCV_INTX is used in irq_mask() and irq_unmask() callbacks. Accesses to register can occur at the same time without a lock. Add a lock into each callback to prevent the issue. And INTX mask and unmask fields in PCL_RCV_INTX register should only be set/reset for each bit. Clearing by PCL_RCV_INTX_ALL_MASK should be removed. INTX status fields in PCL_RCV_INTX register only indicates each INTX interrupt status, so the handler can't clear by writing 1 to the field. The status is expected to be cleared by the interrupt origin. The ack function has no meaning, so should remove it. 
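As a condensed sketch of the change below, each callback becomes a locked read-modify-write that touches only its own mask bit:

        raw_spin_lock_irqsave(&pp->lock, flags);

        val = readl(priv->base + PCL_RCV_INTX);
        val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT);  /* mask: set; unmask: clear */
        writel(val, priv->base + PCL_RCV_INTX);

        raw_spin_unlock_irqrestore(&pp->lock, flags);
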
Suggested-by: Pali Rohár Link: https://lore.kernel.org/r/1631924579-24567-1-git-send-email-hayashi.kunihiko@socionext.com Fixes: 7e6d5cd88a6f ("PCI: uniphier: Add UniPhier PCIe host controller support") Signed-off-by: Kunihiko Hayashi Signed-off-by: Lorenzo Pieralisi Acked-by: Pali Rohár Acked-by: Marc Zyngier --- drivers/pci/controller/dwc/pcie-uniphier.c | 26 +++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-uniphier.c b/drivers/pci/controller/dwc/pcie-uniphier.c index d842fd018129..d05be942956e 100644 --- a/drivers/pci/controller/dwc/pcie-uniphier.c +++ b/drivers/pci/controller/dwc/pcie-uniphier.c @@ -168,30 +168,21 @@ static void uniphier_pcie_irq_enable(struct uniphier_pcie_priv *priv) writel(PCL_RCV_INTX_ALL_ENABLE, priv->base + PCL_RCV_INTX); } -static void uniphier_pcie_irq_ack(struct irq_data *d) -{ - struct pcie_port *pp = irq_data_get_irq_chip_data(d); - struct dw_pcie *pci = to_dw_pcie_from_pp(pp); - struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci); - u32 val; - - val = readl(priv->base + PCL_RCV_INTX); - val &= ~PCL_RCV_INTX_ALL_STATUS; - val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_STATUS_SHIFT); - writel(val, priv->base + PCL_RCV_INTX); -} - static void uniphier_pcie_irq_mask(struct irq_data *d) { struct pcie_port *pp = irq_data_get_irq_chip_data(d); struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci); + unsigned long flags; u32 val; + raw_spin_lock_irqsave(&pp->lock, flags); + val = readl(priv->base + PCL_RCV_INTX); - val &= ~PCL_RCV_INTX_ALL_MASK; val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT); writel(val, priv->base + PCL_RCV_INTX); + + raw_spin_unlock_irqrestore(&pp->lock, flags); } static void uniphier_pcie_irq_unmask(struct irq_data *d) @@ -199,17 +190,20 @@ static void uniphier_pcie_irq_unmask(struct irq_data *d) struct pcie_port *pp = irq_data_get_irq_chip_data(d); struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci); + unsigned long flags; u32 val; + raw_spin_lock_irqsave(&pp->lock, flags); + val = readl(priv->base + PCL_RCV_INTX); - val &= ~PCL_RCV_INTX_ALL_MASK; val &= ~BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT); writel(val, priv->base + PCL_RCV_INTX); + + raw_spin_unlock_irqrestore(&pp->lock, flags); } static struct irq_chip uniphier_pcie_irq_chip = { .name = "PCI", - .irq_ack = uniphier_pcie_irq_ack, .irq_mask = uniphier_pcie_irq_mask, .irq_unmask = uniphier_pcie_irq_unmask, }; From 34ab316d7287692bad41ce4a431b43d60a8bd20f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Oct 2021 18:22:08 -0500 Subject: [PATCH 109/512] xen/pcifront: Drop pcifront_common_process() tests of pcidev, pdrv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcifront_common_process() exits early if pcidev or pcidev->driver are NULL, so simplify it by not checking them again. 
[bhelgaas: split flag change to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-4-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Reviewed-by: Boris Ostrovsky Reviewed-by: Christoph Hellwig --- drivers/pci/xen-pcifront.c | 52 +++++++++++++------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 2156c632524d..aa3b84823e68 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -588,61 +588,43 @@ static pci_ers_result_t pcifront_common_process(int cmd, struct pcifront_device *pdev, pci_channel_state_t state) { - pci_ers_result_t result; struct pci_driver *pdrv; int bus = pdev->sh_info->aer_op.bus; int devfn = pdev->sh_info->aer_op.devfn; int domain = pdev->sh_info->aer_op.domain; struct pci_dev *pcidev; - int flag = 0; dev_dbg(&pdev->xdev->dev, "pcifront AER process: cmd %x (bus:%x, devfn%x)", cmd, bus, devfn); - result = PCI_ERS_RESULT_NONE; pcidev = pci_get_domain_bus_and_slot(domain, bus, devfn); if (!pcidev || !pcidev->driver) { dev_err(&pdev->xdev->dev, "device or AER driver is NULL\n"); pci_dev_put(pcidev); - return result; + return PCI_ERS_RESULT_NONE; } pdrv = pcidev->driver; - if (pdrv) { - if (pdrv->err_handler && pdrv->err_handler->error_detected) { - pci_dbg(pcidev, "trying to call AER service\n"); - if (pcidev) { - flag = 1; - switch (cmd) { - case XEN_PCI_OP_aer_detected: - result = pdrv->err_handler-> - error_detected(pcidev, state); - break; - case XEN_PCI_OP_aer_mmio: - result = pdrv->err_handler-> - mmio_enabled(pcidev); - break; - case XEN_PCI_OP_aer_slotreset: - result = pdrv->err_handler-> - slot_reset(pcidev); - break; - case XEN_PCI_OP_aer_resume: - pdrv->err_handler->resume(pcidev); - break; - default: - dev_err(&pdev->xdev->dev, - "bad request in aer recovery " - "operation!\n"); - - } - } + if (pdrv->err_handler && pdrv->err_handler->error_detected) { + pci_dbg(pcidev, "trying to call AER service\n"); + switch (cmd) { + case XEN_PCI_OP_aer_detected: + return pdrv->err_handler->error_detected(pcidev, state); + case XEN_PCI_OP_aer_mmio: + return pdrv->err_handler->mmio_enabled(pcidev); + case XEN_PCI_OP_aer_slotreset: + return pdrv->err_handler->slot_reset(pcidev); + case XEN_PCI_OP_aer_resume: + pdrv->err_handler->resume(pcidev); + return PCI_ERS_RESULT_NONE; + default: + dev_err(&pdev->xdev->dev, + "bad request in aer recovery operation!\n"); } } - if (!flag) - result = PCI_ERS_RESULT_NONE; - return result; + return PCI_ERS_RESULT_NONE; } From 43e85554d4ed1cb2eec417cc43cb5fc60157235e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:35:34 -0500 Subject: [PATCH 110/512] xen/pcifront: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pdev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. 
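As a condensed sketch of the change below, the driver is looked up through the embedded struct device instead of the cached pci_dev field:

        /* driver lookup without the cached pci_dev->driver field */
        if (!pcidev->dev.driver)
                return PCI_ERS_RESULT_NONE;     /* nothing bound */

        pdrv = to_pci_driver(pcidev->dev.driver);
        if (pdrv->err_handler && pdrv->err_handler->error_detected)
                return pdrv->err_handler->error_detected(pcidev, state);
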
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Reviewed-by: Boris Ostrovsky --- drivers/pci/xen-pcifront.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index aa3b84823e68..d858d25b6cab 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -599,12 +599,12 @@ static pci_ers_result_t pcifront_common_process(int cmd, cmd, bus, devfn); pcidev = pci_get_domain_bus_and_slot(domain, bus, devfn); - if (!pcidev || !pcidev->driver) { + if (!pcidev || !pcidev->dev.driver) { dev_err(&pdev->xdev->dev, "device or AER driver is NULL\n"); pci_dev_put(pcidev); return PCI_ERS_RESULT_NONE; } - pdrv = pcidev->driver; + pdrv = to_pci_driver(pcidev->dev.driver); if (pdrv->err_handler && pdrv->err_handler->error_detected) { pci_dbg(pcidev, "trying to call AER service\n"); From 3134689f98f9e09004a4727370adc46e7635b4be Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 15 Oct 2021 13:58:40 -0500 Subject: [PATCH 111/512] PCI/portdrv: Rename pm_iter() to pcie_port_device_iter() Rename pm_iter() to pcie_port_device_iter() and make it visible outside CONFIG_PM and portdrv_core.c so it can be used for pciehp slot reset recovery. [bhelgaas: split into its own patch] Link: https://lore.kernel.org/linux-pci/08c046b0-c9f2-3489-eeef-7e7aca435bb9@gmail.com/ Link: https://lore.kernel.org/r/251f4edcc04c14f873ff1c967bc686169cd07d2d.1627638184.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/portdrv.h | 1 + drivers/pci/pcie/portdrv_core.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 2ff5724b8f13..6126ee4676a7 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -110,6 +110,7 @@ void pcie_port_service_unregister(struct pcie_port_service_driver *new); extern struct bus_type pcie_port_bus_type; int pcie_port_device_register(struct pci_dev *dev); +int pcie_port_device_iter(struct device *dev, void *data); #ifdef CONFIG_PM int pcie_port_device_suspend(struct device *dev); int pcie_port_device_resume_noirq(struct device *dev); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 3ee63968deaa..604feeb84ee4 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -367,24 +367,24 @@ error_disable: return status; } -#ifdef CONFIG_PM -typedef int (*pcie_pm_callback_t)(struct pcie_device *); +typedef int (*pcie_callback_t)(struct pcie_device *); -static int pm_iter(struct device *dev, void *data) +int pcie_port_device_iter(struct device *dev, void *data) { struct pcie_port_service_driver *service_driver; size_t offset = *(size_t *)data; - pcie_pm_callback_t cb; + pcie_callback_t cb; if ((dev->bus == &pcie_port_bus_type) && dev->driver) { service_driver = to_service_driver(dev->driver); - cb = *(pcie_pm_callback_t *)((void *)service_driver + offset); + cb = *(pcie_callback_t *)((void *)service_driver + offset); if (cb) return cb(to_pcie_device(dev)); } return 0; } +#ifdef CONFIG_PM /** * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle @@ -392,13 +392,13 @@ static int pm_iter(struct device *dev, void *data) int pcie_port_device_suspend(struct device *dev) { size_t off = offsetof(struct 
pcie_port_service_driver, suspend); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } int pcie_port_device_resume_noirq(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume_noirq); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } /** @@ -408,7 +408,7 @@ int pcie_port_device_resume_noirq(struct device *dev) int pcie_port_device_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } /** @@ -418,7 +418,7 @@ int pcie_port_device_resume(struct device *dev) int pcie_port_device_runtime_suspend(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_suspend); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } /** @@ -428,7 +428,7 @@ int pcie_port_device_runtime_suspend(struct device *dev) int pcie_port_device_runtime_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_resume); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } #endif /* PM */ From ea401499e943c307e6d44af6c2b4e068643e7884 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 31 Jul 2021 14:39:01 +0200 Subject: [PATCH 112/512] PCI: pciehp: Ignore Link Down/Up caused by error-induced Hot Reset Stuart Hayes reports that an error handled by DPC at a Root Port results in pciehp gratuitously bringing down a subordinate hotplug port: RP -- UP -- DP -- UP -- DP (hotplug) -- EP pciehp brings the slot down because the Link to the Endpoint goes down. That is caused by a Hot Reset being propagated as a result of DPC. Per PCIe Base Spec 5.0, section 6.6.1 "Conventional Reset": For a Switch, the following must cause a hot reset to be sent on all Downstream Ports: [...] * The Data Link Layer of the Upstream Port reporting DL_Down status. In Switches that support Link speeds greater than 5.0 GT/s, the Upstream Port must direct the LTSSM of each Downstream Port to the Hot Reset state, but not hold the LTSSMs in that state. This permits each Downstream Port to begin Link training immediately after its hot reset completes. This behavior is recommended for all Switches. * Receiving a hot reset on the Upstream Port. Once DPC recovers, pcie_do_recovery() walks down the hierarchy and invokes pcie_portdrv_slot_reset() to restore each port's config space. At that point, a hotplug interrupt is signaled per PCIe Base Spec r5.0, section 6.7.3.4 "Software Notification of Hot-Plug Events": If the Port is enabled for edge-triggered interrupt signaling using MSI or MSI-X, an interrupt message must be sent every time the logical AND of the following conditions transitions from FALSE to TRUE: [...] * The Hot-Plug Interrupt Enable bit in the Slot Control register is set to 1b. * At least one hot-plug event status bit in the Slot Status register and its associated enable bit in the Slot Control register are both set to 1b. Prevent pciehp from gratuitously bringing down the slot by clearing the error-induced Data Link Layer State Changed event before restoring config space. Afterwards, check whether the link has unexpectedly failed to retrain and synthesize a DLLSC event if so. 
Allow each pcie_port_service_driver (one of them being pciehp) to define a slot_reset callback and re-use the existing pm_iter() function to iterate over the callbacks. Thereby, the Endpoint driver remains bound throughout error recovery and may restore the device to working state. Surprise removal during error recovery is detected through a Presence Detect Changed event. The hotplug port is expected to not signal that event as a result of a Hot Reset. The issue isn't DPC-specific, it also occurs when an error is handled by AER through aer_root_reset(). So while the issue was noticed only now, it's been around since 2006 when AER support was first introduced. [bhelgaas: drop PCI_ERROR_RECOVERY Kconfig, split pm_iter() rename to preparatory patch] Link: https://lore.kernel.org/linux-pci/08c046b0-c9f2-3489-eeef-7e7aca435bb9@gmail.com/ Fixes: 6c2b374d7485 ("PCI-Express AER implemetation: AER core and aerdriver") Link: https://lore.kernel.org/r/251f4edcc04c14f873ff1c967bc686169cd07d2d.1627638184.git.lukas@wunner.de Reported-by: Stuart Hayes Tested-by: Stuart Hayes Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v2.6.19+: ba952824e6c1: PCI/portdrv: Report reset for frozen channel Cc: Keith Busch --- drivers/pci/hotplug/pciehp.h | 2 ++ drivers/pci/hotplug/pciehp_core.c | 2 ++ drivers/pci/hotplug/pciehp_hpc.c | 26 ++++++++++++++++++++++++++ drivers/pci/pcie/portdrv.h | 2 ++ drivers/pci/pcie/portdrv_pci.c | 3 +++ 5 files changed, 35 insertions(+) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 69fd401691be..918dccbc74b6 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -189,6 +189,8 @@ int pciehp_get_attention_status(struct hotplug_slot *hotplug_slot, u8 *status); int pciehp_set_raw_indicator_status(struct hotplug_slot *h_slot, u8 status); int pciehp_get_raw_indicator_status(struct hotplug_slot *h_slot, u8 *status); +int pciehp_slot_reset(struct pcie_device *dev); + static inline const char *slot_name(struct controller *ctrl) { return hotplug_slot_name(&ctrl->hotplug_slot); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index ad3393930ecb..f34114d45259 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -351,6 +351,8 @@ static struct pcie_port_service_driver hpdriver_portdrv = { .runtime_suspend = pciehp_runtime_suspend, .runtime_resume = pciehp_runtime_resume, #endif /* PM */ + + .slot_reset = pciehp_slot_reset, }; int __init pcie_hp_init(void) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 3024d7e85e6a..83a0fa119cae 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -862,6 +862,32 @@ void pcie_disable_interrupt(struct controller *ctrl) pcie_write_cmd(ctrl, 0, mask); } +/** + * pciehp_slot_reset() - ignore link event caused by error-induced hot reset + * @dev: PCI Express port service device + * + * Called from pcie_portdrv_slot_reset() after AER or DPC initiated a reset + * further up in the hierarchy to recover from an error. The reset was + * propagated down to this hotplug port. Ignore the resulting link flap. + * If the link failed to retrain successfully, synthesize the ignored event. + * Surprise removal during reset is detected through Presence Detect Changed. 
+ */ +int pciehp_slot_reset(struct pcie_device *dev) +{ + struct controller *ctrl = get_service_data(dev); + + if (ctrl->state != ON_STATE) + return 0; + + pcie_capability_write_word(dev->port, PCI_EXP_SLTSTA, + PCI_EXP_SLTSTA_DLLSC); + + if (!pciehp_check_link_active(ctrl)) + pciehp_request(ctrl, PCI_EXP_SLTSTA_DLLSC); + + return 0; +} + /* * pciehp has a 1:1 bus:slot relationship so we ultimately want a secondary * bus reset of the bridge, but at the same time we want to ensure that it is diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 6126ee4676a7..41fe1ffd5907 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -85,6 +85,8 @@ struct pcie_port_service_driver { int (*runtime_suspend)(struct pcie_device *dev); int (*runtime_resume)(struct pcie_device *dev); + int (*slot_reset)(struct pcie_device *dev); + /* Device driver may resume normal operations */ void (*error_resume)(struct pci_dev *dev); diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index c7ff1eea225a..1af74c3d9d5d 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -160,6 +160,9 @@ static pci_ers_result_t pcie_portdrv_error_detected(struct pci_dev *dev, static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) { + size_t off = offsetof(struct pcie_port_service_driver, slot_reset); + device_for_each_child(&dev->dev, &off, pcie_port_device_iter); + pci_restore_state(dev); pci_save_state(dev); return PCI_ERS_RESULT_RECOVERED; From 80dcd36c388a34d5c2d0b9c8c171887903dbefbb Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 31 Jul 2021 14:39:02 +0200 Subject: [PATCH 113/512] PCI/portdrv: Remove unused resume err_handler Commit 3e41a317ae45 ("PCI/AER: Remove unused aer_error_resume()") removed the resume err_handler from AER. Since no other port service implements the callback, support for it can be removed from portdrv. It can be revived later if need be, preferably by re-using the pcie_port_device_iter() iterator. 
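Purely as a hypothetical sketch of such a revival (the member would have to be re-added, and with the int (*)(struct pcie_device *) signature that pcie_port_device_iter() expects, not the old pci_dev-based one):

        /* hypothetical: assumes ->error_resume is re-added with the
         * pcie_device-based signature used by pcie_port_device_iter() */
        size_t off = offsetof(struct pcie_port_service_driver, error_resume);

        device_for_each_child(&dev->dev, &off, pcie_port_device_iter);
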
Link: https://lore.kernel.org/r/25334149b604e005058aeb0fdf51e01f991d5d74.1627638184.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Cc: Keith Busch --- drivers/pci/pcie/portdrv.h | 3 --- drivers/pci/pcie/portdrv_pci.c | 24 ------------------------ 2 files changed, 27 deletions(-) diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 41fe1ffd5907..e9fe0fef6cde 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -87,9 +87,6 @@ struct pcie_port_service_driver { int (*slot_reset)(struct pcie_device *dev); - /* Device driver may resume normal operations */ - void (*error_resume)(struct pci_dev *dev); - int port_type; /* Type of the port this driver can handle */ u32 service; /* Port service this device represents */ diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 1af74c3d9d5d..35eca6277a96 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -173,29 +173,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev) return PCI_ERS_RESULT_RECOVERED; } -static int resume_iter(struct device *device, void *data) -{ - struct pcie_device *pcie_device; - struct pcie_port_service_driver *driver; - - if (device->bus == &pcie_port_bus_type && device->driver) { - driver = to_service_driver(device->driver); - if (driver && driver->error_resume) { - pcie_device = to_pcie_device(device); - - /* Forward error message to service drivers */ - driver->error_resume(pcie_device->port); - } - } - - return 0; -} - -static void pcie_portdrv_err_resume(struct pci_dev *dev) -{ - device_for_each_child(&dev->dev, NULL, resume_iter); -} - /* * LINUX Device Driver Model */ @@ -213,7 +190,6 @@ static const struct pci_error_handlers pcie_portdrv_err_handler = { .error_detected = pcie_portdrv_error_detected, .slot_reset = pcie_portdrv_slot_reset, .mmio_enabled = pcie_portdrv_mmio_enabled, - .resume = pcie_portdrv_err_resume, }; static struct pci_driver pcie_portdriver = { From bb6951b84fb46f4048ead76c524a084e7b132463 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 31 Jul 2021 14:39:03 +0200 Subject: [PATCH 114/512] PCI/portdrv: Remove unused pcie_port_bus_{,un}register() declarations Commit c6c889d932bb ("PCI/portdrv: Remove pcie_port_bus_type link order dependency") removed pcie_port_bus_{,un}register() but erroneously retained their declarations in portdrv.h. Remove them as well. 
Link: https://lore.kernel.org/r/7fd76b0591c37287ab94d911d8fd9ab9a2afcd16.1627638184.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/portdrv.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index e9fe0fef6cde..0ef4bf5f811d 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -118,8 +118,6 @@ int pcie_port_device_runtime_suspend(struct device *dev); int pcie_port_device_runtime_resume(struct device *dev); #endif void pcie_port_device_remove(struct pci_dev *dev); -int __must_check pcie_port_bus_register(void); -void pcie_port_bus_unregister(void); struct pci_dev; From f9a6c8ad4922bf386c366e5ece453d459e628784 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 31 Jul 2021 14:39:04 +0200 Subject: [PATCH 115/512] PCI/ERR: Reduce compile time for CONFIG_PCIEAER=n The sole non-static function in err.c, pcie_do_recovery(), is only called from: * aer.c (if CONFIG_PCIEAER=y) * dpc.c (if CONFIG_PCIE_DPC=y, which depends on CONFIG_PCIEAER) * edr.c (if CONFIG_PCIE_EDR=y, which depends on CONFIG_PCIE_DPC) Thus, err.c need not be compiled if CONFIG_PCIEAER=n. Also, pci_uevent_ers() and pcie_clear_device_status(), which are called from err.c, can be #ifdef'ed away unless CONFIG_PCIEAER=y. Since x86_64_defconfig doesn't enable CONFIG_PCIEAER, this change may slightly reduce compile time for anyone doing a test build with that config. Link: https://lore.kernel.org/r/98f9041151268c1c035ab64cca320ad86803f64a.1627638184.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-driver.c | 2 +- drivers/pci/pci.c | 2 ++ drivers/pci/pcie/Makefile | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 2761ab86490d..b0923bdcdb2e 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1542,7 +1542,7 @@ static int pci_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH) +#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ce2ab62b64cf..e9d95189c79b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2218,6 +2218,7 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) } EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); +#ifdef CONFIG_PCIEAER void pcie_clear_device_status(struct pci_dev *dev) { u16 sta; @@ -2225,6 +2226,7 @@ void pcie_clear_device_status(struct pci_dev *dev) pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); } +#endif /** * pcie_clear_root_pme_status - Clear root port PME interrupt status. 
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index b2980db88cc0..5783a2f79e6a 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -2,12 +2,12 @@ # # Makefile for PCI Express features and port driver -pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o rcec.o +pcieportdrv-y := portdrv_core.o portdrv_pci.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o obj-$(CONFIG_PCIEASPM) += aspm.o -obj-$(CONFIG_PCIEAER) += aer.o +obj-$(CONFIG_PCIEAER) += aer.o err.o obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o From 4e59b75430f0bf515ac0a6b2d61861aefc601776 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 12 Oct 2021 15:51:32 -0500 Subject: [PATCH 116/512] cxl: Factor out common dev->driver expressions Save the struct pci_driver and struct pci_error_handlers pointers from pdev->driver instead of chasing the pointers several times. No functional change intended. Signed-off-by: Bjorn Helgaas --- drivers/misc/cxl/guest.c | 30 +++++++++++++++++------------- drivers/misc/cxl/pci.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 186308f1f8eb..94e29bab6b44 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -20,34 +20,38 @@ static void pci_error_handlers(struct cxl_afu *afu, pci_channel_state_t state) { struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; if (afu->phb == NULL) return; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - if (!afu_dev->driver) + afu_drv = afu_dev->driver; + if (!afu_drv) continue; + err_handler = afu_drv->err_handler; switch (bus_error_event) { case CXL_ERROR_DETECTED_EVENT: afu_dev->error_state = state; - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->error_detected) - afu_dev->driver->err_handler->error_detected(afu_dev, state); - break; + if (err_handler && + err_handler->error_detected) + err_handler->error_detected(afu_dev, state); + break; case CXL_SLOT_RESET_EVENT: afu_dev->error_state = state; - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->slot_reset) - afu_dev->driver->err_handler->slot_reset(afu_dev); - break; + if (err_handler && + err_handler->slot_reset) + err_handler->slot_reset(afu_dev); + break; case CXL_RESUME_EVENT: - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->resume) - afu_dev->driver->err_handler->resume(afu_dev); - break; + if (err_handler && + err_handler->resume) + err_handler->resume(afu_dev); + break; } } } diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 2ba899f5659f..27393b72ea6f 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1795,6 +1795,8 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, pci_channel_state_t state) { struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET; @@ -1805,14 +1807,16 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, return result; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - if (!afu_dev->driver) + afu_drv = afu_dev->driver; + if (!afu_drv) continue; afu_dev->error_state = state; - if (afu_dev->driver->err_handler) - afu_result = afu_dev->driver->err_handler->error_detected(afu_dev, - 
state); + err_handler = afu_drv->err_handler; + if (err_handler) + afu_result = err_handler->error_detected(afu_dev, + state); /* Disconnect trumps all, NONE trumps NEED_RESET */ if (afu_result == PCI_ERS_RESULT_DISCONNECT) result = PCI_ERS_RESULT_DISCONNECT; @@ -1972,6 +1976,8 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) struct cxl_afu *afu; struct cxl_context *ctx; struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; pci_ers_result_t afu_result = PCI_ERS_RESULT_RECOVERED; pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED; int i; @@ -2028,12 +2034,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) * shouldn't start new work until we call * their resume function. */ - if (!afu_dev->driver) + afu_drv = afu_dev->driver; + if (!afu_drv) continue; - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->slot_reset) - afu_result = afu_dev->driver->err_handler->slot_reset(afu_dev); + err_handler = afu_drv->err_handler; + if (err_handler && err_handler->slot_reset) + afu_result = err_handler->slot_reset(afu_dev); if (afu_result == PCI_ERS_RESULT_DISCONNECT) result = PCI_ERS_RESULT_DISCONNECT; @@ -2060,6 +2067,8 @@ static void cxl_pci_resume(struct pci_dev *pdev) struct cxl *adapter = pci_get_drvdata(pdev); struct cxl_afu *afu; struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; int i; /* Everything is back now. Drivers should restart work now. @@ -2074,9 +2083,13 @@ static void cxl_pci_resume(struct pci_dev *pdev) continue; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - if (afu_dev->driver && afu_dev->driver->err_handler && - afu_dev->driver->err_handler->resume) - afu_dev->driver->err_handler->resume(afu_dev); + afu_drv = afu_dev->driver; + if (!afu_drv) + continue; + + err_handler = afu_drv->err_handler; + if (err_handler && err_handler->resume) + err_handler->resume(afu_dev); } } spin_unlock(&adapter->afu_list_lock); From 16bd44e54dfba2a403833d11acea63e186de23c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:03:39 -0500 Subject: [PATCH 117/512] cxl: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pdev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/misc/cxl/guest.c | 2 +- drivers/misc/cxl/pci.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 94e29bab6b44..9d485c9e3fff 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -27,7 +27,7 @@ static void pci_error_handlers(struct cxl_afu *afu, return; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - afu_drv = afu_dev->driver; + afu_drv = to_pci_driver(afu_dev->dev.driver); if (!afu_drv) continue; diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 27393b72ea6f..3de0aea62ade 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1807,7 +1807,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, return result; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - afu_drv = afu_dev->driver; + afu_drv = to_pci_driver(afu_dev->dev.driver); if (!afu_drv) continue; @@ -2034,7 +2034,7 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) * shouldn't start new work until we call * their resume function. */ - afu_drv = afu_dev->driver; + afu_drv = to_pci_driver(afu_dev->dev.driver); if (!afu_drv) continue; @@ -2083,7 +2083,7 @@ static void cxl_pci_resume(struct pci_dev *pdev) continue; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - afu_drv = afu_dev->driver; + afu_drv = to_pci_driver(afu_dev->dev.driver); if (!afu_drv) continue; From 97918f794027a844e7f6e3ae7acdd22ac9055b18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:36:05 -0500 Subject: [PATCH 118/512] usb: xhci: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pdev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. 
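Condensed sketch of the resulting lookup in xhci_pci_quirks() (matching the one-line change below):

        const struct pci_device_id *id;

        id = pci_match_id(to_pci_driver(pdev->dev.driver)->id_table, pdev);
        if (id && id->driver_data)
                driver_data = (struct xhci_driver_data *)id->driver_data;
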
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/usb/host/xhci-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 2c9f25ca8edd..2f4729f4f1e0 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -103,7 +103,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) struct xhci_driver_data *driver_data; const struct pci_device_id *id; - id = pci_match_id(pdev->driver->id_table, pdev); + id = pci_match_id(to_pci_driver(pdev->dev.driver)->id_table, pdev); if (id && id->driver_data) { driver_data = (struct xhci_driver_data *)id->driver_data; From 4141127c44a9b00d941885fc41392697d22a62c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:37:55 -0500 Subject: [PATCH 119/512] powerpc/eeh: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pdev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- arch/powerpc/kernel/eeh_driver.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3eff6a4888e7..350dab18e137 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -104,13 +104,13 @@ static bool eeh_edev_actionable(struct eeh_dev *edev) */ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return NULL; - if (!try_module_get(pdev->driver->driver.owner)) + if (!try_module_get(pdev->dev.driver->owner)) return NULL; - return pdev->driver; + return to_pci_driver(pdev->dev.driver); } /** @@ -122,10 +122,10 @@ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) */ static inline void eeh_pcid_put(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return; - module_put(pdev->driver->driver.owner); + module_put(pdev->dev.driver->owner); } /** From ba51521b11a15caaae3fddd93c1ead2ffe6b557b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:38:32 -0500 Subject: [PATCH 120/512] perf/x86/intel/uncore: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pdev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- arch/x86/events/intel/uncore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c72e368dd164..f1ba6ab2e97e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * PCI slot and func to indicate the uncore box. */ if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; + struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver); pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); if (pmu == NULL) From d98d53331b727838122bc80e1898c4e1fc201fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:05:47 -0500 Subject: [PATCH 121/512] x86/pci/probe_roms: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pdev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- arch/x86/kernel/probe_roms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 9e1def3744f2..36e84d904260 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -80,7 +80,7 @@ static struct resource video_rom_resource = { */ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) { - struct pci_driver *drv = pdev->driver; + struct pci_driver *drv = to_pci_driver(pdev->dev.driver); const struct pci_device_id *id; if (pdev->vendor == vendor && pdev->device == device) From 2a4d9408c9e8b6f6fc150c66f3fef755c9e20d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 16:11:24 -0500 Subject: [PATCH 122/512] PCI: Use to_pci_driver() instead of pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct pci_driver contains a struct device_driver, so for PCI devices, it's easy to convert a device_driver * to a pci_driver * with to_pci_driver(). The device_driver * is in struct device, so we don't need to also keep track of the pci_driver * in struct pci_dev. Replace pci_dev->driver with to_pci_driver(). This is a step toward removing pci_dev->driver. 
[bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20211004125935.2300113-11-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/pci/iov.c | 24 +++++++++++++++--------- drivers/pci/pci-driver.c | 33 +++++++++++++++++---------------- drivers/pci/pci.c | 17 +++++++++-------- drivers/pci/pcie/err.c | 8 ++++---- 4 files changed, 45 insertions(+), 37 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index dafdc652fcd0..fa4b52bb1e05 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -164,13 +164,15 @@ static ssize_t sriov_vf_total_msix_show(struct device *dev, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + struct pci_driver *pdrv; u32 vf_total_msix = 0; device_lock(dev); - if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix) + pdrv = to_pci_driver(dev->driver); + if (!pdrv || !pdrv->sriov_get_vf_total_msix) goto unlock; - vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev); + vf_total_msix = pdrv->sriov_get_vf_total_msix(pdev); unlock: device_unlock(dev); return sysfs_emit(buf, "%u\n", vf_total_msix); @@ -183,6 +185,7 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev, { struct pci_dev *vf_dev = to_pci_dev(dev); struct pci_dev *pdev = pci_physfn(vf_dev); + struct pci_driver *pdrv; int val, ret; ret = kstrtoint(buf, 0, &val); @@ -193,13 +196,14 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev, return -EINVAL; device_lock(&pdev->dev); - if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) { + pdrv = to_pci_driver(dev->driver); + if (!pdrv || !pdrv->sriov_set_msix_vec_count) { ret = -EOPNOTSUPP; goto err_pdev; } device_lock(&vf_dev->dev); - if (vf_dev->driver) { + if (to_pci_driver(vf_dev->dev.driver)) { /* * A driver is already attached to this VF and has configured * itself based on the current MSI-X vector count. 
Changing @@ -209,7 +213,7 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev, goto err_dev; } - ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val); + ret = pdrv->sriov_set_msix_vec_count(vf_dev, val); err_dev: device_unlock(&vf_dev->dev); @@ -376,6 +380,7 @@ static ssize_t sriov_numvfs_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); + struct pci_driver *pdrv; int ret; u16 num_vfs; @@ -392,14 +397,15 @@ static ssize_t sriov_numvfs_store(struct device *dev, goto exit; /* is PF driver loaded */ - if (!pdev->driver) { + pdrv = to_pci_driver(dev->driver); + if (!pdrv) { pci_info(pdev, "no driver bound to device; cannot configure SR-IOV\n"); ret = -ENOENT; goto exit; } /* is PF driver loaded w/callback */ - if (!pdev->driver->sriov_configure) { + if (!pdrv->sriov_configure) { pci_info(pdev, "driver does not support SR-IOV configuration via sysfs\n"); ret = -ENOENT; goto exit; @@ -407,7 +413,7 @@ static ssize_t sriov_numvfs_store(struct device *dev, if (num_vfs == 0) { /* disable VFs */ - ret = pdev->driver->sriov_configure(pdev, 0); + ret = pdrv->sriov_configure(pdev, 0); goto exit; } @@ -419,7 +425,7 @@ static ssize_t sriov_numvfs_store(struct device *dev, goto exit; } - ret = pdev->driver->sriov_configure(pdev, num_vfs); + ret = pdrv->sriov_configure(pdev, num_vfs); if (ret < 0) goto exit; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 50449ec622a3..b919c5361694 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -457,7 +457,7 @@ static int pci_device_probe(struct device *dev) static void pci_device_remove(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); if (drv->remove) { pm_runtime_get_sync(dev); @@ -493,7 +493,7 @@ static void pci_device_remove(struct device *dev) static void pci_device_shutdown(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); pm_runtime_resume(dev); @@ -589,7 +589,7 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev) static int pci_legacy_suspend(struct device *dev, pm_message_t state) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); if (drv && drv->suspend) { pci_power_t prev = pci_dev->current_state; @@ -630,7 +630,7 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) static int pci_legacy_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); pci_fixup_device(pci_fixup_resume, pci_dev); @@ -649,7 +649,7 @@ static void pci_pm_default_suspend(struct pci_dev *pci_dev) static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) { - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver); bool ret = drv && (drv->suspend || drv->resume); /* @@ -1242,11 +1242,11 @@ static int pci_pm_runtime_suspend(struct device *dev) int error; /* - * If pci_dev->driver is not set (unbound), we leave the device in D0, - * but it may go to D3cold when the bridge above it runtime suspends. - * Save its config space in case that happens. + * If the device has no driver, we leave it in D0, but it may go to + * D3cold when the bridge above it runtime suspends. 
Save its + * config space in case that happens. */ - if (!pci_dev->driver) { + if (!to_pci_driver(dev->driver)) { pci_save_state(pci_dev); return 0; } @@ -1303,7 +1303,7 @@ static int pci_pm_runtime_resume(struct device *dev) */ pci_restore_standard_config(pci_dev); - if (!pci_dev->driver) + if (!to_pci_driver(dev->driver)) return 0; pci_fixup_device(pci_fixup_resume_early, pci_dev); @@ -1322,14 +1322,13 @@ static int pci_pm_runtime_resume(struct device *dev) static int pci_pm_runtime_idle(struct device *dev) { - struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* - * If pci_dev->driver is not set (unbound), the device should - * always remain in D0 regardless of the runtime PM status + * If the device has no driver, it should always remain in D0 + * regardless of the runtime PM status */ - if (!pci_dev->driver) + if (!to_pci_driver(dev->driver)) return 0; if (!pm) @@ -1436,8 +1435,10 @@ static struct pci_driver pci_compat_driver = { */ struct pci_driver *pci_dev_driver(const struct pci_dev *dev) { - if (dev->driver) - return dev->driver; + struct pci_driver *drv = to_pci_driver(dev->dev.driver); + + if (drv) + return drv; else { int i; for (i = 0; i <= PCI_ROM_RESOURCE; i++) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ce2ab62b64cf..5298ce131f8c 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5088,13 +5088,14 @@ EXPORT_SYMBOL_GPL(pci_dev_unlock); static void pci_dev_save_and_disable(struct pci_dev *dev) { + struct pci_driver *drv = to_pci_driver(dev->dev.driver); const struct pci_error_handlers *err_handler = - dev->driver ? dev->driver->err_handler : NULL; + drv ? drv->err_handler : NULL; /* - * dev->driver->err_handler->reset_prepare() is protected against - * races with ->remove() by the device lock, which must be held by - * the caller. + * drv->err_handler->reset_prepare() is protected against races + * with ->remove() by the device lock, which must be held by the + * caller. */ if (err_handler && err_handler->reset_prepare) err_handler->reset_prepare(dev); @@ -5119,15 +5120,15 @@ static void pci_dev_save_and_disable(struct pci_dev *dev) static void pci_dev_restore(struct pci_dev *dev) { + struct pci_driver *drv = to_pci_driver(dev->dev.driver); const struct pci_error_handlers *err_handler = - dev->driver ? dev->driver->err_handler : NULL; + drv ? drv->err_handler : NULL; pci_restore_state(dev); /* - * dev->driver->err_handler->reset_done() is protected against - * races with ->remove() by the device lock, which must be held by - * the caller. + * drv->err_handler->reset_done() is protected against races with + * ->remove() by the device lock, which must be held by the caller. 
*/ if (err_handler && err_handler->reset_done) err_handler->reset_done(dev); diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 0c5a143025af..356b9317297e 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -54,7 +54,7 @@ static int report_error_detected(struct pci_dev *dev, const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - pdrv = dev->driver; + pdrv = to_pci_driver(dev->dev.driver); if (!pci_dev_set_io_state(dev, state) || !pdrv || !pdrv->err_handler || @@ -98,7 +98,7 @@ static int report_mmio_enabled(struct pci_dev *dev, void *data) const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - pdrv = dev->driver; + pdrv = to_pci_driver(dev->dev.driver); if (!pdrv || !pdrv->err_handler || !pdrv->err_handler->mmio_enabled) @@ -119,7 +119,7 @@ static int report_slot_reset(struct pci_dev *dev, void *data) const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - pdrv = dev->driver; + pdrv = to_pci_driver(dev->dev.driver); if (!pdrv || !pdrv->err_handler || !pdrv->err_handler->slot_reset) @@ -139,7 +139,7 @@ static int report_resume(struct pci_dev *dev, void *data) const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - pdrv = dev->driver; + pdrv = to_pci_driver(dev->dev.driver); if (!pci_dev_set_io_state(dev, pci_channel_io_normal) || !pdrv || !pdrv->err_handler || From b5f9c644eb1baafcd349ad134e2110773f8d0a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:35 +0200 Subject: [PATCH 123/512] PCI: Remove struct pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no remaining uses of the struct pci_dev->driver pointer, so remove it. Link: https://lore.kernel.org/r/20211004125935.2300113-12-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-driver.c | 4 ---- include/linux/pci.h | 1 - 2 files changed, 5 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index b919c5361694..3da121cb8cbf 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -319,12 +319,10 @@ static long local_pci_probe(void *_ddi) * its remove routine. */ pm_runtime_get_sync(dev); - pci_dev->driver = pci_drv; rc = pci_drv->probe(pci_dev, ddi->id); if (!rc) return rc; if (rc < 0) { - pci_dev->driver = NULL; pm_runtime_put_sync(dev); return rc; } @@ -390,7 +388,6 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, * @pci_dev: PCI device being probed * * returns 0 on success, else error. - * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. */ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) { @@ -465,7 +462,6 @@ static void pci_device_remove(struct device *dev) pm_runtime_put_noidle(dev); } pcibios_free_irq(pci_dev); - pci_dev->driver = NULL; pci_iov_remove(pci_dev); /* Undo the runtime PM settings in local_pci_probe() */ diff --git a/include/linux/pci.h b/include/linux/pci.h index a2d62165a7f7..03bfdb25a55c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -342,7 +342,6 @@ struct pci_dev { u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */ unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */ - struct pci_driver *driver; /* Driver bound to this device */ u64 dma_mask; /* Mask of the bits of bus address this device implements. Normally this is 0xffffffff. 
You only need to change From 88dee3b0efe4b769fb22ce2e963aabae403e6801 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Mon, 18 Oct 2021 20:41:09 +0800 Subject: [PATCH 124/512] PCI: Remove unused pci_pool wrappers The pci_pool users have been converted to dma_pool. Remove the unused pci_pool wrappers. Link: https://lore.kernel.org/r/20211018124110.214-1-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..2ae9dd7f975f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1498,19 +1498,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_ALL_TYPES \ (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX) -/* kmem_cache style wrapper around pci_alloc_consistent() */ - #include -#define pci_pool dma_pool -#define pci_pool_create(name, pdev, size, align, allocation) \ - dma_pool_create(name, &pdev->dev, size, align, allocation) -#define pci_pool_destroy(pool) dma_pool_destroy(pool) -#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) -#define pci_pool_zalloc(pool, flags, handle) \ - dma_pool_zalloc(pool, flags, handle) -#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) - struct msix_entry { u32 vector; /* Kernel uses to write allocated vector */ u16 entry; /* Driver uses to specify entry, OS writes */ From 5e3be666f46b43169e80041657c4b63eaf1ae8de Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 25 Aug 2021 18:26:34 +0800 Subject: [PATCH 125/512] PCI: Document /sys/bus/pci/devices/.../irq Document /sys/bus/pci/devices/.../irq. This file contains the IRQ of the INTx interrupt (or zero if the device doesn't support INTx interrupts). If the device has enabled MSI (not MSI-X), it contains the first MSI IRQ instead. This is a historical mistake because devices may support several MSI or MSI-X vectors, and this file can't contain them all. But we preserve this behavior to avoid breaking userspace. [bhelgaas: commit log] Link: https://lore.kernel.org/r/20210825102636.52757-2-21cnbao@gmail.com Signed-off-by: Barry Song Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-pci | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index d4ae03296861..fef498c34fd9 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -96,6 +96,17 @@ Description: This attribute indicates the mode that the irq vector named by the file is in (msi vs. msix) +What: /sys/bus/pci/devices/.../irq +Date: August 2021 +Contact: Linux PCI developers +Description: + If a driver has enabled MSI (not MSI-X), "irq" contains the + IRQ of the first MSI vector. Otherwise "irq" contains the + IRQ of the legacy INTx interrupt. + + "irq" being set to 0 indicates that the device isn't + capable of generating legacy INTx interrupts. + What: /sys/bus/pci/devices/.../remove Date: January 2009 Contact: Linux PCI developers From ac8e3cef588c5affc6dfa7b693ec64bbf3cead6a Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 25 Aug 2021 18:26:35 +0800 Subject: [PATCH 126/512] PCI/sysfs: Explicitly show first MSI IRQ for 'irq' The sysfs "irq" file contains the legacy INTx IRQ. Or, if the device has MSI enabled, it contains the first MSI IRQ instead. Previously this file showed the pci_dev.irq value directly. 
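For context, that attribute had been generated by the generic pci_config_attr() helper; the resulting show function is roughly equivalent to the following sketch (illustrative, not the literal macro expansion):

  static ssize_t irq_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
  {
          struct pci_dev *pdev = to_pci_dev(dev);

          /* old behaviour: emit whatever pci_dev.irq currently holds */
          return sysfs_emit(buf, "%u\n", pdev->irq);
  }
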
But we'd prefer to use pci_dev.irq only for the INTx IRQ and decouple that from any MSI or MSI-X IRQs. If the device has MSI enabled, explicitly look up and show the first MSI IRQ in the sysfs "irq" file. Otherwise, show the INTx IRQ. This removes the requirement that msi_capability_init() set pci_dev.irq to the first MSI IRQ when enabling MSI and pci_msi_shutdown() restore the INTx IRQ when disabling MSI. [bhelgaas: commit log] Link: https://lore.kernel.org/r/20210825102636.52757-3-21cnbao@gmail.com Signed-off-by: Barry Song Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 7fb5cd17cc98..b50fe7ce234b 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "pci.h" @@ -49,7 +50,28 @@ pci_config_attr(subsystem_vendor, "0x%04x\n"); pci_config_attr(subsystem_device, "0x%04x\n"); pci_config_attr(revision, "0x%02x\n"); pci_config_attr(class, "0x%06x\n"); -pci_config_attr(irq, "%u\n"); + +static ssize_t irq_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + +#ifdef CONFIG_PCI_MSI + /* + * For MSI, show the first MSI IRQ; for all other cases including + * MSI-X, show the legacy INTx IRQ. + */ + if (pdev->msi_enabled) { + struct msi_desc *desc = first_pci_msi_entry(pdev); + + return sysfs_emit(buf, "%u\n", desc->irq); + } +#endif + + return sysfs_emit(buf, "%u\n", pdev->irq); +} +static DEVICE_ATTR_RO(irq); static ssize_t broken_parity_status_show(struct device *dev, struct device_attribute *attr, From e369953a5ba3295379095060f4ac72958da7c125 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 25 Jul 2021 13:51:02 -0700 Subject: [PATCH 127/512] xtensa: move _SimulateUserKernelVectorException out of WindowVectors In configurations without window registers support the section .WindowVectors.text may never be linked. _SimulateUserKernelVectorException is a common handler for high priority interrupts, it does not belong in that section anyway. Move it out of that section and mark it as __XTENSA_HANDLER so it gets bundled with other vector helpers. Signed-off-by: Max Filippov --- arch/xtensa/kernel/vectors.S | 40 +++++++++++++++++------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/xtensa/kernel/vectors.S b/arch/xtensa/kernel/vectors.S index 1a7538ccfc5a..0eed5aa82914 100644 --- a/arch/xtensa/kernel/vectors.S +++ b/arch/xtensa/kernel/vectors.S @@ -650,6 +650,25 @@ ENTRY(_Level\level\()InterruptVector) irq_entry_level 5 irq_entry_level 6 +#if XCHAL_EXCM_LEVEL >= 2 + /* + * Continuation of medium priority interrupt dispatch code. + * On entry here, a0 contains PS, and EPC2 contains saved a0: + */ + __XTENSA_HANDLER + .align 4 +_SimulateUserKernelVectorException: + addi a0, a0, (1 << PS_EXCM_BIT) +#if !XTENSA_FAKE_NMI + wsr a0, ps +#endif + bbsi.l a0, PS_UM_BIT, 1f # branch if user mode + xsr a0, excsave2 # restore a0 + j _KernelExceptionVector # simulate kernel vector exception +1: xsr a0, excsave2 # restore a0 + j _UserExceptionVector # simulate user vector exception +#endif + /* Window overflow and underflow handlers. 
* The handlers must be 64 bytes apart, first starting with the underflow @@ -680,27 +699,6 @@ ENTRY_ALIGN64(_WindowOverflow4) ENDPROC(_WindowOverflow4) - -#if XCHAL_EXCM_LEVEL >= 2 - /* Not a window vector - but a convenient location - * (where we know there's space) for continuation of - * medium priority interrupt dispatch code. - * On entry here, a0 contains PS, and EPC2 contains saved a0: - */ - .align 4 -_SimulateUserKernelVectorException: - addi a0, a0, (1 << PS_EXCM_BIT) -#if !XTENSA_FAKE_NMI - wsr a0, ps -#endif - bbsi.l a0, PS_UM_BIT, 1f # branch if user mode - xsr a0, excsave2 # restore a0 - j _KernelExceptionVector # simulate kernel vector exception -1: xsr a0, excsave2 # restore a0 - j _UserExceptionVector # simulate user vector exception -#endif - - /* 4-Register Window Underflow Vector (Handler) */ ENTRY_ALIGN64(_WindowUnderflow4) From eda8dd1224d6c1c89eb6b687264da9ccfbffb0fd Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 23 Jul 2021 23:17:04 -0700 Subject: [PATCH 128/512] xtensa: use a14 instead of a15 in inline assembly a15 is a frame pointer in the call0 xtensa ABI, don't use it explicitly in the inline assembly. Use a14 instead, as it has the same properties as a15 w.r.t. window overflow. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/atomic.h | 26 +++++++++++++------------- arch/xtensa/include/asm/cmpxchg.h | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 4361fe4247e3..52da614f953c 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -25,15 +25,15 @@ * * Locking interrupts looks like this: * - * rsil a15, TOPLEVEL + * rsil a14, TOPLEVEL * - * wsr a15, PS + * wsr a14, PS * rsync * - * Note that a15 is used here because the register allocation + * Note that a14 is used here because the register allocation * done by the compiler is not guaranteed and a window overflow * may not occur between the rsil and wsr instructions. By using - * a15 in the rsil, the machine is guaranteed to be in a state + * a14 in the rsil, the machine is guaranteed to be in a state * where no register reference will cause an overflow. 
*/ @@ -185,15 +185,15 @@ static inline void arch_atomic_##op(int i, atomic_t * v) \ unsigned int vval; \ \ __asm__ __volatile__( \ - " rsil a15, "__stringify(TOPLEVEL)"\n" \ + " rsil a14, "__stringify(TOPLEVEL)"\n" \ " l32i %[result], %[mem]\n" \ " " #op " %[result], %[result], %[i]\n" \ " s32i %[result], %[mem]\n" \ - " wsr a15, ps\n" \ + " wsr a14, ps\n" \ " rsync\n" \ : [result] "=&a" (vval), [mem] "+m" (*v) \ : [i] "a" (i) \ - : "a15", "memory" \ + : "a14", "memory" \ ); \ } \ @@ -203,15 +203,15 @@ static inline int arch_atomic_##op##_return(int i, atomic_t * v) \ unsigned int vval; \ \ __asm__ __volatile__( \ - " rsil a15,"__stringify(TOPLEVEL)"\n" \ + " rsil a14,"__stringify(TOPLEVEL)"\n" \ " l32i %[result], %[mem]\n" \ " " #op " %[result], %[result], %[i]\n" \ " s32i %[result], %[mem]\n" \ - " wsr a15, ps\n" \ + " wsr a14, ps\n" \ " rsync\n" \ : [result] "=&a" (vval), [mem] "+m" (*v) \ : [i] "a" (i) \ - : "a15", "memory" \ + : "a14", "memory" \ ); \ \ return vval; \ @@ -223,16 +223,16 @@ static inline int arch_atomic_fetch_##op(int i, atomic_t * v) \ unsigned int tmp, vval; \ \ __asm__ __volatile__( \ - " rsil a15,"__stringify(TOPLEVEL)"\n" \ + " rsil a14,"__stringify(TOPLEVEL)"\n" \ " l32i %[result], %[mem]\n" \ " " #op " %[tmp], %[result], %[i]\n" \ " s32i %[tmp], %[mem]\n" \ - " wsr a15, ps\n" \ + " wsr a14, ps\n" \ " rsync\n" \ : [result] "=&a" (vval), [tmp] "=&a" (tmp), \ [mem] "+m" (*v) \ : [i] "a" (i) \ - : "a15", "memory" \ + : "a14", "memory" \ ); \ \ return vval; \ diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h index 3699e2818efb..eb87810357ad 100644 --- a/arch/xtensa/include/asm/cmpxchg.h +++ b/arch/xtensa/include/asm/cmpxchg.h @@ -52,16 +52,16 @@ __cmpxchg_u32(volatile int *p, int old, int new) return new; #else __asm__ __volatile__( - " rsil a15, "__stringify(TOPLEVEL)"\n" + " rsil a14, "__stringify(TOPLEVEL)"\n" " l32i %[old], %[mem]\n" " bne %[old], %[cmp], 1f\n" " s32i %[new], %[mem]\n" "1:\n" - " wsr a15, ps\n" + " wsr a14, ps\n" " rsync\n" : [old] "=&a" (old), [mem] "+m" (*p) : [cmp] "a" (old), [new] "r" (new) - : "a15", "memory"); + : "a14", "memory"); return old; #endif } @@ -116,10 +116,10 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, /* * xchg_u32 * - * Note that a15 is used here because the register allocation + * Note that a14 is used here because the register allocation * done by the compiler is not guaranteed and a window overflow * may not occur between the rsil and wsr instructions. By using - * a15 in the rsil, the machine is guaranteed to be in a state + * a14 in the rsil, the machine is guaranteed to be in a state * where no register reference will cause an overflow. */ @@ -157,14 +157,14 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val) #else unsigned long tmp; __asm__ __volatile__( - " rsil a15, "__stringify(TOPLEVEL)"\n" + " rsil a14, "__stringify(TOPLEVEL)"\n" " l32i %[tmp], %[mem]\n" " s32i %[val], %[mem]\n" - " wsr a15, ps\n" + " wsr a14, ps\n" " rsync\n" : [tmp] "=&a" (tmp), [mem] "+m" (*m) : [val] "a" (val) - : "a15", "memory"); + : "a14", "memory"); return tmp; #endif } From d191323bc02370667fede74228135440efd98d2b Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 25 Jul 2021 16:31:34 -0700 Subject: [PATCH 129/512] xtensa: don't use a12 in strncpy_user a12 is callee-saved register in xtensa call0 ABI, so a function must not change it. a10 is not used in this function at all, use it instead of a12 to avoid saving/restoring it. 
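(Illustrative aside, not part of this patch: in C code the compiler does this bookkeeping automatically when a register appears in an inline-asm clobber list, as in the hypothetical helper below.)

  static inline void set_flag(volatile unsigned long *p)
  {
          /* the "a12" clobber makes the compiler preserve the caller's a12 */
          __asm__ __volatile__(
          "       movi    a12, 1\n"
          "       s32i    a12, %[mem]\n"
          : [mem] "+m" (*p)
          :
          : "a12", "memory");
  }

A bare assembly routine such as __strncpy_user gets no such help: it must either save and restore a callee-saved register (a12..a15 in the call0 ABI) itself or, as done here, use a free caller-saved register instead.
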
Signed-off-by: Max Filippov --- arch/xtensa/lib/strncpy_user.S | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/xtensa/lib/strncpy_user.S b/arch/xtensa/lib/strncpy_user.S index 4faf46fe3f38..0731912227d3 100644 --- a/arch/xtensa/lib/strncpy_user.S +++ b/arch/xtensa/lib/strncpy_user.S @@ -45,7 +45,6 @@ # a9/ tmp # a10/ tmp # a11/ dst -# a12/ tmp .text ENTRY(__strncpy_user) @@ -61,7 +60,7 @@ ENTRY(__strncpy_user) bbsi.l a3, 0, .Lsrc1mod2 # if only 8-bit aligned bbsi.l a3, 1, .Lsrc2mod4 # if only 16-bit aligned .Lsrcaligned: # return here when src is word-aligned - srli a12, a4, 2 # number of loop iterations with 4B per loop + srli a10, a4, 2 # number of loop iterations with 4B per loop movi a9, 3 bnone a11, a9, .Laligned j .Ldstunaligned @@ -102,11 +101,11 @@ EX(10f) s8i a9, a11, 0 # store byte 0 .byte 0 # (0 mod 4 alignment for LBEG) .Laligned: #if XCHAL_HAVE_LOOPS - loopnez a12, .Loop1done + loopnez a10, .Loop1done #else - beqz a12, .Loop1done - slli a12, a12, 2 - add a12, a12, a11 # a12 = end of last 4B chunck + beqz a10, .Loop1done + slli a10, a10, 2 + add a10, a10, a11 # a10 = end of last 4B chunck #endif .Loop1: EX(11f) l32i a9, a3, 0 # get word from src @@ -118,7 +117,7 @@ EX(10f) s32i a9, a11, 0 # store word to dst bnone a9, a8, .Lz3 # if byte 3 is zero addi a11, a11, 4 # advance dst pointer #if !XCHAL_HAVE_LOOPS - blt a11, a12, .Loop1 + blt a11, a10, .Loop1 #endif .Loop1done: @@ -185,7 +184,7 @@ EX(10f) s8i a9, a11, 2 loopnez a4, .Lunalignedend #else beqz a4, .Lunalignedend - add a12, a11, a4 # a12 = ending address + add a10, a11, a4 # a10 = ending address #endif /* XCHAL_HAVE_LOOPS */ .Lnextbyte: EX(11f) l8ui a9, a3, 0 @@ -194,7 +193,7 @@ EX(10f) s8i a9, a11, 0 beqz a9, .Lunalignedend addi a11, a11, 1 #if !XCHAL_HAVE_LOOPS - blt a11, a12, .Lnextbyte + blt a11, a10, .Lnextbyte #endif .Lunalignedend: From 61a6b91283b4c9e308fcf9377d3f0598ceccb252 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 25 Jul 2021 16:31:34 -0700 Subject: [PATCH 130/512] xtensa: don't use a12 in __xtensa_copy_user in call0 ABI a12 is callee-saved register in xtensa call0 ABI, so a function must not change it. The main unaligned copy loop of __xtensa_copy_user uses all low-numbered registers, so a register must be spilled to avoid using a12 as a loop counter. Signed-off-by: Max Filippov --- arch/xtensa/lib/usercopy.S | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S index a0aa4047f94a..16128c094c62 100644 --- a/arch/xtensa/lib/usercopy.S +++ b/arch/xtensa/lib/usercopy.S @@ -60,7 +60,12 @@ .text ENTRY(__xtensa_copy_user) - abi_entry_default +#if !XCHAL_HAVE_LOOPS && defined(__XTENSA_CALL0_ABI__) +#define STACK_SIZE 4 +#else +#define STACK_SIZE 0 +#endif + abi_entry(STACK_SIZE) # a2/ dst, a3/ src, a4/ len mov a5, a2 # copy dst so that a2 is return value mov a11, a4 # preserve original len for error case @@ -75,7 +80,7 @@ ENTRY(__xtensa_copy_user) __ssa8 a3 # set shift amount from byte offset bnez a4, .Lsrcunaligned movi a2, 0 # return success for len==0 - abi_ret_default + abi_ret(STACK_SIZE) /* * Destination is unaligned @@ -127,7 +132,7 @@ EX(10f) s8i a6, a5, 0 #endif /* !XCHAL_HAVE_LOOPS */ .Lbytecopydone: movi a2, 0 # return success for len bytes copied - abi_ret_default + abi_ret(STACK_SIZE) /* * Destination and source are word-aligned. 
@@ -187,7 +192,7 @@ EX(10f) l8ui a6, a3, 0 EX(10f) s8i a6, a5, 0 .L5: movi a2, 0 # return success for len bytes copied - abi_ret_default + abi_ret(STACK_SIZE) /* * Destination is aligned, Source is unaligned @@ -205,8 +210,14 @@ EX(10f) l32i a6, a3, 0 # load first word loopnez a7, .Loop2done #else /* !XCHAL_HAVE_LOOPS */ beqz a7, .Loop2done +#if defined(__XTENSA_CALL0_ABI__) + s32i a10, a1, 0 + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#else slli a12, a7, 4 add a12, a12, a3 # a12 = end of last 16B source chunk +#endif #endif /* !XCHAL_HAVE_LOOPS */ .Loop2: EX(10f) l32i a7, a3, 4 @@ -224,7 +235,12 @@ EX(10f) s32i a8, a5, 8 EX(10f) s32i a9, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS +#if defined(__XTENSA_CALL0_ABI__) + blt a3, a10, .Loop2 + l32i a10, a1, 0 +#else blt a3, a12, .Loop2 +#endif #endif /* !XCHAL_HAVE_LOOPS */ .Loop2done: bbci.l a4, 3, .L12 @@ -264,7 +280,7 @@ EX(10f) l8ui a6, a3, 0 EX(10f) s8i a6, a5, 0 .L15: movi a2, 0 # return success for len bytes copied - abi_ret_default + abi_ret(STACK_SIZE) ENDPROC(__xtensa_copy_user) @@ -281,4 +297,4 @@ ENDPROC(__xtensa_copy_user) 10: sub a2, a5, a2 /* a2 <-- bytes copied */ sub a2, a11, a2 /* a2 <-- bytes not copied */ - abi_ret_default + abi_ret(STACK_SIZE) From 5cce39b6aaa02da77e071b2b0880bedfb903330f Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 1 May 2021 15:30:06 -0700 Subject: [PATCH 131/512] xtensa: definitions for call0 ABI Add assembly macros for calls, call arguments, preserved registers, function entry and return for windowed and call0 ABIs. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/asmmacro.h | 65 +++++++++++++++++++++++++++++ arch/xtensa/include/asm/core.h | 11 +++++ arch/xtensa/include/asm/processor.h | 32 +++++++++++--- 3 files changed, 102 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/include/asm/asmmacro.h b/arch/xtensa/include/asm/asmmacro.h index bfc89e11f469..809c507d1825 100644 --- a/arch/xtensa/include/asm/asmmacro.h +++ b/arch/xtensa/include/asm/asmmacro.h @@ -194,6 +194,12 @@ #define XTENSA_STACK_ALIGNMENT 16 #if defined(__XTENSA_WINDOWED_ABI__) + +/* Assembly instructions for windowed kernel ABI. */ +#define KABI_W +/* Assembly instructions for call0 kernel ABI (will be ignored). */ +#define KABI_C0 # + #define XTENSA_FRAME_SIZE_RESERVE 16 #define XTENSA_SPILL_STACK_RESERVE 32 @@ -206,8 +212,34 @@ #define abi_ret(frame_size) retw #define abi_ret_default retw + /* direct call */ +#define abi_call call4 + /* indirect call */ +#define abi_callx callx4 + /* outgoing call argument registers */ +#define abi_arg0 a6 +#define abi_arg1 a7 +#define abi_arg2 a8 +#define abi_arg3 a9 +#define abi_arg4 a10 +#define abi_arg5 a11 + /* return value */ +#define abi_rv a6 + /* registers preserved across call */ +#define abi_saved0 a2 +#define abi_saved1 a3 + + /* none of the above */ +#define abi_tmp0 a4 +#define abi_tmp1 a5 + #elif defined(__XTENSA_CALL0_ABI__) +/* Assembly instructions for windowed kernel ABI (will be ignored). */ +#define KABI_W # +/* Assembly instructions for call0 kernel ABI. 
*/ +#define KABI_C0 + #define XTENSA_SPILL_STACK_RESERVE 0 #define abi_entry(frame_size) __abi_entry (frame_size) @@ -233,10 +265,43 @@ #define abi_ret_default ret + /* direct call */ +#define abi_call call0 + /* indirect call */ +#define abi_callx callx0 + /* outgoing call argument registers */ +#define abi_arg0 a2 +#define abi_arg1 a3 +#define abi_arg2 a4 +#define abi_arg3 a5 +#define abi_arg4 a6 +#define abi_arg5 a7 + /* return value */ +#define abi_rv a2 + /* registers preserved across call */ +#define abi_saved0 a12 +#define abi_saved1 a13 + + /* none of the above */ +#define abi_tmp0 a8 +#define abi_tmp1 a9 + #else #error Unsupported Xtensa ABI #endif +#if defined(USER_SUPPORT_WINDOWED) +/* Assembly instructions for windowed user ABI. */ +#define UABI_W +/* Assembly instructions for call0 user ABI (will be ignored). */ +#define UABI_C0 # +#else +/* Assembly instructions for windowed user ABI (will be ignored). */ +#define UABI_W # +/* Assembly instructions for call0 user ABI. */ +#define UABI_C0 +#endif + #define __XTENSA_HANDLER .section ".exception.text", "ax" #endif /* _XTENSA_ASMMACRO_H */ diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h index 5590b0f68837..9138077e567d 100644 --- a/arch/xtensa/include/asm/core.h +++ b/arch/xtensa/include/asm/core.h @@ -26,4 +26,15 @@ #define XCHAL_SPANNING_WAY 0 #endif +#if XCHAL_HAVE_WINDOWED +#if defined(CONFIG_USER_ABI_DEFAULT) || defined(CONFIG_USER_ABI_CALL0_PROBE) +/* Whether windowed ABI is supported in userspace. */ +#define USER_SUPPORT_WINDOWED +#endif +#if defined(__XTENSA_WINDOWED_ABI__) || defined(USER_SUPPORT_WINDOWED) +/* Whether windowed ABI is supported either in userspace or in the kernel. */ +#define SUPPORT_WINDOWED +#endif +#endif + #endif diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 7f63aca6a0d3..aeec2916030c 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -18,12 +18,6 @@ #include #include -/* Assertions. */ - -#if (XCHAL_HAVE_WINDOWED != 1) -# error Linux requires the Xtensa Windowed Registers Option. -#endif - /* Xtensa ABI requires stack alignment to be at least 16 */ #define STACK_ALIGN (XCHAL_DATA_WIDTH > 16 ? XCHAL_DATA_WIDTH : 16) @@ -105,8 +99,18 @@ #define WSBITS (XCHAL_NUM_AREGS / 4) /* width of WINDOWSTART in bits */ #define WBBITS (XCHAL_NUM_AREGS_LOG2 - 2) /* width of WINDOWBASE in bits */ +#if defined(__XTENSA_WINDOWED_ABI__) +#define KERNEL_PS_WOE_MASK PS_WOE_MASK +#elif defined(__XTENSA_CALL0_ABI__) +#define KERNEL_PS_WOE_MASK 0 +#else +#error Unsupported xtensa ABI +#endif + #ifndef __ASSEMBLY__ +#if defined(__XTENSA_WINDOWED_ABI__) + /* Build a valid return address for the specified call winsize. * winsize must be 1 (call4), 2 (call8), or 3 (call12) */ @@ -117,6 +121,22 @@ */ #define MAKE_PC_FROM_RA(ra,sp) (((ra) & 0x3fffffff) | ((sp) & 0xc0000000)) +#elif defined(__XTENSA_CALL0_ABI__) + +/* Build a valid return address for the specified call winsize. + * winsize must be 1 (call4), 2 (call8), or 3 (call12) + */ +#define MAKE_RA_FOR_CALL(ra, ws) (ra) + +/* Convert return address to a valid pc + * Note: We assume that the stack pointer is in the same 1GB ranges as the ra + */ +#define MAKE_PC_FROM_RA(ra, sp) (ra) + +#else +#error Unsupported Xtensa ABI +#endif + /* Spill slot location for the register reg in the spill area under the stack * pointer sp. reg must be in the range [0..4). 
*/ From 0b5372570b1f3fcb35255d28e707846e613c27f2 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 1 May 2021 15:32:58 -0700 Subject: [PATCH 132/512] xtensa: implement call0 ABI support in assembly Replace hardcoded register and opcode names with ABI-agnostic macros. Add register save/restore code where necessary. Conditionalize windowed only or call0 only code. Add stack initialization matching _switch_to epilogue to copy_thread. Signed-off-by: Max Filippov --- arch/xtensa/boot/boot-redboot/bootstrap.S | 68 +++---- arch/xtensa/kernel/entry.S | 209 ++++++++++++++-------- arch/xtensa/kernel/head.S | 22 +-- arch/xtensa/kernel/mcount.S | 38 +++- arch/xtensa/kernel/process.c | 27 ++- 5 files changed, 243 insertions(+), 121 deletions(-) diff --git a/arch/xtensa/boot/boot-redboot/bootstrap.S b/arch/xtensa/boot/boot-redboot/bootstrap.S index 48ba5a232d94..51e8f3b88e82 100644 --- a/arch/xtensa/boot/boot-redboot/bootstrap.S +++ b/arch/xtensa/boot/boot-redboot/bootstrap.S @@ -3,6 +3,7 @@ #include #include #include +#include /* * RB-Data: RedBoot data/bss * P: Boot-Parameters @@ -36,7 +37,7 @@ .globl __start /* this must be the first byte of the loader! */ __start: - entry sp, 32 # we do not intend to return + abi_entry(32) # we do not intend to return _call0 _start __start_a0: .align 4 @@ -62,10 +63,12 @@ _start: wsr a4, windowstart rsync - movi a4, 0x00040000 + movi a4, KERNEL_PS_WOE_MASK wsr a4, ps rsync +KABI_C0 mov abi_saved0, abi_arg0 + /* copy the loader to its address * Note: The loader itself is a very small piece, so we assume we * don't partially overlap. We also assume (even more important) @@ -168,52 +171,52 @@ _reloc: movi a3, __image_load sub a4, a3, a4 - add a8, a0, a4 + add abi_arg2, a0, a4 # a1 Stack # a8(a4) Load address of the image - movi a6, _image_start - movi a10, _image_end - movi a7, 0x1000000 - sub a11, a10, a6 - movi a9, complen - s32i a11, a9, 0 + movi abi_arg0, _image_start + movi abi_arg4, _image_end + movi abi_arg1, 0x1000000 + sub abi_tmp0, abi_arg4, abi_arg0 + movi abi_arg3, complen + s32i abi_tmp0, abi_arg3, 0 movi a0, 0 - # a6 destination - # a7 maximum size of destination - # a8 source - # a9 ptr to length + # abi_arg0 destination + # abi_arg1 maximum size of destination + # abi_arg2 source + # abi_arg3 ptr to length .extern gunzip - movi a4, gunzip - beqz a4, 1f + movi abi_tmp0, gunzip + beqz abi_tmp0, 1f - callx4 a4 + abi_callx abi_tmp0 j 2f - # a6 destination start - # a7 maximum size of destination - # a8 source start - # a9 ptr to length - # a10 destination end + # abi_arg0 destination start + # abi_arg1 maximum size of destination + # abi_arg2 source start + # abi_arg3 ptr to length + # abi_arg4 destination end 1: - l32i a9, a8, 0 - l32i a11, a8, 4 - s32i a9, a6, 0 - s32i a11, a6, 4 - l32i a9, a8, 8 - l32i a11, a8, 12 - s32i a9, a6, 8 - s32i a11, a6, 12 - addi a6, a6, 16 - addi a8, a8, 16 - blt a6, a10, 1b + l32i abi_tmp0, abi_arg2, 0 + l32i abi_tmp1, abi_arg2, 4 + s32i abi_tmp0, abi_arg0, 0 + s32i abi_tmp1, abi_arg0, 4 + l32i abi_tmp0, abi_arg2, 8 + l32i abi_tmp1, abi_arg2, 12 + s32i abi_tmp0, abi_arg0, 8 + s32i abi_tmp1, abi_arg0, 12 + addi abi_arg0, abi_arg0, 16 + addi abi_arg2, abi_arg2, 16 + blt abi_arg0, abi_arg4, 1b /* jump to the kernel */ @@ -230,6 +233,7 @@ _reloc: # a2 Boot parameter list +KABI_C0 mov abi_arg0, abi_saved0 movi a0, _image_start jx a0 diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 647b162f959b..a144b467c3fd 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -158,6 +158,7 @@ 
_user_exception: /* Rotate ws so that the current windowbase is at bit0. */ /* Assume ws = xxwww1yyyy. Rotate ws right, so that a2 = yyyyxxwww1 */ +#if defined(USER_SUPPORT_WINDOWED) rsr a2, windowbase rsr a3, windowstart ssr a2 @@ -167,24 +168,33 @@ _user_exception: src a2, a3, a2 srli a2, a2, 32-WSBITS s32i a2, a1, PT_WMASK # needed for restoring registers +#else + movi a2, 0 + movi a3, 1 + s32i a2, a1, PT_WINDOWBASE + s32i a3, a1, PT_WINDOWSTART + s32i a3, a1, PT_WMASK +#endif /* Save only live registers. */ - _bbsi.l a2, 1, 1f +UABI_W _bbsi.l a2, 1, 1f s32i a4, a1, PT_AREG4 s32i a5, a1, PT_AREG5 s32i a6, a1, PT_AREG6 s32i a7, a1, PT_AREG7 - _bbsi.l a2, 2, 1f +UABI_W _bbsi.l a2, 2, 1f s32i a8, a1, PT_AREG8 s32i a9, a1, PT_AREG9 s32i a10, a1, PT_AREG10 s32i a11, a1, PT_AREG11 - _bbsi.l a2, 3, 1f +UABI_W _bbsi.l a2, 3, 1f s32i a12, a1, PT_AREG12 s32i a13, a1, PT_AREG13 s32i a14, a1, PT_AREG14 s32i a15, a1, PT_AREG15 + +#if defined(USER_SUPPORT_WINDOWED) _bnei a2, 1, 1f # only one valid frame? /* Only one valid frame, skip saving regs. */ @@ -239,7 +249,7 @@ _user_exception: rsync /* We are back to the original stack pointer (a1) */ - +#endif 2: /* Now, jump to the common exception handler. */ j common_exception @@ -295,6 +305,7 @@ _kernel_exception: s32i a3, a1, PT_SAR s32i a2, a1, PT_ICOUNTLEVEL +#if defined(__XTENSA_WINDOWED_ABI__) /* Rotate ws so that the current windowbase is at bit0. */ /* Assume ws = xxwww1yyyy. Rotate ws right, so that a2 = yyyyxxwww1 */ @@ -305,27 +316,28 @@ _kernel_exception: src a2, a3, a2 srli a2, a2, 32-WSBITS s32i a2, a1, PT_WMASK # needed for kernel_exception_exit +#endif /* Save only the live window-frame */ - _bbsi.l a2, 1, 1f +KABI_W _bbsi.l a2, 1, 1f s32i a4, a1, PT_AREG4 s32i a5, a1, PT_AREG5 s32i a6, a1, PT_AREG6 s32i a7, a1, PT_AREG7 - _bbsi.l a2, 2, 1f +KABI_W _bbsi.l a2, 2, 1f s32i a8, a1, PT_AREG8 s32i a9, a1, PT_AREG9 s32i a10, a1, PT_AREG10 s32i a11, a1, PT_AREG11 - _bbsi.l a2, 3, 1f +KABI_W _bbsi.l a2, 3, 1f s32i a12, a1, PT_AREG12 s32i a13, a1, PT_AREG13 s32i a14, a1, PT_AREG14 s32i a15, a1, PT_AREG15 +#ifdef __XTENSA_WINDOWED_ABI__ _bnei a2, 1, 1f - /* Copy spill slots of a0 and a1 to imitate movsp * in order to keep exception stack continuous */ @@ -333,6 +345,7 @@ _kernel_exception: l32i a0, a1, PT_SIZE + 4 s32e a3, a1, -16 s32e a0, a1, -12 +#endif 1: l32i a0, a1, PT_AREG0 # restore saved a0 wsr a0, depc @@ -419,16 +432,16 @@ common_exception: movi a3, LOCKLEVEL .Lexception: - movi a0, PS_WOE_MASK - or a3, a3, a0 +KABI_W movi a0, PS_WOE_MASK +KABI_W or a3, a3, a0 #else addi a2, a2, -EXCCAUSE_LEVEL1_INTERRUPT movi a0, LOCKLEVEL extui a3, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH # a3 = PS.INTLEVEL moveqz a3, a0, a2 # a3 = LOCKLEVEL iff interrupt - movi a2, PS_WOE_MASK - or a3, a3, a2 +KABI_W movi a2, PS_WOE_MASK +KABI_W or a3, a3, a2 rsr a2, exccause #endif @@ -461,14 +474,14 @@ common_exception: */ rsr a4, excsave1 - mov a6, a1 # pass stack frame - mov a7, a2 # pass EXCCAUSE addx4 a4, a2, a4 l32i a4, a4, EXC_TABLE_DEFAULT # load handler + mov abi_arg1, a2 # pass EXCCAUSE + mov abi_arg0, a1 # pass stack frame /* Call the second-level handler */ - callx4 a4 + abi_callx a4 /* Jump here for exception exit */ .global common_exception_return @@ -482,15 +495,15 @@ common_exception_return: 1: irq_save a2, a3 #ifdef CONFIG_TRACE_IRQFLAGS - call4 trace_hardirqs_off + abi_call trace_hardirqs_off #endif /* Jump if we are returning from kernel exceptions. 
*/ - l32i a3, a1, PT_PS + l32i abi_saved1, a1, PT_PS GET_THREAD_INFO(a2, a1) l32i a4, a2, TI_FLAGS - _bbci.l a3, PS_UM_BIT, 6f + _bbci.l abi_saved1, PS_UM_BIT, 6f /* Specific to a user exception exit: * We need to check some flags for signal handling and rescheduling, @@ -509,20 +522,20 @@ common_exception_return: /* Call do_signal() */ #ifdef CONFIG_TRACE_IRQFLAGS - call4 trace_hardirqs_on + abi_call trace_hardirqs_on #endif rsil a2, 0 - mov a6, a1 - call4 do_notify_resume # int do_notify_resume(struct pt_regs*) + mov abi_arg0, a1 + abi_call do_notify_resume # int do_notify_resume(struct pt_regs*) j 1b 3: /* Reschedule */ #ifdef CONFIG_TRACE_IRQFLAGS - call4 trace_hardirqs_on + abi_call trace_hardirqs_on #endif rsil a2, 0 - call4 schedule # void schedule (void) + abi_call schedule # void schedule (void) j 1b #ifdef CONFIG_PREEMPTION @@ -533,33 +546,33 @@ common_exception_return: l32i a4, a2, TI_PRE_COUNT bnez a4, 4f - call4 preempt_schedule_irq + abi_call preempt_schedule_irq j 4f #endif #if XTENSA_FAKE_NMI .LNMIexit: - l32i a3, a1, PT_PS - _bbci.l a3, PS_UM_BIT, 4f + l32i abi_saved1, a1, PT_PS + _bbci.l abi_saved1, PS_UM_BIT, 4f #endif 5: #ifdef CONFIG_HAVE_HW_BREAKPOINT _bbci.l a4, TIF_DB_DISABLED, 7f - call4 restore_dbreak + abi_call restore_dbreak 7: #endif #ifdef CONFIG_DEBUG_TLB_SANITY l32i a4, a1, PT_DEPC bgeui a4, VALID_DOUBLE_EXCEPTION_ADDRESS, 4f - call4 check_tlb_sanity + abi_call check_tlb_sanity #endif 6: 4: #ifdef CONFIG_TRACE_IRQFLAGS - extui a4, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH + extui a4, abi_saved1, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH bgei a4, LOCKLEVEL, 1f - call4 trace_hardirqs_on + abi_call trace_hardirqs_on 1: #endif /* Restore optional registers. */ @@ -572,14 +585,15 @@ common_exception_return: l32i a2, a1, PT_SCOMPARE1 wsr a2, scompare1 #endif - wsr a3, ps /* disable interrupts */ + wsr abi_saved1, ps /* disable interrupts */ - _bbci.l a3, PS_UM_BIT, kernel_exception_exit + _bbci.l abi_saved1, PS_UM_BIT, kernel_exception_exit user_exception_exit: /* Restore the state of the task and return from the exception. */ +#if defined(USER_SUPPORT_WINDOWED) /* Switch to the user thread WINDOWBASE. Save SP temporarily in DEPC */ l32i a2, a1, PT_WINDOWBASE @@ -634,8 +648,10 @@ user_exception_exit: * frame where we had loaded a2), or at least the lower 4 bits * (if we have restored WSBITS-1 frames). */ - 2: +#else + movi a2, 1 +#endif #if XCHAL_HAVE_THREADPTR l32i a3, a1, PT_THREADPTR wur a3, threadptr @@ -650,6 +666,7 @@ user_exception_exit: kernel_exception_exit: +#if defined(__XTENSA_WINDOWED_ABI__) /* Check if we have to do a movsp. 
* * We only have to do a movsp if the previous window-frame has @@ -702,6 +719,9 @@ kernel_exception_exit: * * Note: We expect a2 to hold PT_WMASK */ +#else + movi a2, 1 +#endif common_exception_exit: @@ -927,7 +947,7 @@ ENTRY(unrecoverable_exception) wsr a1, windowbase rsync - movi a1, PS_WOE_MASK | LOCKLEVEL + movi a1, KERNEL_PS_WOE_MASK | LOCKLEVEL wsr a1, ps rsync @@ -935,8 +955,8 @@ ENTRY(unrecoverable_exception) movi a0, 0 addi a1, a1, PT_REGS_OFFSET - movi a6, unrecoverable_text - call4 panic + movi abi_arg0, unrecoverable_text + abi_call panic 1: j 1b @@ -1403,12 +1423,12 @@ ENTRY(fast_syscall_spill_registers) rsr a3, excsave1 l32i a1, a3, EXC_TABLE_KSTK - movi a4, PS_WOE_MASK | LOCKLEVEL + movi a4, KERNEL_PS_WOE_MASK | LOCKLEVEL wsr a4, ps rsync - movi a6, SIGSEGV - call4 do_exit + movi abi_arg0, SIGSEGV + abi_call do_exit /* shouldn't return, so panic */ @@ -1887,57 +1907,77 @@ ENDPROC(fast_store_prohibited) ENTRY(system_call) +#if defined(__XTENSA_WINDOWED_ABI__) abi_entry_default +#elif defined(__XTENSA_CALL0_ABI__) + abi_entry(12) + + s32i a0, sp, 0 + s32i abi_saved0, sp, 4 + s32i abi_saved1, sp, 8 + mov abi_saved0, a2 +#else +#error Unsupported Xtensa ABI +#endif /* regs->syscall = regs->areg[2] */ - l32i a7, a2, PT_AREG2 - s32i a7, a2, PT_SYSCALL + l32i a7, abi_saved0, PT_AREG2 + s32i a7, abi_saved0, PT_SYSCALL GET_THREAD_INFO(a4, a1) - l32i a3, a4, TI_FLAGS + l32i abi_saved1, a4, TI_FLAGS movi a4, _TIF_WORK_MASK - and a3, a3, a4 - beqz a3, 1f + and abi_saved1, abi_saved1, a4 + beqz abi_saved1, 1f - mov a6, a2 - call4 do_syscall_trace_enter - beqz a6, .Lsyscall_exit - l32i a7, a2, PT_SYSCALL + mov abi_arg0, abi_saved0 + abi_call do_syscall_trace_enter + beqz abi_rv, .Lsyscall_exit + l32i a7, abi_saved0, PT_SYSCALL 1: /* syscall = sys_call_table[syscall_nr] */ movi a4, sys_call_table movi a5, __NR_syscalls - movi a6, -ENOSYS + movi abi_rv, -ENOSYS bgeu a7, a5, 1f addx4 a4, a7, a4 - l32i a4, a4, 0 + l32i abi_tmp0, a4, 0 /* Load args: arg0 - arg5 are passed via regs. */ - l32i a6, a2, PT_AREG6 - l32i a7, a2, PT_AREG3 - l32i a8, a2, PT_AREG4 - l32i a9, a2, PT_AREG5 - l32i a10, a2, PT_AREG8 - l32i a11, a2, PT_AREG9 + l32i abi_arg0, abi_saved0, PT_AREG6 + l32i abi_arg1, abi_saved0, PT_AREG3 + l32i abi_arg2, abi_saved0, PT_AREG4 + l32i abi_arg3, abi_saved0, PT_AREG5 + l32i abi_arg4, abi_saved0, PT_AREG8 + l32i abi_arg5, abi_saved0, PT_AREG9 - callx4 a4 + abi_callx abi_tmp0 1: /* regs->areg[2] = return_value */ - s32i a6, a2, PT_AREG2 - bnez a3, 1f + s32i abi_rv, abi_saved0, PT_AREG2 + bnez abi_saved1, 1f .Lsyscall_exit: +#if defined(__XTENSA_WINDOWED_ABI__) abi_ret_default +#elif defined(__XTENSA_CALL0_ABI__) + l32i a0, sp, 0 + l32i abi_saved0, sp, 4 + l32i abi_saved1, sp, 8 + abi_ret(12) +#else +#error Unsupported Xtensa ABI +#endif 1: - mov a6, a2 - call4 do_syscall_trace_leave - abi_ret_default + mov abi_arg0, abi_saved0 + abi_call do_syscall_trace_leave + j .Lsyscall_exit ENDPROC(system_call) @@ -1988,8 +2028,18 @@ ENDPROC(system_call) ENTRY(_switch_to) +#if defined(__XTENSA_WINDOWED_ABI__) abi_entry(XTENSA_SPILL_STACK_RESERVE) +#elif defined(__XTENSA_CALL0_ABI__) + abi_entry(16) + s32i a12, sp, 0 + s32i a13, sp, 4 + s32i a14, sp, 8 + s32i a15, sp, 12 +#else +#error Unsupported Xtensa ABI +#endif mov a11, a3 # and 'next' (a3) l32i a4, a2, TASK_THREAD_INFO @@ -2033,7 +2083,9 @@ ENTRY(_switch_to) /* Flush register file. */ +#if defined(__XTENSA_WINDOWED_ABI__) spill_registers_kernel +#endif /* Set kernel stack (and leave critical section) * Note: It's save to set it here. 
The stack will not be overwritten @@ -2055,34 +2107,43 @@ ENTRY(_switch_to) wsr a14, ps rsync +#if defined(__XTENSA_WINDOWED_ABI__) abi_ret(XTENSA_SPILL_STACK_RESERVE) +#elif defined(__XTENSA_CALL0_ABI__) + l32i a12, sp, 0 + l32i a13, sp, 4 + l32i a14, sp, 8 + l32i a15, sp, 12 + abi_ret(16) +#else +#error Unsupported Xtensa ABI +#endif ENDPROC(_switch_to) ENTRY(ret_from_fork) /* void schedule_tail (struct task_struct *prev) - * Note: prev is still in a6 (return value from fake call4 frame) + * Note: prev is still in abi_arg0 (return value from fake call frame) */ - call4 schedule_tail + abi_call schedule_tail - mov a6, a1 - call4 do_syscall_trace_leave - - j common_exception_return + mov abi_arg0, a1 + abi_call do_syscall_trace_leave + j common_exception_return ENDPROC(ret_from_fork) /* * Kernel thread creation helper - * On entry, set up by copy_thread: a2 = thread_fn, a3 = thread_fn arg - * left from _switch_to: a6 = prev + * On entry, set up by copy_thread: abi_saved0 = thread_fn, + * abi_saved1 = thread_fn arg. Left from _switch_to: abi_arg0 = prev */ ENTRY(ret_from_kernel_thread) - call4 schedule_tail - mov a6, a3 - callx4 a2 - j common_exception_return + abi_call schedule_tail + mov abi_arg0, abi_saved1 + abi_callx abi_saved0 + j common_exception_return ENDPROC(ret_from_kernel_thread) diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index b9b81e76beea..8972d64e0b86 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -15,6 +15,7 @@ * Kevin Chea */ +#include #include #include #include @@ -193,9 +194,10 @@ ENTRY(_startup) movi a1, start_info l32i a1, a1, 0 - movi a2, PS_WOE_MASK | LOCKLEVEL - # WOE=1, INTLEVEL=LOCKLEVEL, UM=0 - wsr a2, ps # (enable reg-windows; progmode stack) + /* Disable interrupts. */ + /* Enable window exceptions if kernel is built with windowed ABI. */ + movi a2, KERNEL_PS_WOE_MASK | LOCKLEVEL + wsr a2, ps rsync #ifdef CONFIG_SMP @@ -267,13 +269,13 @@ ENTRY(_startup) l32i a1, a1, 0 #endif - movi a6, 0 - xsr a6, excsave1 + movi abi_arg0, 0 + xsr abi_arg0, excsave1 /* init_arch kick-starts the linux kernel */ - call4 init_arch - call4 start_kernel + abi_call init_arch + abi_call start_kernel should_never_return: j should_never_return @@ -297,10 +299,10 @@ should_never_return: s32i a3, a2, 0 memw - movi a6, 0 - wsr a6, excsave1 + movi abi_arg0, 0 + wsr abi_arg0, excsave1 - call4 secondary_start_kernel + abi_call secondary_start_kernel j should_never_return #endif /* CONFIG_SMP */ diff --git a/arch/xtensa/kernel/mcount.S b/arch/xtensa/kernel/mcount.S index 5e4619f52858..51daaf4e0b82 100644 --- a/arch/xtensa/kernel/mcount.S +++ b/arch/xtensa/kernel/mcount.S @@ -17,11 +17,16 @@ /* * Entry condition: * - * a2: a0 of the caller + * a2: a0 of the caller in windowed ABI + * a10: a0 of the caller in call0 ABI + * + * In call0 ABI the function _mcount is called with the special ABI: + * its argument is in a10 and all the usual argument registers (a2 - a7) + * must be preserved in addition to callee-saved a12 - a15. 
*/ ENTRY(_mcount) - +#if defined(__XTENSA_WINDOWED_ABI__) abi_entry_default movi a4, ftrace_trace_function @@ -42,7 +47,36 @@ ENTRY(_mcount) callx4 a4 abi_ret_default +#elif defined(__XTENSA_CALL0_ABI__) + abi_entry_default + movi a9, ftrace_trace_function + l32i a9, a9, 0 + movi a11, ftrace_stub + bne a9, a11, 1f + abi_ret_default + +1: abi_entry(28) + s32i a0, sp, 0 + s32i a2, sp, 4 + s32i a3, sp, 8 + s32i a4, sp, 12 + s32i a5, sp, 16 + s32i a6, sp, 20 + s32i a7, sp, 24 + addi a2, a10, -MCOUNT_INSN_SIZE + callx0 a9 + l32i a0, sp, 0 + l32i a2, sp, 4 + l32i a3, sp, 8 + l32i a4, sp, 12 + l32i a5, sp, 16 + l32i a6, sp, 20 + l32i a7, sp, 24 + abi_ret(28) +#else +#error Unsupported Xtensa ABI +#endif ENDPROC(_mcount) ENTRY(ftrace_stub) diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 060165340612..de6eb9ddea44 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -211,11 +211,18 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, struct thread_info *ti; #endif +#if defined(__XTENSA_WINDOWED_ABI__) /* Create a call4 dummy-frame: a0 = 0, a1 = childregs. */ SPILL_SLOT(childregs, 1) = (unsigned long)childregs; SPILL_SLOT(childregs, 0) = 0; p->thread.sp = (unsigned long)childregs; +#elif defined(__XTENSA_CALL0_ABI__) + /* Reserve 16 bytes for the _switch_to stack frame. */ + p->thread.sp = (unsigned long)childregs - 16; +#else +#error Unsupported Xtensa ABI +#endif if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { struct pt_regs *regs = current_pt_regs(); @@ -272,11 +279,25 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, p->thread.ra = MAKE_RA_FOR_CALL( (unsigned long)ret_from_kernel_thread, 1); - /* pass parameters to ret_from_kernel_thread: - * a2 = thread_fn, a3 = thread_fn arg + /* pass parameters to ret_from_kernel_thread: */ +#if defined(__XTENSA_WINDOWED_ABI__) + /* + * a2 = thread_fn, a3 = thread_fn arg. + * Window underflow will load registers from the + * spill slots on the stack on return from _switch_to. */ - SPILL_SLOT(childregs, 3) = thread_fn_arg; SPILL_SLOT(childregs, 2) = usp_thread_fn; + SPILL_SLOT(childregs, 3) = thread_fn_arg; +#elif defined(__XTENSA_CALL0_ABI__) + /* + * a12 = thread_fn, a13 = thread_fn arg. + * _switch_to epilogue will load registers from the stack. + */ + ((unsigned long *)p->thread.sp)[0] = usp_thread_fn; + ((unsigned long *)p->thread.sp)[1] = thread_fn_arg; +#else +#error Unsupported Xtensa ABI +#endif /* Childregs are only used when we're going to userspace * in which case start_thread will set them up. From 09af39f649dac66c2681ca53977275fe876690cc Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 26 Jul 2021 07:25:28 -0700 Subject: [PATCH 133/512] xtensa: use register window specific opcodes only when present xtensa core may be configured without register windows support, don't use register window specific opcodes in that case. Use window register specific opcodes to initialize hardware or reset core to a known state regardless of the chosen ABI. 
Signed-off-by: Max Filippov --- arch/xtensa/boot/boot-elf/bootstrap.S | 2 ++ arch/xtensa/boot/boot-redboot/bootstrap.S | 4 ++-- arch/xtensa/kernel/align.S | 2 ++ arch/xtensa/kernel/entry.S | 2 ++ arch/xtensa/kernel/head.S | 2 ++ 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/boot/boot-elf/bootstrap.S b/arch/xtensa/boot/boot-elf/bootstrap.S index 99e98c9bae41..2dd28931d699 100644 --- a/arch/xtensa/boot/boot-elf/bootstrap.S +++ b/arch/xtensa/boot/boot-elf/bootstrap.S @@ -42,12 +42,14 @@ _bootparam: .align 4 _SetupMMU: +#if XCHAL_HAVE_WINDOWED movi a0, 0 wsr a0, windowbase rsync movi a0, 1 wsr a0, windowstart rsync +#endif movi a0, 0x1F wsr a0, ps rsync diff --git a/arch/xtensa/boot/boot-redboot/bootstrap.S b/arch/xtensa/boot/boot-redboot/bootstrap.S index 51e8f3b88e82..3ed94ad35000 100644 --- a/arch/xtensa/boot/boot-redboot/bootstrap.S +++ b/arch/xtensa/boot/boot-redboot/bootstrap.S @@ -56,13 +56,13 @@ _start: movi a4, 1 wsr a4, ps rsync - +#if XCHAL_HAVE_WINDOWED rsr a5, windowbase ssl a5 sll a4, a4 wsr a4, windowstart rsync - +#endif movi a4, KERNEL_PS_WOE_MASK wsr a4, ps rsync diff --git a/arch/xtensa/kernel/align.S b/arch/xtensa/kernel/align.S index 9301452e521e..d062c732ef18 100644 --- a/arch/xtensa/kernel/align.S +++ b/arch/xtensa/kernel/align.S @@ -58,7 +58,9 @@ * BE shift left / mask 0 0 X X */ +#if XCHAL_HAVE_WINDOWED #define UNALIGNED_USER_EXCEPTION +#endif #if XCHAL_HAVE_BE diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index a144b467c3fd..8029ce24af92 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -940,12 +940,14 @@ unrecoverable_text: ENTRY(unrecoverable_exception) +#if XCHAL_HAVE_WINDOWED movi a0, 1 movi a1, 0 wsr a0, windowstart wsr a1, windowbase rsync +#endif movi a1, KERNEL_PS_WOE_MASK | LOCKLEVEL wsr a1, ps diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index 8972d64e0b86..8484294bc623 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -67,11 +67,13 @@ _SetupOCD: * xt-gdb to single step via DEBUG exceptions received directly * by ocd. */ +#if XCHAL_HAVE_WINDOWED movi a1, 1 movi a0, 0 wsr a1, windowstart wsr a0, windowbase rsync +#endif movi a1, LOCKLEVEL wsr a1, ps From da0a4e5c8fbcce3d1afebf9f2a967083bb19634d Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 26 Jul 2021 07:32:55 -0700 Subject: [PATCH 134/512] xtensa: only build windowed register support code when needed There's no need in window overflow/underflow/alloca exception handlers or window spill code when neither kernel nor userspace support windowed registers. Don't build or link it. 
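The shape of the change is the usual stub-when-disabled idiom; a condensed sketch of the signal.c hunk below, so callers stay unconditional:

  #if defined(USER_SUPPORT_WINDOWED)
  /* real implementation: spill the live window frames to the user stack */
  static int flush_window_regs_user(struct pt_regs *regs);
  #else
  static int flush_window_regs_user(struct pt_regs *regs)
  {
          return 0;       /* no register windows: nothing to flush */
  }
  #endif
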
Signed-off-by: Max Filippov --- arch/xtensa/include/asm/traps.h | 2 ++ arch/xtensa/kernel/entry.S | 5 ++++- arch/xtensa/kernel/setup.c | 2 ++ arch/xtensa/kernel/signal.c | 12 ++++++++++-- arch/xtensa/kernel/traps.c | 2 ++ arch/xtensa/kernel/vectors.S | 15 +++++++++++++++ arch/xtensa/kernel/vmlinux.lds.S | 12 ++++++++++-- 7 files changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/include/asm/traps.h b/arch/xtensa/include/asm/traps.h index f720a57d0a5b..6fa47cd8e02d 100644 --- a/arch/xtensa/include/asm/traps.h +++ b/arch/xtensa/include/asm/traps.h @@ -56,6 +56,7 @@ void secondary_trap_init(void); static inline void spill_registers(void) { +#if defined(__XTENSA_WINDOWED_ABI__) #if XCHAL_NUM_AREGS > 16 __asm__ __volatile__ ( " call8 1f\n" @@ -96,6 +97,7 @@ static inline void spill_registers(void) " mov a12, a12\n" : : : "memory"); #endif +#endif } struct debug_table { diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 8029ce24af92..99ab3c1a3387 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -969,6 +969,7 @@ ENDPROC(unrecoverable_exception) __XTENSA_HANDLER .literal_position +#ifdef SUPPORT_WINDOWED /* * Fast-handler for alloca exceptions * @@ -1032,6 +1033,7 @@ ENTRY(fast_alloca) 8: j _WindowUnderflow8 4: j _WindowUnderflow4 ENDPROC(fast_alloca) +#endif #ifdef CONFIG_USER_ABI_CALL0_PROBE /* @@ -1228,7 +1230,8 @@ ENDPROC(fast_syscall_xtensa) * Note: We assume the stack pointer is EXC_TABLE_KSTK in the fixup handler. */ -#ifdef CONFIG_FAST_SYSCALL_SPILL_REGISTERS +#if defined(CONFIG_FAST_SYSCALL_SPILL_REGISTERS) && \ + defined(USER_SUPPORT_WINDOWED) ENTRY(fast_syscall_spill_registers) diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index ee9082a142fe..ad665e8c9416 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -349,8 +349,10 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_VECTORS_ADDR +#ifdef SUPPORT_WINDOWED mem_reserve(__pa(&_WindowVectors_text_start), __pa(&_WindowVectors_text_end)); +#endif mem_reserve(__pa(&_DebugInterruptVector_text_start), __pa(&_DebugInterruptVector_text_end)); diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index c4d77dbfb61a..f6c949895b3e 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -45,12 +45,13 @@ struct rt_sigframe unsigned int window[4]; }; -/* +#if defined(USER_SUPPORT_WINDOWED) +/* * Flush register windows stored in pt_regs to stack. * Returns 1 for errors. */ -int +static int flush_window_regs_user(struct pt_regs *regs) { const unsigned long ws = regs->windowstart; @@ -121,6 +122,13 @@ flush_window_regs_user(struct pt_regs *regs) errout: return err; } +#else +static int +flush_window_regs_user(struct pt_regs *regs) +{ + return 0; +} +#endif /* * Note: We don't copy double exception 'regs', we have to finish double exc. 
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 874b6efc6fb3..4418438b13e6 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -97,7 +97,9 @@ static dispatch_init_table_t __initdata dispatch_init_table[] = { /* EXCCAUSE_INSTRUCTION_FETCH unhandled */ /* EXCCAUSE_LOAD_STORE_ERROR unhandled*/ { EXCCAUSE_LEVEL1_INTERRUPT, 0, do_interrupt }, +#ifdef SUPPORT_WINDOWED { EXCCAUSE_ALLOCA, USER|KRNL, fast_alloca }, +#endif /* EXCCAUSE_INTEGER_DIVIDE_BY_ZERO unhandled */ /* EXCCAUSE_PRIVILEGED unhandled */ #if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION diff --git a/arch/xtensa/kernel/vectors.S b/arch/xtensa/kernel/vectors.S index 0eed5aa82914..407ece204e7c 100644 --- a/arch/xtensa/kernel/vectors.S +++ b/arch/xtensa/kernel/vectors.S @@ -226,6 +226,7 @@ ENTRY(_DoubleExceptionVector) xsr a0, depc # get DEPC, save a0 +#ifdef SUPPORT_WINDOWED movi a2, WINDOW_VECTORS_VADDR _bltu a0, a2, .Lfixup addi a2, a2, WINDOW_VECTORS_SIZE @@ -275,6 +276,10 @@ _DoubleExceptionVector_WindowUnderflow: l32i a0, a0, EXC_TABLE_FAST_USER jx a0 +#else + j .Lfixup +#endif + /* * We only allow the ITLB miss exception if we are in kernel space. * All other exceptions are unexpected and thus unrecoverable! @@ -343,6 +348,7 @@ _DoubleExceptionVector_WindowUnderflow: l32i a0, a0, EXC_TABLE_FAST_USER jx a0 +#ifdef SUPPORT_WINDOWED /* * Restart window OVERFLOW exception. * Currently: @@ -475,9 +481,12 @@ _DoubleExceptionVector_handle_exception: rsr a0, depc rotw -3 j 1b +#endif ENDPROC(_DoubleExceptionVector) +#ifdef SUPPORT_WINDOWED + /* * Fixup handler for TLB miss in double exception handler for window owerflow. * We get here with windowbase set to the window that was being spilled and @@ -590,6 +599,8 @@ ENTRY(window_overflow_restore_a0_fixup) ENDPROC(window_overflow_restore_a0_fixup) +#endif + /* * Debug interrupt vector * @@ -687,6 +698,8 @@ _SimulateUserKernelVectorException: .section .WindowVectors.text, "ax" +#ifdef SUPPORT_WINDOWED + /* 4-Register Window Overflow Vector (Handler) */ ENTRY_ALIGN64(_WindowOverflow4) @@ -787,4 +800,6 @@ ENTRY_ALIGN64(_WindowUnderflow12) ENDPROC(_WindowUnderflow12) +#endif + .text diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index d23a6e38f062..eee270a039a4 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -94,7 +94,9 @@ SECTIONS . 
= ALIGN(PAGE_SIZE); _vecbase = .; +#ifdef SUPPORT_WINDOWED SECTION_VECTOR2 (.WindowVectors.text, WINDOW_VECTORS_VADDR) +#endif #if XCHAL_EXCM_LEVEL >= 2 SECTION_VECTOR2 (.Level2InterruptVector.text, INTLEVEL2_VECTOR_VADDR) #endif @@ -166,8 +168,10 @@ SECTIONS __boot_reloc_table_start = ABSOLUTE(.); #if !MERGED_VECTORS +#ifdef SUPPORT_WINDOWED RELOCATE_ENTRY(_WindowVectors_text, .WindowVectors.text); +#endif #if XCHAL_EXCM_LEVEL >= 2 RELOCATE_ENTRY(_Level2InterruptVector_text, .Level2InterruptVector.text); @@ -229,14 +233,18 @@ SECTIONS #if !MERGED_VECTORS /* The vectors are relocated to the real position at startup time */ +#ifdef SUPPORT_WINDOWED SECTION_VECTOR4 (_WindowVectors_text, .WindowVectors.text, WINDOW_VECTORS_VADDR, - .dummy) + LAST) +#undef LAST +#define LAST .WindowVectors.text +#endif SECTION_VECTOR4 (_DebugInterruptVector_text, .DebugInterruptVector.text, DEBUG_VECTOR_VADDR, - .WindowVectors.text) + LAST) #undef LAST #define LAST .DebugInterruptVector.text #if XCHAL_EXCM_LEVEL >= 2 From 431d1a34dfb6ce0d824ac22fc0f0c67c94123bc7 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 22 Aug 2021 21:28:01 -0700 Subject: [PATCH 135/512] xtensa: remove unused variable wmask After a cleanup the function show_regs doesn't use variable wmask but still computes it. Drop it. Fixes: 8d7e8240e66c ("[XTENSA] Clean up elf-gregset.") Signed-off-by: Max Filippov --- arch/xtensa/kernel/traps.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 4418438b13e6..35a7d47f28cf 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -464,12 +464,10 @@ void secondary_trap_init(void) void show_regs(struct pt_regs * regs) { - int i, wmask; + int i; show_regs_print_info(KERN_DEFAULT); - wmask = regs->wmask & ~1; - for (i = 0; i < 16; i++) { if ((i % 8) == 0) pr_info("a%02d:", i); From bd47cdb78997f83bd170c389ef59de9eec65976a Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 29 Aug 2021 07:42:46 -0700 Subject: [PATCH 136/512] xtensa: move section symbols to asm/sections.h Introduce asm/sections.h and move section declarations to this header from setup.c. Assign section symbols char array type uniformly and drop address operator from section symbol references in code. Sort headers in setup.c while at it. 
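The array form matters because a section symbol is only ever meaningful as an address: declaring it "extern char foo[]" lets the symbol be used directly as a pointer and lets the use sites drop the '&' operator. A small sketch with a hypothetical helper:

  extern char _WindowVectors_text_start[];
  extern char _WindowVectors_text_end[];

  static void __init reserve_window_vectors(void)        /* hypothetical helper */
  {
          mem_reserve(__pa(_WindowVectors_text_start),
                      __pa(_WindowVectors_text_end));
  }
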
Signed-off-by: Max Filippov --- arch/xtensa/include/asm/sections.h | 41 ++++++++++++ arch/xtensa/kernel/setup.c | 100 +++++++++-------------------- 2 files changed, 70 insertions(+), 71 deletions(-) create mode 100644 arch/xtensa/include/asm/sections.h diff --git a/arch/xtensa/include/asm/sections.h b/arch/xtensa/include/asm/sections.h new file mode 100644 index 000000000000..a8c42d08e281 --- /dev/null +++ b/arch/xtensa/include/asm/sections.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _XTENSA_SECTIONS_H +#define _XTENSA_SECTIONS_H + +#include + +#ifdef CONFIG_VECTORS_ADDR +extern char _WindowVectors_text_start[]; +extern char _WindowVectors_text_end[]; +extern char _DebugInterruptVector_text_start[]; +extern char _DebugInterruptVector_text_end[]; +extern char _KernelExceptionVector_text_start[]; +extern char _KernelExceptionVector_text_end[]; +extern char _UserExceptionVector_text_start[]; +extern char _UserExceptionVector_text_end[]; +extern char _DoubleExceptionVector_text_start[]; +extern char _DoubleExceptionVector_text_end[]; +extern char _exception_text_start[]; +extern char _exception_text_end[]; +extern char _Level2InterruptVector_text_start[]; +extern char _Level2InterruptVector_text_end[]; +extern char _Level3InterruptVector_text_start[]; +extern char _Level3InterruptVector_text_end[]; +extern char _Level4InterruptVector_text_start[]; +extern char _Level4InterruptVector_text_end[]; +extern char _Level5InterruptVector_text_start[]; +extern char _Level5InterruptVector_text_end[]; +extern char _Level6InterruptVector_text_start[]; +extern char _Level6InterruptVector_text_end[]; +#endif +#ifdef CONFIG_SMP +extern char _SecondaryResetVector_text_start[]; +extern char _SecondaryResetVector_text_end[]; +#endif +#ifdef CONFIG_XIP_KERNEL +extern char _xip_start[]; +extern char _xip_end[]; +#endif + +#endif diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index ad665e8c9416..8db20cfb44ab 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -37,14 +37,15 @@ #include #include #include -#include -#include -#include #include -#include #include +#include +#include +#include +#include #include #include +#include #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) struct screen_info screen_info = { @@ -271,49 +272,6 @@ void __init init_arch(bp_tag_t *bp_start) * Initialize system. Setup memory and reserve regions. 
*/ -extern char _end[]; -extern char _stext[]; -extern char _WindowVectors_text_start; -extern char _WindowVectors_text_end; -extern char _DebugInterruptVector_text_start; -extern char _DebugInterruptVector_text_end; -extern char _KernelExceptionVector_text_start; -extern char _KernelExceptionVector_text_end; -extern char _UserExceptionVector_text_start; -extern char _UserExceptionVector_text_end; -extern char _DoubleExceptionVector_text_start; -extern char _DoubleExceptionVector_text_end; -extern char _exception_text_start; -extern char _exception_text_end; -#if XCHAL_EXCM_LEVEL >= 2 -extern char _Level2InterruptVector_text_start; -extern char _Level2InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 3 -extern char _Level3InterruptVector_text_start; -extern char _Level3InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 4 -extern char _Level4InterruptVector_text_start; -extern char _Level4InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 5 -extern char _Level5InterruptVector_text_start; -extern char _Level5InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 6 -extern char _Level6InterruptVector_text_start; -extern char _Level6InterruptVector_text_end; -#endif -#ifdef CONFIG_SMP -extern char _SecondaryResetVector_text_start; -extern char _SecondaryResetVector_text_end; -#endif -#ifdef CONFIG_XIP_KERNEL -extern char _xip_start[]; -extern char _xip_end[]; -#endif - static inline int __init_memblock mem_reserve(unsigned long start, unsigned long end) { @@ -350,50 +308,50 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_VECTORS_ADDR #ifdef SUPPORT_WINDOWED - mem_reserve(__pa(&_WindowVectors_text_start), - __pa(&_WindowVectors_text_end)); + mem_reserve(__pa(_WindowVectors_text_start), + __pa(_WindowVectors_text_end)); #endif - mem_reserve(__pa(&_DebugInterruptVector_text_start), - __pa(&_DebugInterruptVector_text_end)); + mem_reserve(__pa(_DebugInterruptVector_text_start), + __pa(_DebugInterruptVector_text_end)); - mem_reserve(__pa(&_KernelExceptionVector_text_start), - __pa(&_KernelExceptionVector_text_end)); + mem_reserve(__pa(_KernelExceptionVector_text_start), + __pa(_KernelExceptionVector_text_end)); - mem_reserve(__pa(&_UserExceptionVector_text_start), - __pa(&_UserExceptionVector_text_end)); + mem_reserve(__pa(_UserExceptionVector_text_start), + __pa(_UserExceptionVector_text_end)); - mem_reserve(__pa(&_DoubleExceptionVector_text_start), - __pa(&_DoubleExceptionVector_text_end)); + mem_reserve(__pa(_DoubleExceptionVector_text_start), + __pa(_DoubleExceptionVector_text_end)); - mem_reserve(__pa(&_exception_text_start), - __pa(&_exception_text_end)); + mem_reserve(__pa(_exception_text_start), + __pa(_exception_text_end)); #if XCHAL_EXCM_LEVEL >= 2 - mem_reserve(__pa(&_Level2InterruptVector_text_start), - __pa(&_Level2InterruptVector_text_end)); + mem_reserve(__pa(_Level2InterruptVector_text_start), + __pa(_Level2InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 3 - mem_reserve(__pa(&_Level3InterruptVector_text_start), - __pa(&_Level3InterruptVector_text_end)); + mem_reserve(__pa(_Level3InterruptVector_text_start), + __pa(_Level3InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 4 - mem_reserve(__pa(&_Level4InterruptVector_text_start), - __pa(&_Level4InterruptVector_text_end)); + mem_reserve(__pa(_Level4InterruptVector_text_start), + __pa(_Level4InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 5 - mem_reserve(__pa(&_Level5InterruptVector_text_start), - __pa(&_Level5InterruptVector_text_end)); + 
mem_reserve(__pa(_Level5InterruptVector_text_start), + __pa(_Level5InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 6 - mem_reserve(__pa(&_Level6InterruptVector_text_start), - __pa(&_Level6InterruptVector_text_end)); + mem_reserve(__pa(_Level6InterruptVector_text_start), + __pa(_Level6InterruptVector_text_end)); #endif #endif /* CONFIG_VECTORS_ADDR */ #ifdef CONFIG_SMP - mem_reserve(__pa(&_SecondaryResetVector_text_start), - __pa(&_SecondaryResetVector_text_end)); + mem_reserve(__pa(_SecondaryResetVector_text_start), + __pa(_SecondaryResetVector_text_end)); #endif parse_early_param(); bootmem_init(); From e96a1866b40570b5950cda8602c2819189c62a48 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 18 Oct 2021 12:37:41 +0200 Subject: [PATCH 137/512] isofs: Fix out of bound access for corrupted isofs image When isofs image is suitably corrupted isofs_read_inode() can read data beyond the end of buffer. Sanity-check the directory entry length before using it. Reported-and-tested-by: syzbot+6fc7fb214625d82af7d1@syzkaller.appspotmail.com CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/isofs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 678e2c51b855..0c6eacfcbeef 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; + if (de_len < sizeof(struct iso_directory_record)) + goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; From 2ab3a0a9fad846e751148e9cdeda85a8f7765f31 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Oct 2021 15:37:59 +0200 Subject: [PATCH 138/512] s390/ftrace: add HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALL support This is the s390 variant of commit 562955fe6a55 ("ftrace/x86: Add register_ftrace_direct() for custom trampolines"). Acked-by: Ilya Leoshkevich Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20211012133802.2460757-2-hca@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ftrace.h | 12 ++++++++++++ arch/s390/kernel/mcount.S | 23 +++++++++++++++++------ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c35c98e515a2..c4b53c7313a7 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -154,6 +154,7 @@ config S390 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_ARGS + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 4a721b44d4f4..267f70f4393f 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -58,6 +58,18 @@ static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *f regs->psw.addr = ip; } +/* + * When an ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, + * place the direct caller in the ORIG_GPR2 part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. 
+ */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + regs->orig_gpr2 = addr; +} + /* * Even though the system call numbers are identical for s390/s390x a * different system call table is used for compat tasks. This may lead diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index cc25fbd75ea9..39bcc0e39a10 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -22,10 +22,11 @@ ENTRY(ftrace_stub) BR_EX %r14 ENDPROC(ftrace_stub) -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) #ifdef __PACK_STACK /* allocate just enough for r14, r15 and backchain */ #define TRACED_FUNC_FRAME_SIZE 24 @@ -51,6 +52,7 @@ ENDPROC(ftrace_stub) # allocate pt_regs and stack frame for ftrace_trace_function aghi %r15,-STACK_FRAME_SIZE stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) + xc STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15) .if \allregs == 1 stg %r14,(STACK_PTREGS_PSW)(%r15) @@ -101,8 +103,17 @@ SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) .Lftrace_graph_caller_end: #endif - lg %r1,(STACK_PTREGS_PSW+8)(%r15) - lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) + lg %r0,(STACK_PTREGS_PSW+8)(%r15) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + ltg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + locgrz %r1,%r0 +#else + lg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + ltgr %r1,%r1 + jnz 0f + lgr %r1,%r0 +#endif +0: lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 SYM_CODE_END(ftrace_common) From 3d487acf1b1a5d1473e27c093c97a2a90062a41b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Oct 2021 15:38:00 +0200 Subject: [PATCH 139/512] s390: make STACK_FRAME_OVERHEAD available via asm-offsets.h Make STACK_FRAME_OVERHEAD available via asm-offsets.h. This allows to add s390 specific asm code to e.g. ftrace samples, without requiring to add random header files, which might cause all sort of problems on other architectures. asm-offsets.h can be assumed to be non-problematic. 
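For illustration only, and not part of this patch: once the DEFINE() this patch adds lands in the generated header, arch-specific inline asm can pull the frame size in via __stringify(), which is the pattern the ftrace direct-call samples later in this series rely on. The sketch below is a minimal, hypothetical use of the constant; demo_tramp is not a real symbol in the tree, and <asm/asm-offsets.h> is assumed to be the generated header.

/* Minimal sketch, assuming the generated <asm/asm-offsets.h> provides
 * STACK_FRAME_OVERHEAD; demo_tramp is purely illustrative.
 */
#include <linux/stringify.h>
#include <asm/asm-offsets.h>

asm (
"	.pushsection .text, \"ax\", @progbits\n"
"	.globl demo_tramp\n"
"demo_tramp:\n"
/* carve out a standard s390 stack frame using the generated constant */
"	aghi	%r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
/* ... trampoline body would go here ... */
"	aghi	%r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
"	br	%r14\n"
"	.popsection\n"
);
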
Acked-by: Ilya Leoshkevich Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20211012133802.2460757-3-hca@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/kernel/asm-offsets.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b57da9338588..b6ee3fd7fe0c 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -45,6 +45,7 @@ int main(void) OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[2]); OFFSET(__SF_SIE_REASON, stack_frame, empty1[3]); OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[4]); + DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); From c316eb4460463b6dd1aee6d241cb20323a0030aa Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Oct 2021 15:38:01 +0200 Subject: [PATCH 140/512] samples: add HAVE_SAMPLE_FTRACE_DIRECT config option Add HAVE_SAMPLE_FTRACE_DIRECT config option which can be selected by architectures which have support for ftrace direct call samples. Acked-by: Ilya Leoshkevich Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20211012133802.2460757-4-hca@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/x86/Kconfig | 1 + samples/Kconfig | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab83c22d274e..620fce152be9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -190,6 +190,7 @@ config X86 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_SAMPLE_FTRACE_DIRECT if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA diff --git a/samples/Kconfig b/samples/Kconfig index b0503ef058d3..501f66309118 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -26,7 +26,7 @@ config SAMPLE_TRACE_PRINTK config SAMPLE_FTRACE_DIRECT tristate "Build register_ftrace_direct() example" depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m - depends on X86_64 # has x86_64 inlined asm + depends on HAVE_SAMPLE_FTRACE_DIRECT help This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. @@ -224,3 +224,6 @@ config SAMPLE_WATCH_QUEUE sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function. endif # SAMPLES + +config HAVE_SAMPLE_FTRACE_DIRECT + bool From 1254cfbc5f97bc1afd019c30c6ca1d4ddfa127eb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Oct 2021 15:38:02 +0200 Subject: [PATCH 141/512] samples: add s390 support for ftrace direct call samples Add s390 support for ftrace direct call samples, which also enables ftrace direct call selftests within ftrace selftests. 
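As background rather than part of the patch itself: independent of architecture, each sample attaches a hand-written trampoline to a traced function through register_ftrace_direct(). A minimal sketch of that registration side, modelled on samples/ftrace/ftrace-direct.c as it exists at this point in the series, with my_tramp being the s390 asm trampoline this patch adds:

/* Sketch of the registration pattern used by the direct-call samples;
 * my_tramp is the arch-specific asm trampoline, my_direct_func the C
 * function it branches to.
 */
#include <linux/module.h>
#include <linux/sched.h>	/* for wake_up_process() */
#include <linux/ftrace.h>

void my_direct_func(struct task_struct *p)
{
	trace_printk("waking up %s-%d\n", p->comm, p->pid);
}

extern void my_tramp(void *);

static int __init ftrace_direct_init(void)
{
	/* divert calls to wake_up_process() through my_tramp */
	return register_ftrace_direct((unsigned long)wake_up_process,
				      (unsigned long)my_tramp);
}

static void __exit ftrace_direct_exit(void)
{
	unregister_ftrace_direct((unsigned long)wake_up_process,
				 (unsigned long)my_tramp);
}

module_init(ftrace_direct_init);
module_exit(ftrace_direct_exit);
MODULE_LICENSE("GPL");
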
Acked-by: Ilya Leoshkevich Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20211012133802.2460757-5-hca@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + samples/ftrace/ftrace-direct-modify.c | 44 +++++++++++++++++++++++++++ samples/ftrace/ftrace-direct-too.c | 28 +++++++++++++++++ samples/ftrace/ftrace-direct.c | 28 +++++++++++++++++ 4 files changed, 101 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c4b53c7313a7..d0d42c5c4141 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -193,6 +193,7 @@ config S390 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index 5b9a09957c6e..690e4a9ff333 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -2,6 +2,7 @@ #include #include #include +#include void my_direct_func1(void) { @@ -18,6 +19,8 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp1, @function\n" @@ -41,6 +44,47 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func1\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func2\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { (unsigned long)my_tramp1, diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 3f0079c9bd6f..6e0de725bf22 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -3,6 +3,7 @@ #include /* for handle_mm_fault() */ #include +#include void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -13,6 +14,8 @@ void my_direct_func(struct vm_area_struct *vma, extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -33,6 +36,31 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" 
my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ static int __init ftrace_direct_init(void) { diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index a2729d1ef17f..a30aa42ec76a 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -3,6 +3,7 @@ #include /* for wake_up_process() */ #include +#include void my_direct_func(struct task_struct *p) { @@ -11,6 +12,8 @@ void my_direct_func(struct task_struct *p) extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -27,6 +30,31 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ static int __init ftrace_direct_init(void) { From 1a446b24730e78777078c4e2d24ba71dbf06a213 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 7 Oct 2021 18:08:46 +0200 Subject: [PATCH 142/512] s390: update defconfigs Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 10 +++++++--- arch/s390/configs/defconfig | 6 ++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 1818d67c2c1c..fd825097cf04 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -61,8 +61,8 @@ CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m -CONFIG_S390_UNWIND_SELFTEST=y -CONFIG_S390_KPROBES_SANITY_TEST=y +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y @@ -777,7 +777,6 @@ CONFIG_CRC8=m CONFIG_RANDOM32_SELFTEST=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 -CONFIG_DMA_API_DEBUG=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y @@ -840,8 +839,13 @@ CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y CONFIG_FTRACE_STARTUP_TEST=y # CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_DEBUG_ENTRY=y CONFIG_CIO_INJECT=y +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index f08b161c9446..c9c3cedff2d8 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -60,6 +60,7 @@ CONFIG_CMM=m 
CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y # CONFIG_GCC_PLUGINS is not set @@ -788,6 +789,11 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y From e1b0d0bb2032d18c7718168e670d8d3f31e552d7 Mon Sep 17 00:00:00 2001 From: Mingchuang Qiao Date: Tue, 12 Oct 2021 15:56:14 +0800 Subject: [PATCH 143/512] PCI: Re-enable Downstream Port LTR after reset or hotplug Per PCIe r5.0, sec 7.5.3.16, Downstream Ports must disable LTR if the link goes down (the Port goes DL_Down status). This is a problem because the Downstream Port's dev->ltr_path is still set, so we think LTR is still enabled, and we enable LTR in the Endpoint. When it sends LTR messages, they cause Unsupported Request errors at the Downstream Port. This happens in the reset path, where we may enable LTR in pci_restore_pcie_state() even though the Downstream Port disabled LTR because the reset caused a link down event. It also happens in the hot-remove and hot-add path, where we may enable LTR in pci_configure_ltr() even though the Downstream Port disabled LTR when the hot-remove took the link down. In these two scenarios, check the upstream bridge and restore its LTR enable if appropriate. The Unsupported Request may be logged by AER as follows: pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: id=00e8 pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=00e8(Requester ID) pcieport 0000:00:1d.0: device [8086:9d18] error status/mask=00100000/00010000 pcieport 0000:00:1d.0: [20] Unsupported Request (First) In addition, if LTR is not configured correctly, the link cannot enter the L1.2 state, which prevents some machines from entering the S0ix low power state. [bhelgaas: commit log] Link: https://lore.kernel.org/r/20211012075614.54576-1-mingchuang.qiao@mediatek.com Reported-by: Utkarsh H Patel Signed-off-by: Mingchuang Qiao Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- drivers/pci/pci.c | 25 +++++++++++++++++++++++++ drivers/pci/pci.h | 1 + drivers/pci/probe.c | 18 +++++++++++++++--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ce2ab62b64cf..17e4341df0ff 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1477,6 +1477,24 @@ static int pci_save_pcie_state(struct pci_dev *dev) return 0; } +void pci_bridge_reconfigure_ltr(struct pci_dev *dev) +{ +#ifdef CONFIG_PCIEASPM + struct pci_dev *bridge; + u32 ctl; + + bridge = pci_upstream_bridge(dev); + if (bridge && bridge->ltr_path) { + pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl); + if (!(ctl & PCI_EXP_DEVCTL2_LTR_EN)) { + pci_dbg(bridge, "re-enabling LTR\n"); + pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + } + } +#endif +} + static void pci_restore_pcie_state(struct pci_dev *dev) { int i = 0; @@ -1487,6 +1505,13 @@ static void pci_restore_pcie_state(struct pci_dev *dev) if (!save_state) return; + /* + * Downstream ports reset the LTR enable bit when link goes down. + * Check and re-configure the bit here before restoring device. + * PCIe r5.0, sec 7.5.3.16. 
+ */ + pci_bridge_reconfigure_ltr(dev); + cap = (u16 *)&save_state->cap.data[0]; pcie_capability_write_word(dev, PCI_EXP_DEVCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_LNKCTL, cap[i++]); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1cce56c2aea0..bc7f3a10ff60 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -125,6 +125,7 @@ void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev); +void pci_bridge_reconfigure_ltr(struct pci_dev *dev); static inline void pci_wakeup_event(struct pci_dev *dev) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d9fc02a71baa..258350f80f6c 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2168,9 +2168,21 @@ static void pci_configure_ltr(struct pci_dev *dev) * Complex and all intermediate Switches indicate support for LTR. * PCIe r4.0, sec 6.18. */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - ((bridge = pci_upstream_bridge(dev)) && - bridge->ltr_path)) { + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { + pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + dev->ltr_path = 1; + return; + } + + /* + * If we're configuring a hot-added device, LTR was likely + * disabled in the upstream bridge, so re-enable it before enabling + * it in the new device. + */ + bridge = pci_upstream_bridge(dev); + if (bridge && bridge->ltr_path) { + pci_bridge_reconfigure_ltr(dev); pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_LTR_EN); dev->ltr_path = 1; From 27cee7d7ceb0fad20a4d654cc051afe408dc1de8 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Wed, 22 Sep 2021 07:00:33 +0200 Subject: [PATCH 144/512] dt-bindings: PCI: Add MT7621 SoC PCIe host controller Add device tree binding documentation for PCIe in MT7621 SoCs. Link: https://lore.kernel.org/r/20210922050035.18162-2-sergio.paracuellos@gmail.com Signed-off-by: Sergio Paracuellos Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring --- .../bindings/pci/mediatek,mt7621-pcie.yaml | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml diff --git a/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml b/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml new file mode 100644 index 000000000000..044fa967bc8b --- /dev/null +++ b/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/mediatek,mt7621-pcie.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT7621 PCIe controller + +maintainers: + - Sergio Paracuellos + +description: |+ + MediaTek MT7621 PCIe subsys supports a single Root Complex (RC) + with 3 Root Ports. 
Each Root Port supports a Gen1 1-lane Link + +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + +properties: + compatible: + const: mediatek,mt7621-pci + + reg: + items: + - description: host-pci bridge registers + - description: pcie port 0 RC control registers + - description: pcie port 1 RC control registers + - description: pcie port 2 RC control registers + + ranges: + maxItems: 2 + +patternProperties: + 'pcie@[0-2],0': + type: object + $ref: /schemas/pci/pci-bus.yaml# + + properties: + resets: + maxItems: 1 + + clocks: + maxItems: 1 + + phys: + maxItems: 1 + + required: + - "#interrupt-cells" + - interrupt-map-mask + - interrupt-map + - resets + - clocks + - phys + - phy-names + - ranges + + unevaluatedProperties: false + +required: + - compatible + - reg + - ranges + - "#interrupt-cells" + - interrupt-map-mask + - interrupt-map + - reset-gpios + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pcie: pcie@1e140000 { + compatible = "mediatek,mt7621-pci"; + reg = <0x1e140000 0x100>, + <0x1e142000 0x100>, + <0x1e143000 0x100>, + <0x1e144000 0x100>; + + #address-cells = <3>; + #size-cells = <2>; + pinctrl-names = "default"; + pinctrl-0 = <&pcie_pins>; + device_type = "pci"; + ranges = <0x02000000 0 0x60000000 0x60000000 0 0x10000000>, /* pci memory */ + <0x01000000 0 0x1e160000 0x1e160000 0 0x00010000>; /* io space */ + #interrupt-cells = <1>; + interrupt-map-mask = <0xF800 0 0 0>; + interrupt-map = <0x0000 0 0 0 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>, + <0x0800 0 0 0 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>, + <0x1000 0 0 0 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>; + reset-gpios = <&gpio 19 GPIO_ACTIVE_LOW>; + + pcie@0,0 { + reg = <0x0000 0 0 0 0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>; + resets = <&rstctrl 24>; + clocks = <&clkctrl 24>; + phys = <&pcie0_phy 1>; + phy-names = "pcie-phy0"; + ranges; + }; + + pcie@1,0 { + reg = <0x0800 0 0 0 0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>; + resets = <&rstctrl 25>; + clocks = <&clkctrl 25>; + phys = <&pcie0_phy 1>; + phy-names = "pcie-phy1"; + ranges; + }; + + pcie@2,0 { + reg = <0x1000 0 0 0 0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>; + resets = <&rstctrl 26>; + clocks = <&clkctrl 26>; + phys = <&pcie2_phy 0>; + phy-names = "pcie-phy2"; + ranges; + }; + }; +... From 2bdd5238e756aac3ecbffc7c22b884485e84062e Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Wed, 22 Sep 2021 07:00:34 +0200 Subject: [PATCH 145/512] PCI: mt7621: Add MediaTek MT7621 PCIe host controller driver Add driver for the PCIe controller of the MT7621 SoC. 
[bhelgaas: rename from pci-mt7621.c to pcie-mt7621.c; also rename Kconfig symbol from PCI_MT7621 to PCIE_MT7621] Link: https://lore.kernel.org/r/20210922050035.18162-3-sergio.paracuellos@gmail.com Signed-off-by: Sergio Paracuellos Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman --- arch/mips/ralink/Kconfig | 3 +- drivers/pci/controller/Kconfig | 8 ++ drivers/pci/controller/Makefile | 2 + .../controller/pcie-mt7621.c} | 24 ++-- drivers/staging/Kconfig | 2 - drivers/staging/Makefile | 1 - drivers/staging/mt7621-pci/Kconfig | 8 -- drivers/staging/mt7621-pci/Makefile | 2 - drivers/staging/mt7621-pci/TODO | 4 - .../mt7621-pci/mediatek,mt7621-pci.txt | 104 ------------------ 10 files changed, 24 insertions(+), 134 deletions(-) rename drivers/{staging/mt7621-pci/pci-mt7621.c => pci/controller/pcie-mt7621.c} (95%) delete mode 100644 drivers/staging/mt7621-pci/Kconfig delete mode 100644 drivers/staging/mt7621-pci/Makefile delete mode 100644 drivers/staging/mt7621-pci/TODO delete mode 100644 drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig index c800bf5559b5..120adad51d6a 100644 --- a/arch/mips/ralink/Kconfig +++ b/arch/mips/ralink/Kconfig @@ -51,7 +51,8 @@ choice select SYS_SUPPORTS_HIGHMEM select MIPS_GIC select CLKSRC_MIPS_GIC - select HAVE_PCI if PCI_MT7621 + select HAVE_PCI + select PCI_DRIVERS_GENERIC select SOC_BUS endchoice diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 326f7d13024f..985498374bd7 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -312,6 +312,14 @@ config PCIE_HISI_ERR Say Y here if you want error handling support for the PCIe controller's errors on HiSilicon HIP SoCs +config PCIE_MT7621 + tristate "MediaTek MT7621 PCIe Controller" + depends on (RALINK && SOC_MT7621) || (MIPS && COMPILE_TEST) + select PHY_MT7621_PCI + default SOC_MT7621 + help + This selects a driver for the MediaTek MT7621 PCIe Controller. 
+ source "drivers/pci/controller/dwc/Kconfig" source "drivers/pci/controller/mobiveil/Kconfig" source "drivers/pci/controller/cadence/Kconfig" diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index aaf30b3dcc14..5f3d6a07d145 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -37,6 +37,8 @@ obj-$(CONFIG_VMD) += vmd.o obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o obj-$(CONFIG_PCI_LOONGSON) += pci-loongson.o obj-$(CONFIG_PCIE_HISI_ERR) += pcie-hisi-error.o +obj-$(CONFIG_PCIE_MT7621) += pcie-mt7621.o + # pcie-hisi.o quirks are needed even without CONFIG_PCIE_DW obj-y += dwc/ obj-y += mobiveil/ diff --git a/drivers/staging/mt7621-pci/pci-mt7621.c b/drivers/pci/controller/pcie-mt7621.c similarity index 95% rename from drivers/staging/mt7621-pci/pci-mt7621.c rename to drivers/pci/controller/pcie-mt7621.c index 6acfc94a16e7..f76dbca0ab32 100644 --- a/drivers/staging/mt7621-pci/pci-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -30,18 +30,18 @@ #include #include -/* MediaTek specific configuration registers */ +/* MediaTek-specific configuration registers */ #define PCIE_FTS_NUM 0x70c #define PCIE_FTS_NUM_MASK GENMASK(15, 8) #define PCIE_FTS_NUM_L0(x) (((x) & 0xff) << 8) /* Host-PCI bridge registers */ #define RALINK_PCI_PCICFG_ADDR 0x0000 -#define RALINK_PCI_PCIMSK_ADDR 0x000C +#define RALINK_PCI_PCIMSK_ADDR 0x000c #define RALINK_PCI_CONFIG_ADDR 0x0020 #define RALINK_PCI_CONFIG_DATA 0x0024 #define RALINK_PCI_MEMBASE 0x0028 -#define RALINK_PCI_IOBASE 0x002C +#define RALINK_PCI_IOBASE 0x002c /* PCIe RC control registers */ #define RALINK_PCI_ID 0x0030 @@ -132,7 +132,7 @@ static inline void pcie_port_write(struct mt7621_pcie_port *port, static inline u32 mt7621_pci_get_cfgaddr(unsigned int bus, unsigned int slot, unsigned int func, unsigned int where) { - return (((where & 0xF00) >> 8) << 24) | (bus << 16) | (slot << 11) | + return (((where & 0xf00) >> 8) << 24) | (bus << 16) | (slot << 11) | (func << 8) | (where & 0xfc) | 0x80000000; } @@ -217,7 +217,7 @@ static int setup_cm_memory_region(struct pci_host_bridge *host) entry = resource_list_first_type(&host->windows, IORESOURCE_MEM); if (!entry) { - dev_err(dev, "Cannot get memory resource\n"); + dev_err(dev, "cannot get memory resource\n"); return -EINVAL; } @@ -280,7 +280,7 @@ static int mt7621_pcie_parse_port(struct mt7621_pcie *pcie, port->gpio_rst = devm_gpiod_get_index_optional(dev, "reset", slot, GPIOD_OUT_LOW); if (IS_ERR(port->gpio_rst)) { - dev_err(dev, "Failed to get GPIO for PCIe%d\n", slot); + dev_err(dev, "failed to get GPIO for PCIe%d\n", slot); err = PTR_ERR(port->gpio_rst); goto remove_reset; } @@ -409,7 +409,7 @@ static int mt7621_pcie_init_ports(struct mt7621_pcie *pcie) err = mt7621_pcie_init_port(port); if (err) { - dev_err(dev, "Initiating port %d failed\n", slot); + dev_err(dev, "initializing port %d failed\n", slot); list_del(&port->list); } } @@ -476,7 +476,7 @@ static int mt7621_pcie_enable_ports(struct pci_host_bridge *host) entry = resource_list_first_type(&host->windows, IORESOURCE_IO); if (!entry) { - dev_err(dev, "Cannot get io resource\n"); + dev_err(dev, "cannot get io resource\n"); return -EINVAL; } @@ -541,25 +541,25 @@ static int mt7621_pci_probe(struct platform_device *pdev) err = mt7621_pcie_parse_dt(pcie); if (err) { - dev_err(dev, "Parsing DT failed\n"); + dev_err(dev, "parsing DT failed\n"); return err; } err = mt7621_pcie_init_ports(pcie); if (err) { - dev_err(dev, "Nothing connected in virtual bridges\n"); + dev_err(dev, "nothing 
connected in virtual bridges\n"); return 0; } err = mt7621_pcie_enable_ports(bridge); if (err) { - dev_err(dev, "Error enabling pcie ports\n"); + dev_err(dev, "error enabling pcie ports\n"); goto remove_resets; } err = setup_cm_memory_region(bridge); if (err) { - dev_err(dev, "Error setting up iocu mem regions\n"); + dev_err(dev, "error setting up iocu mem regions\n"); goto remove_resets; } diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index e03627ad4460..59af251e7576 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -86,8 +86,6 @@ source "drivers/staging/vc04_services/Kconfig" source "drivers/staging/pi433/Kconfig" -source "drivers/staging/mt7621-pci/Kconfig" - source "drivers/staging/mt7621-dma/Kconfig" source "drivers/staging/ralink-gdma/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index c7f8d8d8dd11..76f413470bc8 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_KS7010) += ks7010/ obj-$(CONFIG_GREYBUS) += greybus/ obj-$(CONFIG_BCM2835_VCHIQ) += vc04_services/ obj-$(CONFIG_PI433) += pi433/ -obj-$(CONFIG_PCI_MT7621) += mt7621-pci/ obj-$(CONFIG_SOC_MT7621) += mt7621-dma/ obj-$(CONFIG_DMA_RALINK) += ralink-gdma/ obj-$(CONFIG_SOC_MT7621) += mt7621-dts/ diff --git a/drivers/staging/mt7621-pci/Kconfig b/drivers/staging/mt7621-pci/Kconfig deleted file mode 100644 index ce58042f2f21..000000000000 --- a/drivers/staging/mt7621-pci/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config PCI_MT7621 - tristate "MediaTek MT7621 PCI Controller" - depends on RALINK - select PCI_DRIVERS_GENERIC - help - This selects a driver for the MediaTek MT7621 PCI Controller. - diff --git a/drivers/staging/mt7621-pci/Makefile b/drivers/staging/mt7621-pci/Makefile deleted file mode 100644 index f4e651cf7ce3..000000000000 --- a/drivers/staging/mt7621-pci/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PCI_MT7621) += pci-mt7621.o diff --git a/drivers/staging/mt7621-pci/TODO b/drivers/staging/mt7621-pci/TODO deleted file mode 100644 index d674a9ac85c1..000000000000 --- a/drivers/staging/mt7621-pci/TODO +++ /dev/null @@ -1,4 +0,0 @@ - -- general code review and cleanup - -Cc: NeilBrown diff --git a/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt b/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt deleted file mode 100644 index 327a68267309..000000000000 --- a/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt +++ /dev/null @@ -1,104 +0,0 @@ -MediaTek MT7621 PCIe controller - -Required properties: -- compatible: "mediatek,mt7621-pci" -- device_type: Must be "pci" -- reg: Base addresses and lengths of the PCIe subsys and root ports. -- bus-range: Range of bus numbers associated with this controller. -- #address-cells: Address representation for root ports (must be 3) -- pinctrl-names : The pin control state names. -- pinctrl-0: The "default" pinctrl state. -- #size-cells: Size representation for root ports (must be 2) -- ranges: Ranges for the PCI memory and I/O regions. -- #interrupt-cells: Must be 1 -- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties. - Please refer to the standard PCI bus binding document for a more detailed - explanation. -- status: either "disabled" or "okay". -- resets: Must contain an entry for each entry in reset-names. - See ../reset/reset.txt for details. -- reset-names: Must be "pcie0", "pcie1", "pcieN"... based on the number of - root ports. 
-- clocks: Must contain an entry for each entry in clock-names. - See ../clocks/clock-bindings.txt for details. -- clock-names: Must be "pcie0", "pcie1", "pcieN"... based on the number of - root ports. -- reset-gpios: GPIO specs for the reset pins. - -In addition, the device tree node must have sub-nodes describing each PCIe port -interface, having the following mandatory properties: - -Required properties: -- reg: Only the first four bytes are used to refer to the correct bus number - and device number. -- #address-cells: Must be 3 -- #size-cells: Must be 2 -- ranges: Sub-ranges distributed from the PCIe controller node. An empty - property is sufficient. -- bus-range: Range of bus numbers associated with this port. - -Example for MT7621: - - pcie: pcie@1e140000 { - compatible = "mediatek,mt7621-pci"; - reg = <0x1e140000 0x100 /* host-pci bridge registers */ - 0x1e142000 0x100 /* pcie port 0 RC control registers */ - 0x1e143000 0x100 /* pcie port 1 RC control registers */ - 0x1e144000 0x100>; /* pcie port 2 RC control registers */ - - #address-cells = <3>; - #size-cells = <2>; - - pinctrl-names = "default"; - pinctrl-0 = <&pcie_pins>; - - device_type = "pci"; - - bus-range = <0 255>; - ranges = < - 0x02000000 0 0x00000000 0x60000000 0 0x10000000 /* pci memory */ - 0x01000000 0 0x00000000 0x1e160000 0 0x00010000 /* io space */ - >; - - #interrupt-cells = <1>; - interrupt-map-mask = <0xF0000 0 0 1>; - interrupt-map = <0x10000 0 0 1 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>, - <0x20000 0 0 1 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>, - <0x30000 0 0 1 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>; - - status = "disabled"; - - resets = <&rstctrl 24 &rstctrl 25 &rstctrl 26>; - reset-names = "pcie0", "pcie1", "pcie2"; - clocks = <&clkctrl 24 &clkctrl 25 &clkctrl 26>; - clock-names = "pcie0", "pcie1", "pcie2"; - - reset-gpios = <&gpio 19 GPIO_ACTIVE_LOW>, - <&gpio 8 GPIO_ACTIVE_LOW>, - <&gpio 7 GPIO_ACTIVE_LOW>; - - pcie@0,0 { - reg = <0x0000 0 0 0 0>; - #address-cells = <3>; - #size-cells = <2>; - ranges; - bus-range = <0x00 0xff>; - }; - - pcie@1,0 { - reg = <0x0800 0 0 0 0>; - #address-cells = <3>; - #size-cells = <2>; - ranges; - bus-range = <0x00 0xff>; - }; - - pcie@2,0 { - reg = <0x1000 0 0 0 0>; - #address-cells = <3>; - #size-cells = <2>; - ranges; - bus-range = <0x00 0xff>; - }; - }; - From 370ea5aa50d66c6447300d23467cdd1efd0efa72 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Wed, 22 Sep 2021 07:00:35 +0200 Subject: [PATCH 146/512] MAINTAINERS: Add Sergio Paracuellos as MT7621 PCIe maintainer Add myself as maintainer of the PCIe Controller driver for MT7621 SoCs. 
Link: https://lore.kernel.org/r/20210922050035.18162-4-sergio.paracuellos@gmail.com Signed-off-by: Sergio Paracuellos Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ca6d6fde85cf..6f01cef6860c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11842,6 +11842,12 @@ S: Maintained F: Documentation/devicetree/bindings/i2c/i2c-mt7621.txt F: drivers/i2c/busses/i2c-mt7621.c +MEDIATEK MT7621 PCIE CONTROLLER DRIVER +M: Sergio Paracuellos +S: Maintained +F: Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml +F: drivers/pci/controller/pcie-mt7621.c + MEDIATEK MT7621 PHY PCI DRIVER M: Sergio Paracuellos S: Maintained From 3331325c6347492dfbe31f6b2bfdaee9b0689cd5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 10 Sep 2021 08:22:49 +0200 Subject: [PATCH 147/512] PCI/VPD: Use pci_read_vpd_any() in pci_vpd_size() Use new function pci_read_vpd_any() to simplify the code. [bhelgaas: squash in fix for stack overflow reported & tested by Qian [1] and Kunihiko [2]: [1] https://lore.kernel.org/netdev/e89087c5-c495-c5ca-feb1-54cf3a8775c5@quicinc.com/ [2] https://lore.kernel.org/r/2f7e3770-ab47-42b5-719c-f7c661c07d28@socionext.com Link: https://lore.kernel.org/r/6211be8a-5d10-8f3a-6d33-af695dc35caf@gmail.com Reported-by: Qian Cai Tested-by: Qian Cai Reported-by: Kunihiko Hayashi Tested-by: Kunihiko Hayashi ] Link: https://lore.kernel.org/r/049fa71c-c7af-9c69-51c0-05c1bc2bf660@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Acked-by: Jakub Kicinski --- drivers/pci/vpd.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 0a804715d98e..a4fc4d0690fe 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -57,10 +57,7 @@ static size_t pci_vpd_size(struct pci_dev *dev) size_t off = 0, size; unsigned char tag, header[1+2]; /* 1 byte tag, 2 bytes length */ - /* Otherwise the following reads would fail. */ - dev->vpd.len = PCI_VPD_MAX_SIZE; - - while (pci_read_vpd(dev, off, 1, header) == 1) { + while (pci_read_vpd_any(dev, off, 1, header) == 1) { size = 0; if (off == 0 && (header[0] == 0x00 || header[0] == 0xff)) @@ -68,7 +65,7 @@ static size_t pci_vpd_size(struct pci_dev *dev) if (header[0] & PCI_VPD_LRDT) { /* Large Resource Data Type Tag */ - if (pci_read_vpd(dev, off + 1, 2, &header[1]) != 2) { + if (pci_read_vpd_any(dev, off + 1, 2, &header[1]) != 2) { pci_warn(dev, "failed VPD read at offset %zu\n", off + 1); return off ?: PCI_VPD_SZ_INVALID; @@ -99,14 +96,14 @@ error: return off ?: PCI_VPD_SZ_INVALID; } -static bool pci_vpd_available(struct pci_dev *dev) +static bool pci_vpd_available(struct pci_dev *dev, bool check_size) { struct pci_vpd *vpd = &dev->vpd; if (!vpd->cap) return false; - if (vpd->len == 0) { + if (vpd->len == 0 && check_size) { vpd->len = pci_vpd_size(dev); if (vpd->len == PCI_VPD_SZ_INVALID) { vpd->cap = 0; @@ -159,17 +156,19 @@ static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, void *arg, bool check_size) { struct pci_vpd *vpd = &dev->vpd; - unsigned int max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE; + unsigned int max_len; int ret = 0; loff_t end = pos + count; u8 *buf = arg; - if (!pci_vpd_available(dev)) + if (!pci_vpd_available(dev, check_size)) return -ENODEV; if (pos < 0) return -EINVAL; + max_len = check_size ? 
vpd->len : PCI_VPD_MAX_SIZE; + if (pos >= max_len) return 0; @@ -221,17 +220,19 @@ static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count, const void *arg, bool check_size) { struct pci_vpd *vpd = &dev->vpd; - unsigned int max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE; + unsigned int max_len; const u8 *buf = arg; loff_t end = pos + count; int ret = 0; - if (!pci_vpd_available(dev)) + if (!pci_vpd_available(dev, check_size)) return -ENODEV; if (pos < 0 || (pos & 3) || (count & 3)) return -EINVAL; + max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE; + if (end > max_len) return -EINVAL; @@ -315,7 +316,7 @@ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size) void *buf; int cnt; - if (!pci_vpd_available(dev)) + if (!pci_vpd_available(dev, true)) return ERR_PTR(-ENODEV); len = dev->vpd.len; From 48225f1878bde8a4ec3c2a96bbf81161e32f03f8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 10 Sep 2021 08:24:07 +0200 Subject: [PATCH 148/512] cxgb3: Remove t3_seeprom_read and use VPD API Using the VPD API allows to simplify the code and completely get rid of t3_seeprom_read(). Note that we don't have to use pci_read_vpd_any() here because a VPD quirk sets dev->vpd.len to the full EEPROM size. Tested with a T320 card. Link: https://lore.kernel.org/r/68ef15bb-b6bf-40ad-160c-aaa72c4a70f8@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Acked-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb3/common.h | 1 - .../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 27 ++++------ drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 54 +++---------------- 3 files changed, 18 insertions(+), 64 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h index b706f2fbe4f4..56312f9ed5d8 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/common.h +++ b/drivers/net/ethernet/chelsio/cxgb3/common.h @@ -676,7 +676,6 @@ void t3_link_changed(struct adapter *adapter, int port_id); void t3_link_fault(struct adapter *adapter, int port_id); int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); const struct adapter_info *t3_get_adapter_info(unsigned int board_id); -int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data); int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data); int t3_seeprom_wp(struct adapter *adapter, int enable); int t3_get_tp_version(struct adapter *adapter, u32 *vers); diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 38e47703f9ab..d73339682133 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2036,20 +2036,16 @@ static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, { struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; - int i, err = 0; - - u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; + int cnt; e->magic = EEPROM_MAGIC; - for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4) - err = t3_seeprom_read(adapter, i, (__le32 *) & buf[i]); + cnt = pci_read_vpd(adapter->pdev, e->offset, e->len, data); + if (cnt < 0) + return cnt; - if (!err) - memcpy(data, buf + e->offset, e->len); - kfree(buf); - return err; + e->len = cnt; + + return 0; } static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, @@ -2072,12 +2068,9 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, buf = kmalloc(aligned_len, 
GFP_KERNEL); if (!buf) return -ENOMEM; - err = t3_seeprom_read(adapter, aligned_offset, (__le32 *) buf); - if (!err && aligned_len > 4) - err = t3_seeprom_read(adapter, - aligned_offset + aligned_len - 4, - (__le32 *) & buf[aligned_len - 4]); - if (err) + err = pci_read_vpd(adapter->pdev, aligned_offset, aligned_len, + buf); + if (err < 0) goto out; memcpy(buf + (eeprom->offset & 3), data, eeprom->len); } else diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index 7ff31d1026fb..4ecf40b02d7b 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -599,42 +599,6 @@ struct t3_vpd { #define EEPROM_STAT_ADDR 0x4000 #define VPD_BASE 0xc00 -/** - * t3_seeprom_read - read a VPD EEPROM location - * @adapter: adapter to read - * @addr: EEPROM address - * @data: where to store the read data - * - * Read a 32-bit word from a location in VPD EEPROM using the card's PCI - * VPD ROM capability. A zero is written to the flag bit when the - * address is written to the control register. The hardware device will - * set the flag to 1 when 4 bytes have been read into the data register. - */ -int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data) -{ - u16 val; - int attempts = EEPROM_MAX_POLL; - u32 v; - unsigned int base = adapter->params.pci.vpd_cap_addr; - - if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) - return -EINVAL; - - pci_write_config_word(adapter->pdev, base + PCI_VPD_ADDR, addr); - do { - udelay(10); - pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); - } while (!(val & PCI_VPD_ADDR_F) && --attempts); - - if (!(val & PCI_VPD_ADDR_F)) { - CH_ERR(adapter, "reading EEPROM address 0x%x failed\n", addr); - return -EIO; - } - pci_read_config_dword(adapter->pdev, base + PCI_VPD_DATA, &v); - *data = cpu_to_le32(v); - return 0; -} - /** * t3_seeprom_write - write a VPD EEPROM location * @adapter: adapter to write @@ -708,24 +672,22 @@ static int vpdstrtou16(char *s, u8 len, unsigned int base, u16 *val) */ static int get_vpd_params(struct adapter *adapter, struct vpd_params *p) { - int i, addr, ret; struct t3_vpd vpd; + u8 base_val = 0; + int addr, ret; /* * Card information is normally at VPD_BASE but some early cards had * it at 0. */ - ret = t3_seeprom_read(adapter, VPD_BASE, (__le32 *)&vpd); - if (ret) + ret = pci_read_vpd(adapter->pdev, VPD_BASE, 1, &base_val); + if (ret < 0) return ret; - addr = vpd.id_tag == 0x82 ? VPD_BASE : 0; + addr = base_val == PCI_VPD_LRDT_ID_STRING ? VPD_BASE : 0; - for (i = 0; i < sizeof(vpd); i += 4) { - ret = t3_seeprom_read(adapter, addr + i, - (__le32 *)((u8 *)&vpd + i)); - if (ret) - return ret; - } + ret = pci_read_vpd(adapter->pdev, addr, sizeof(vpd), &vpd); + if (ret < 0) + return ret; ret = vpdstrtouint(vpd.cclk_data, vpd.cclk_len, 10, &p->cclk); if (ret) From 43f3b61e37e03ae917ffd62395478efb4fa4eb50 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 10 Sep 2021 08:25:49 +0200 Subject: [PATCH 149/512] cxgb3: Use VPD API in t3_seeprom_wp() Use standard VPD API to replace t3_seeprom_write(), this prepares for removing this function. Chelsio T3 maps the EEPROM write protect flag to an arbitrary place in VPD address space, therefore we have to use pci_write_vpd_any(). 
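For context, and only as a sketch mirroring the hunk below: pci_write_vpd() is bounded by the VPD size the PCI core detected, so a write to the Chelsio write-protect word at 0x4000 would be rejected, while the _any variants introduced earlier in this series skip that size check. The helper name and the address constant below are illustrative stand-ins for the driver's code.

/* Illustrative helper, assuming only the PCI VPD API; the address
 * constant mirrors the driver's EEPROM_STAT_ADDR.
 */
#include <linux/pci.h>

#define T3_EEPROM_STAT_ADDR	0x4000	/* write-protect word, outside VPD */

static int t3_set_eeprom_wp_sketch(struct pci_dev *pdev, bool enable)
{
	u32 data = enable ? 0xc : 0;
	ssize_t ret;

	/* pci_write_vpd() would reject this offset; use the _any variant */
	ret = pci_write_vpd_any(pdev, T3_EEPROM_STAT_ADDR, sizeof(data), &data);
	return ret < 0 ? ret : 0;
}
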
Link: https://lore.kernel.org/r/f768fdbe-3a16-d539-57d2-c7c908294336@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Acked-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index 4ecf40b02d7b..ec4b49ebe62a 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -642,7 +642,14 @@ int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data) */ int t3_seeprom_wp(struct adapter *adapter, int enable) { - return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0); + u32 data = enable ? 0xc : 0; + int ret; + + /* EEPROM_STAT_ADDR is outside VPD area, use pci_write_vpd_any() */ + ret = pci_write_vpd_any(adapter->pdev, EEPROM_STAT_ADDR, sizeof(u32), + &data); + + return ret < 0 ? ret : 0; } static int vpdstrtouint(char *s, u8 len, unsigned int base, unsigned int *val) From 78b5d5c998537a34aec719af0a236119d53db752 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 10 Sep 2021 08:27:08 +0200 Subject: [PATCH 150/512] cxgb3: Remove seeprom_write and use VPD API Using the VPD API allows to simplify the code and completely get rid of t3_seeprom_write(). Link: https://lore.kernel.org/r/a0291004-dda3-ea08-4d6c-a2f8826c8527@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Acked-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb3/common.h | 1 - .../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 11 ++---- drivers/net/ethernet/chelsio/cxgb3/t3_hw.c | 35 ------------------- 3 files changed, 3 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h index 56312f9ed5d8..d115ec93296d 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/common.h +++ b/drivers/net/ethernet/chelsio/cxgb3/common.h @@ -676,7 +676,6 @@ void t3_link_changed(struct adapter *adapter, int port_id); void t3_link_fault(struct adapter *adapter, int port_id); int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); const struct adapter_info *t3_get_adapter_info(unsigned int board_id); -int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data); int t3_seeprom_wp(struct adapter *adapter, int enable); int t3_get_tp_version(struct adapter *adapter, u32 *vers); int t3_check_tpsram_version(struct adapter *adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index d73339682133..e185f5f24849 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2054,7 +2054,6 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; u32 aligned_offset, aligned_len; - __le32 *p; u8 *buf; int err; @@ -2080,17 +2079,13 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, if (err) goto out; - for (p = (__le32 *) buf; !err && aligned_len; aligned_len -= 4, p++) { - err = t3_seeprom_write(adapter, aligned_offset, *p); - aligned_offset += 4; - } - - if (!err) + err = pci_write_vpd(adapter->pdev, aligned_offset, aligned_len, buf); + if (err >= 0) err = t3_seeprom_wp(adapter, 1); out: if (buf != data) kfree(buf); - return err; + return err < 0 ? 
err : 0; } static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index ec4b49ebe62a..cb85c2f21525 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -595,44 +595,9 @@ struct t3_vpd { u32 pad; /* for multiple-of-4 sizing and alignment */ }; -#define EEPROM_MAX_POLL 40 #define EEPROM_STAT_ADDR 0x4000 #define VPD_BASE 0xc00 -/** - * t3_seeprom_write - write a VPD EEPROM location - * @adapter: adapter to write - * @addr: EEPROM address - * @data: value to write - * - * Write a 32-bit word to a location in VPD EEPROM using the card's PCI - * VPD ROM capability. - */ -int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data) -{ - u16 val; - int attempts = EEPROM_MAX_POLL; - unsigned int base = adapter->params.pci.vpd_cap_addr; - - if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) - return -EINVAL; - - pci_write_config_dword(adapter->pdev, base + PCI_VPD_DATA, - le32_to_cpu(data)); - pci_write_config_word(adapter->pdev,base + PCI_VPD_ADDR, - addr | PCI_VPD_ADDR_F); - do { - msleep(1); - pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); - } while ((val & PCI_VPD_ADDR_F) && --attempts); - - if (val & PCI_VPD_ADDR_F) { - CH_ERR(adapter, "write to EEPROM address 0x%x failed\n", addr); - return -EIO; - } - return 0; -} - /** * t3_seeprom_wp - enable/disable EEPROM write protection * @adapter: the adapter From 3826350e6dd435e244eb6e47abad5a47c169ebc2 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Thu, 14 Oct 2021 09:58:24 +0200 Subject: [PATCH 151/512] s390/ap: Fix hanging ioctl caused by orphaned replies When a queue is switched to soft offline during heavy load and later switched to soft online again and now used, it may be that the caller is blocked forever in the ioctl call. The failure occurs because there is a pending reply after the queue(s) have been switched to offline. This orphaned reply is received when the queue is switched to online and is accidentally counted for the outstanding replies. So when there was a valid outstanding reply and this orphaned reply is received it counts as the outstanding one thus dropping the outstanding counter to 0. Voila, with this counter the receive function is not called any more and the real outstanding reply is never received (until another request comes in...) and the ioctl blocks. The fix is simple. However, instead of readjusting the counter when an orphaned reply is detected, I check the queue status for not empty and compare this to the outstanding counter. So if the queue is not empty then the counter must not drop to 0 but at least have a value of 1. 
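Sketched in C for clarity before the actual two-line hunk below; aq and status stand for the driver's struct ap_queue and the AP queue status word just read, and the timeline in the comment paraphrases the scenario described above.

/* Pseudo-C restatement of the fix; the real change is the two added
 * lines in ap_sm_recv() below.
 *
 * Failure scenario, paraphrased:
 *   queue goes offline with one reply still queued (the "orphan")
 *   queue goes online, a new request is sent    -> aq->queue_count == 1
 *   the orphaned reply arrives and is counted   -> aq->queue_count == 0
 *   the real reply is never fetched, the ioctl blocks
 */
static void account_reply_sketch(struct ap_queue *aq,
				 struct ap_queue_status status)
{
	/* one expected reply has been consumed */
	aq->queue_count = max_t(int, 0, aq->queue_count - 1);

	/*
	 * If hardware still reports queued replies but the counter just
	 * hit zero, the consumed reply was an orphan from before the
	 * offline/online cycle: keep expecting the real one.
	 */
	if (!status.queue_empty && !aq->queue_count)
		aq->queue_count++;
}
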
Signed-off-by: Harald Freudenberger Cc: stable@vger.kernel.org Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_queue.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 9ea48bf0ee40..032bf7b282ba 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -157,6 +157,8 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq) switch (status.response_code) { case AP_RESPONSE_NORMAL: aq->queue_count = max_t(int, 0, aq->queue_count - 1); + if (!status.queue_empty && !aq->queue_count) + aq->queue_count++; if (aq->queue_count > 0) mod_timer(&aq->timeout, jiffies + aq->request_timeout); From 3f74eb5f78198a88ebbad7b1d8168f7ea34b3f1a Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 15 Oct 2021 12:00:22 +0200 Subject: [PATCH 152/512] s390/zcrypt: rework of debug feature messages This patch reworks all the debug feature invocations to be more uniform. All invocations now use the macro with the level already part of the macro name. All messages now start with %s filled with __func__ (well there are still some exceptions), and some message text has been shortened or reworked. There is no functional code touched with this patch. Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 72 +++++++++++++------------- drivers/s390/crypto/ap_debug.h | 2 +- drivers/s390/crypto/ap_queue.c | 7 +-- drivers/s390/crypto/zcrypt_api.c | 45 ++++++++-------- drivers/s390/crypto/zcrypt_card.c | 8 +-- drivers/s390/crypto/zcrypt_debug.h | 2 +- drivers/s390/crypto/zcrypt_error.h | 22 ++++---- drivers/s390/crypto/zcrypt_msgtype50.c | 18 +++---- drivers/s390/crypto/zcrypt_msgtype6.c | 40 +++++++------- drivers/s390/crypto/zcrypt_queue.c | 17 +++--- 10 files changed, 116 insertions(+), 117 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index d9b804943d19..38d71e076048 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -725,7 +725,7 @@ static void ap_check_bindings_complete(void) if (bound == apqns) { if (!completion_done(&ap_init_apqn_bindings_complete)) { complete_all(&ap_init_apqn_bindings_complete); - AP_DBF(DBF_INFO, "%s complete\n", __func__); + AP_DBF_INFO("%s complete\n", __func__); } ap_send_bindings_complete_uevent(); } @@ -786,8 +786,8 @@ static int __ap_revise_reserved(struct device *dev, void *dummy) drvres = to_ap_drv(dev->driver)->flags & AP_DRIVER_FLAG_DEFAULT; if (!!devres != !!drvres) { - AP_DBF_DBG("reprobing queue=%02x.%04x\n", - card, queue); + AP_DBF_DBG("%s reprobing queue=%02x.%04x\n", + __func__, card, queue); rc = device_reprobe(dev); } } @@ -1118,7 +1118,8 @@ static ssize_t ap_domain_store(struct bus_type *bus, ap_domain_index = domain; spin_unlock_bh(&ap_domain_lock); - AP_DBF_INFO("stored new default domain=%d\n", domain); + AP_DBF_INFO("%s stored new default domain=%d\n", + __func__, domain); return count; } @@ -1433,8 +1434,9 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) /* < CEX2A is not supported */ if (rawtype < AP_DEVICE_TYPE_CEX2A) { - AP_DBF_WARN("get_comp_type queue=%02x.%04x unsupported type %d\n", - AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype); + AP_DBF_WARN("%s queue=%02x.%04x unsupported type %d\n", + __func__, AP_QID_CARD(qid), + AP_QID_QUEUE(qid), rawtype); return 0; } /* up to CEX7 known and fully supported */ @@ -1458,11 +1460,12 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) 
comp_type = apinfo.cat; } if (!comp_type) - AP_DBF_WARN("get_comp_type queue=%02x.%04x unable to map type %d\n", - AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype); + AP_DBF_WARN("%s queue=%02x.%04x unable to map type %d\n", + __func__, AP_QID_CARD(qid), + AP_QID_QUEUE(qid), rawtype); else if (comp_type != rawtype) - AP_DBF_INFO("get_comp_type queue=%02x.%04x map type %d to %d\n", - AP_QID_CARD(qid), AP_QID_QUEUE(qid), + AP_DBF_INFO("%s queue=%02x.%04x map type %d to %d\n", + __func__, AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype, comp_type); return comp_type; } @@ -1535,7 +1538,7 @@ static inline void ap_scan_domains(struct ap_card *ac) aq = dev ? to_ap_queue(dev) : NULL; if (!ap_test_config_usage_domain(dom)) { if (dev) { - AP_DBF_INFO("%s(%d,%d) not in config any more, rm queue device\n", + AP_DBF_INFO("%s(%d,%d) not in config anymore, rm queue dev\n", __func__, ac->id, dom); device_unregister(dev); put_device(dev); @@ -1545,9 +1548,8 @@ static inline void ap_scan_domains(struct ap_card *ac) /* domain is valid, get info from this APQN */ if (!ap_queue_info(qid, &type, &func, &depth, &ml, &decfg)) { if (aq) { - AP_DBF_INFO( - "%s(%d,%d) ap_queue_info() not successful, rm queue device\n", - __func__, ac->id, dom); + AP_DBF_INFO("%s(%d,%d) queue_info() failed, rm queue dev\n", + __func__, ac->id, dom); device_unregister(dev); put_device(dev); } @@ -1577,10 +1579,10 @@ static inline void ap_scan_domains(struct ap_card *ac) /* get it and thus adjust reference counter */ get_device(dev); if (decfg) - AP_DBF_INFO("%s(%d,%d) new (decfg) queue device created\n", + AP_DBF_INFO("%s(%d,%d) new (decfg) queue dev created\n", __func__, ac->id, dom); else - AP_DBF_INFO("%s(%d,%d) new queue device created\n", + AP_DBF_INFO("%s(%d,%d) new queue dev created\n", __func__, ac->id, dom); goto put_dev_and_continue; } @@ -1594,7 +1596,7 @@ static inline void ap_scan_domains(struct ap_card *ac) aq->last_err_rc = AP_RESPONSE_DECONFIGURED; } spin_unlock_bh(&aq->lock); - AP_DBF_INFO("%s(%d,%d) queue device config off\n", + AP_DBF_INFO("%s(%d,%d) queue dev config off\n", __func__, ac->id, dom); ap_send_config_uevent(&aq->ap_dev, aq->config); /* 'receive' pending messages with -EAGAIN */ @@ -1609,7 +1611,7 @@ static inline void ap_scan_domains(struct ap_card *ac) aq->sm_state = AP_SM_STATE_RESET_START; } spin_unlock_bh(&aq->lock); - AP_DBF_INFO("%s(%d,%d) queue device config on\n", + AP_DBF_INFO("%s(%d,%d) queue dev config on\n", __func__, ac->id, dom); ap_send_config_uevent(&aq->ap_dev, aq->config); goto put_dev_and_continue; @@ -1621,7 +1623,7 @@ static inline void ap_scan_domains(struct ap_card *ac) ap_flush_queue(aq); /* re-init (with reset) the queue device */ ap_queue_init_state(aq); - AP_DBF_INFO("%s(%d,%d) queue device reinit enforced\n", + AP_DBF_INFO("%s(%d,%d) queue dev reinit enforced\n", __func__, ac->id, dom); goto put_dev_and_continue; } @@ -1653,7 +1655,7 @@ static inline void ap_scan_adapter(int ap) /* Adapter not in configuration ? 
*/ if (!ap_test_config_card_id(ap)) { if (ac) { - AP_DBF_INFO("%s(%d) ap not in config any more, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) ap not in config any more, rm card and queue devs\n", __func__, ap); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); @@ -1678,9 +1680,8 @@ static inline void ap_scan_adapter(int ap) if (dom > ap_max_domain_id) { /* Could not find a valid APQN for this adapter */ if (ac) { - AP_DBF_INFO( - "%s(%d) no type info (no APQN found), rm card and queue devices\n", - __func__, ap); + AP_DBF_INFO("%s(%d) no type info (no APQN found), rm card and queue devs\n", + __func__, ap); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); } else { @@ -1692,7 +1693,7 @@ static inline void ap_scan_adapter(int ap) if (!type) { /* No apdater type info available, an unusable adapter */ if (ac) { - AP_DBF_INFO("%s(%d) no valid type (0) info, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) no valid type (0) info, rm card and queue devs\n", __func__, ap); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); @@ -1706,13 +1707,13 @@ static inline void ap_scan_adapter(int ap) if (ac) { /* Check APQN against existing card device for changes */ if (ac->raw_hwtype != type) { - AP_DBF_INFO("%s(%d) hwtype %d changed, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) hwtype %d changed, rm card and queue devs\n", __func__, ap, type); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); ac = NULL; } else if (ac->functions != func) { - AP_DBF_INFO("%s(%d) functions 0x%08x changed, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) functions 0x%08x changed, rm card and queue devs\n", __func__, ap, type); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); @@ -1720,13 +1721,13 @@ static inline void ap_scan_adapter(int ap) } else { if (decfg && ac->config) { ac->config = false; - AP_DBF_INFO("%s(%d) card device config off\n", + AP_DBF_INFO("%s(%d) card dev config off\n", __func__, ap); ap_send_config_uevent(&ac->ap_dev, ac->config); } if (!decfg && !ac->config) { ac->config = true; - AP_DBF_INFO("%s(%d) card device config on\n", + AP_DBF_INFO("%s(%d) card dev config on\n", __func__, ap); ap_send_config_uevent(&ac->ap_dev, ac->config); } @@ -1756,7 +1757,8 @@ static inline void ap_scan_adapter(int ap) if (ac->maxmsgsize > atomic_read(&ap_max_msg_size)) { atomic_set(&ap_max_msg_size, ac->maxmsgsize); AP_DBF_INFO("%s(%d) ap_max_msg_size update to %d byte\n", - __func__, ap, atomic_read(&ap_max_msg_size)); + __func__, ap, + atomic_read(&ap_max_msg_size)); } /* Register the new card device with AP bus */ rc = device_register(dev); @@ -1769,10 +1771,10 @@ static inline void ap_scan_adapter(int ap) /* get it and thus adjust reference counter */ get_device(dev); if (decfg) - AP_DBF_INFO("%s(%d) new (decfg) card device type=%d func=0x%08x created\n", + AP_DBF_INFO("%s(%d) new (decfg) card dev type=%d func=0x%08x created\n", __func__, ap, type, func); else - AP_DBF_INFO("%s(%d) new card device type=%d func=0x%08x created\n", + AP_DBF_INFO("%s(%d) new card dev type=%d func=0x%08x created\n", __func__, ap, type, func); } @@ -1810,12 +1812,12 @@ static void ap_scan_bus(struct work_struct *unused) if (dev) put_device(dev); else - AP_DBF_INFO("no queue device with default domain %d available\n", - ap_domain_index); + AP_DBF_INFO("%s no queue device with default domain %d available\n", + __func__, ap_domain_index); } if (atomic64_inc_return(&ap_scan_bus_count) == 1) { - AP_DBF(DBF_DEBUG, "%s init scan complete\n", __func__); + AP_DBF_DBG("%s init scan complete\n", 
__func__); ap_send_init_scan_done_uevent(); ap_check_bindings_complete(); } @@ -1830,7 +1832,7 @@ static void ap_config_timeout(struct timer_list *unused) static int __init ap_debug_init(void) { - ap_dbf_info = debug_register("ap", 1, 1, + ap_dbf_info = debug_register("ap", 2, 1, DBF_MAX_SPRINTF_ARGS * sizeof(long)); debug_register_view(ap_dbf_info, &debug_sprintf_view); debug_set_level(ap_dbf_info, DBF_ERR); diff --git a/drivers/s390/crypto/ap_debug.h b/drivers/s390/crypto/ap_debug.h index 34b0350d0b1a..c083ce88a9a6 100644 --- a/drivers/s390/crypto/ap_debug.h +++ b/drivers/s390/crypto/ap_debug.h @@ -16,7 +16,7 @@ #define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO) #define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO) -#define DBF_MAX_SPRINTF_ARGS 5 +#define DBF_MAX_SPRINTF_ARGS 6 #define AP_DBF(...) \ debug_sprintf_event(ap_dbf_info, ##__VA_ARGS__) diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 032bf7b282ba..1901449768dd 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -248,6 +248,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq) if (aq->requestq_count <= 0) return AP_SM_WAIT_NONE; + /* Start the next request on the queue. */ ap_msg = list_entry(aq->requestq.next, struct ap_message, list); #ifdef CONFIG_ZCRYPT_DEBUG @@ -281,7 +282,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq) aq->sm_state = AP_SM_STATE_RESET_WAIT; return AP_SM_WAIT_TIMEOUT; case AP_RESPONSE_INVALID_DOMAIN: - AP_DBF(DBF_WARN, "AP_RESPONSE_INVALID_DOMAIN on NQAP\n"); + AP_DBF_WARN("%s RESPONSE_INVALID_DOMAIN on NQAP\n", __func__); fallthrough; case AP_RESPONSE_MESSAGE_TOO_BIG: case AP_RESPONSE_REQ_FAC_NOT_INST: @@ -573,8 +574,8 @@ static ssize_t reset_store(struct device *dev, ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); - AP_DBF(DBF_INFO, "reset queue=%02x.%04x triggered by user\n", - AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid)); + AP_DBF_INFO("%s reset queue=%02x.%04x triggered by user\n", + __func__, AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid)); return count; } diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 356318746dd1..4c3dcc435e83 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -82,8 +82,8 @@ static inline int zcrypt_process_rescan(void) atomic_set(&zcrypt_rescan_req, 0); atomic_inc(&zcrypt_rescan_count); ap_bus_force_rescan(); - ZCRYPT_DBF(DBF_INFO, "rescan count=%07d\n", - atomic_inc_return(&zcrypt_rescan_count)); + ZCRYPT_DBF_INFO("%s rescan count=%07d\n", __func__, + atomic_inc_return(&zcrypt_rescan_count)); return 1; } return 0; @@ -341,8 +341,8 @@ static void zcdn_device_release(struct device *dev) { struct zcdn_device *zcdndev = to_zcdn_dev(dev); - ZCRYPT_DBF(DBF_INFO, "releasing zcdn device %d:%d\n", - MAJOR(dev->devt), MINOR(dev->devt)); + ZCRYPT_DBF_INFO("%s releasing zcdn device %d:%d\n", + __func__, MAJOR(dev->devt), MINOR(dev->devt)); kfree(zcdndev); } @@ -407,8 +407,8 @@ static int zcdn_create(const char *name) goto unlockout; } - ZCRYPT_DBF(DBF_INFO, "created zcdn device %d:%d\n", - MAJOR(devt), MINOR(devt)); + ZCRYPT_DBF_INFO("%s created zcdn device %d:%d\n", + __func__, MAJOR(devt), MINOR(devt)); unlockout: mutex_unlock(&ap_perms_mutex); @@ -550,9 +550,8 @@ static inline int zcrypt_check_ioctl(struct ap_perms *perms, } if (rc) - ZCRYPT_DBF(DBF_WARN, - "ioctl check failed: ioctlnr=0x%04x rc=%d\n", - ioctlnr, rc); + ZCRYPT_DBF_WARN("%s ioctl check failed: ioctlnr=0x%04x rc=%d\n", + __func__, ioctlnr, rc); return 
rc; } @@ -1446,7 +1445,7 @@ static int icarsamodexpo_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) { - ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc); + ZCRYPT_DBF_DBG("ioctl ICARSAMODEXPO rc=%d\n", rc); return rc; } return put_user(mex.outputdatalength, &umex->outputdatalength); @@ -1491,7 +1490,7 @@ static int icarsacrt_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) { - ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc); + ZCRYPT_DBF_DBG("ioctl ICARSACRT rc=%d\n", rc); return rc; } return put_user(crt.outputdatalength, &ucrt->outputdatalength); @@ -1509,12 +1508,12 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg) return -EFAULT; #ifdef CONFIG_ZCRYPT_DEBUG - if (xcRB.status & (1U << 31)) { + if ((xcRB.status & 0x8000FFFF) == 0x80004649 /* 'FI' */) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; tr.fi.cmd = (u16)(xcRB.status >> 16); } - xcRB.status &= 0x0000FFFF; + xcRB.status = 0; #endif do { @@ -1536,8 +1535,8 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) - ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n", - rc, xcRB.status); + ZCRYPT_DBF_DBG("ioctl ZSENDCPRB rc=%d status=0x%x\n", + rc, xcRB.status); if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB))) return -EFAULT; return rc; @@ -1582,7 +1581,7 @@ static int zsendep11cprb_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) - ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc); + ZCRYPT_DBF_DBG("ioctl ZSENDEP11CPRB rc=%d\n", rc); if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb))) return -EFAULT; return rc; @@ -1709,7 +1708,7 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, } /* unknown ioctl number */ default: - ZCRYPT_DBF(DBF_DEBUG, "unknown ioctl 0x%08x\n", cmd); + ZCRYPT_DBF_DBG("unknown ioctl 0x%08x\n", cmd); return -ENOIOCTLCMD; } } @@ -2048,16 +2047,14 @@ int zcrypt_wait_api_operational(void) break; case -ETIME: /* timeout */ - ZCRYPT_DBF(DBF_WARN, - "%s ap_wait_init_apqn_bindings_complete() returned with ETIME\n", - __func__); + ZCRYPT_DBF_WARN("%s ap_wait_init_apqn_bindings_complete()=ETIME\n", + __func__); zcrypt_wait_api_state = -ETIME; break; default: /* other failure */ - ZCRYPT_DBF(DBF_DEBUG, - "%s ap_wait_init_apqn_bindings_complete() failure rc=%d\n", - __func__, rc); + ZCRYPT_DBF_DBG("%s ap_wait_init_apqn_bindings_complete()=%d\n", + __func__, rc); break; } break; @@ -2079,7 +2076,7 @@ EXPORT_SYMBOL(zcrypt_wait_api_operational); int __init zcrypt_debug_init(void) { - zcrypt_dbf_info = debug_register("zcrypt", 1, 1, + zcrypt_dbf_info = debug_register("zcrypt", 2, 1, DBF_MAX_SPRINTF_ARGS * sizeof(long)); debug_register_view(zcrypt_dbf_info, &debug_sprintf_view); debug_set_level(zcrypt_dbf_info, DBF_ERR); diff --git a/drivers/s390/crypto/zcrypt_card.c b/drivers/s390/crypto/zcrypt_card.c index ef11d2a0ca6c..3e259befd30a 100644 --- a/drivers/s390/crypto/zcrypt_card.c +++ b/drivers/s390/crypto/zcrypt_card.c @@ -76,7 +76,7 @@ static ssize_t online_store(struct device *dev, zc->online = online; id = zc->card->id; - ZCRYPT_DBF(DBF_INFO, "card=%02x online=%d\n", id, online); + ZCRYPT_DBF_INFO("%s card=%02x online=%d\n", __func__, id, online); ap_send_online_uevent(&ac->ap_dev, online); @@ -189,7 +189,8 @@ int 
zcrypt_card_register(struct zcrypt_card *zc) zc->online = 1; - ZCRYPT_DBF(DBF_INFO, "card=%02x register online=1\n", zc->card->id); + ZCRYPT_DBF_INFO("%s card=%02x register online=1\n", + __func__, zc->card->id); rc = sysfs_create_group(&zc->card->ap_dev.device.kobj, &zcrypt_card_attr_group); @@ -211,7 +212,8 @@ EXPORT_SYMBOL(zcrypt_card_register); */ void zcrypt_card_unregister(struct zcrypt_card *zc) { - ZCRYPT_DBF(DBF_INFO, "card=%02x unregister\n", zc->card->id); + ZCRYPT_DBF_INFO("%s card=%02x unregister\n", + __func__, zc->card->id); spin_lock(&zcrypt_list_lock); list_del_init(&zc->list); diff --git a/drivers/s390/crypto/zcrypt_debug.h b/drivers/s390/crypto/zcrypt_debug.h index 3225489a1c41..5cf88aabd64b 100644 --- a/drivers/s390/crypto/zcrypt_debug.h +++ b/drivers/s390/crypto/zcrypt_debug.h @@ -17,7 +17,7 @@ #define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO) #define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO) -#define DBF_MAX_SPRINTF_ARGS 5 +#define DBF_MAX_SPRINTF_ARGS 6 #define ZCRYPT_DBF(...) \ debug_sprintf_event(zcrypt_dbf_info, ##__VA_ARGS__) diff --git a/drivers/s390/crypto/zcrypt_error.h b/drivers/s390/crypto/zcrypt_error.h index 39e626e3a379..8b0ce600b749 100644 --- a/drivers/s390/crypto/zcrypt_error.h +++ b/drivers/s390/crypto/zcrypt_error.h @@ -98,9 +98,8 @@ static inline int convert_error(struct zcrypt_queue *zq, case REP88_ERROR_MESSAGE_MALFORMD: /* 0x22 */ case REP88_ERROR_KEY_TYPE: /* 0x34 */ /* RY indicates malformed request */ - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x => rc=EINVAL\n", - card, queue, ehdr->reply_code); + ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EINVAL\n", + __func__, card, queue, ehdr->reply_code); return -EINVAL; case REP82_ERROR_MACHINE_FAILURE: /* 0x10 */ case REP82_ERROR_MESSAGE_TYPE: /* 0x20 */ @@ -119,19 +118,18 @@ static inline int convert_error(struct zcrypt_queue *zq, } __packed * head = reply->msg; unsigned int apfs = *((u32 *)head->fmt2.apfs); - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x apfs=0x%x => bus rescan, rc=EAGAIN\n", - card, queue, ehdr->reply_code, apfs); + ZCRYPT_DBF_WARN( + "%s dev=%02x.%04x RY=0x%02x apfs=0x%x => bus rescan, rc=EAGAIN\n", + __func__, card, queue, ehdr->reply_code, apfs); } else - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x => bus rescan, rc=EAGAIN\n", - card, queue, ehdr->reply_code); + ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => bus rescan, rc=EAGAIN\n", + __func__, card, queue, + ehdr->reply_code); return -EAGAIN; default: /* Assume request is valid and a retry will be worth it */ - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x => rc=EAGAIN\n", - card, queue, ehdr->reply_code); + ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EAGAIN\n", + __func__, card, queue, ehdr->reply_code); return -EAGAIN; } } diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c index 99937f3e1d49..f42e8c511184 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.c +++ b/drivers/s390/crypto/zcrypt_msgtype50.c @@ -369,12 +369,10 @@ static int convert_type80(struct zcrypt_queue *zq, zq->online = 0; pr_err("Crypto dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n", AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - t80h->code); - ZCRYPT_DBF_ERR("dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - t80h->code); + AP_QID_QUEUE(zq->queue->qid), t80h->code); + ZCRYPT_DBF_ERR("%s dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), 
t80h->code); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -409,10 +407,10 @@ static int convert_response_cex2a(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) rtype); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) rtype); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) rtype); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index bc5a8c31ba73..8582dd0d6969 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -649,8 +649,8 @@ static int convert_type86_ica(struct zcrypt_queue *zq, (service_rc == 8 && service_rs == 72) || (service_rc == 8 && service_rs == 770) || (service_rc == 12 && service_rs == 769)) { - ZCRYPT_DBF_WARN("dev=%02x.%04x rc/rs=%d/%d => rc=EINVAL\n", - AP_QID_CARD(zq->queue->qid), + ZCRYPT_DBF_WARN("%s dev=%02x.%04x rc/rs=%d/%d => rc=EINVAL\n", + __func__, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) service_rc, (int) service_rs); return -EINVAL; @@ -660,8 +660,8 @@ static int convert_type86_ica(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) service_rc, (int) service_rs); - ZCRYPT_DBF_ERR("dev=%02x.%04x rc/rs=%d/%d => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), + ZCRYPT_DBF_ERR("%s dev=%02x.%04x rc/rs=%d/%d => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) service_rc, (int) service_rs); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); @@ -806,10 +806,10 @@ static int convert_response_ica(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -841,10 +841,10 @@ static int convert_response_xcrb(bool userspace, struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -871,10 +871,10 @@ static int convert_response_ep11_xcrb(bool userspace, struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, 
AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -902,10 +902,10 @@ static int convert_response_rng(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c index 398bde237e37..1552a850a52e 100644 --- a/drivers/s390/crypto/zcrypt_queue.c +++ b/drivers/s390/crypto/zcrypt_queue.c @@ -65,10 +65,9 @@ static ssize_t online_store(struct device *dev, return -EINVAL; zq->online = online; - ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x online=%d\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - online); + ZCRYPT_DBF_INFO("%s queue=%02x.%04x online=%d\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), online); ap_send_online_uevent(&aq->ap_dev, online); @@ -175,8 +174,9 @@ int zcrypt_queue_register(struct zcrypt_queue *zq) zq->zcard = zc; zq->online = 1; /* New devices are online by default. */ - ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x register online=1\n", - AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid)); + ZCRYPT_DBF_INFO("%s queue=%02x.%04x register online=1\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid)); list_add_tail(&zq->list, &zc->zqueues); spin_unlock(&zcrypt_list_lock); @@ -215,8 +215,9 @@ void zcrypt_queue_unregister(struct zcrypt_queue *zq) { struct zcrypt_card *zc; - ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x unregister\n", - AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid)); + ZCRYPT_DBF_INFO("%s queue=%02x.%04x unregister\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid)); zc = zq->zcard; spin_lock(&zcrypt_list_lock); From 273cd173a1e09e250ba2f8841c95343a3aeec7b5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 11 Jan 2021 11:01:55 +0100 Subject: [PATCH 153/512] s390/pgtable: use physical address for Page-Table Origin Instructions IPTE, IDTE and CRDTE accept Page-Table Origin as one of the arguments, but instead the pgtable virtual address is passed. Fix that and also update the crdte() prototype to conform to csp() and cspg() friends. 
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pgtable.h | 12 ++++++------ arch/s390/mm/pageattr.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b61426c9ef17..0611986b551f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -583,11 +583,11 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new #define CRDTE_DTT_REGION1 0x1cUL static inline void crdte(unsigned long old, unsigned long new, - unsigned long table, unsigned long dtt, + unsigned long *table, unsigned long dtt, unsigned long address, unsigned long asce) { union register_pair r1 = { .even = old, .odd = new, }; - union register_pair r2 = { .even = table | dtt, .odd = address, }; + union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, }; asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0" : [r1] "+&d" (r1.pair) @@ -1001,7 +1001,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, unsigned long opt, unsigned long asce, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); if (__builtin_constant_p(opt) && opt == 0) { /* Invalidation + TLB flush for the pte */ @@ -1023,7 +1023,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, static __always_inline void __ptep_ipte_range(unsigned long address, int nr, pte_t *ptep, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); /* Invalidate a range of ptes + TLB flush of the ptes */ do { @@ -1484,7 +1484,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t); if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ asm volatile( @@ -1510,7 +1510,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); + r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index fdc86c0e4e6c..654019181a37 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -57,7 +57,7 @@ void arch_report_meminfo(struct seq_file *m) static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long dtt) { - unsigned long table, mask; + unsigned long *table, mask; mask = 0; if (MACHINE_HAS_EDAT2) { @@ -72,7 +72,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); break; } - table = (unsigned long)old & mask; + table = (unsigned long *)((unsigned long)old & mask); crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); } else if (MACHINE_HAS_IDTE) { cspg(old, *old, new); From 5caca32fba20050caa938dd444eb19cdd320217c Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 18 Jun 2021 08:46:32 +0200 Subject: [PATCH 154/512] s390/cpcmd: use physical address for command and response Virtual Console Function DIAGNOSE 8 accepts physical addresses of command and response strings. 
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/cpcmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 54efc279f54e..72e106cfd8c7 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -29,7 +29,7 @@ static int diag8_noresponse(int cmdlen) asm volatile( " diag %[rx],%[ry],0x8\n" : [ry] "+&d" (cmdlen) - : [rx] "d" ((addr_t) cpcmd_buf) + : [rx] "d" (__pa(cpcmd_buf)) : "cc"); return cmdlen; } @@ -39,8 +39,8 @@ static int diag8_response(int cmdlen, char *response, int *rlen) union register_pair rx, ry; int cc; - rx.even = (addr_t) cpcmd_buf; - rx.odd = (addr_t) response; + rx.even = __pa(cpcmd_buf); + rx.odd = __pa(response); ry.even = cmdlen | 0x40000000L; ry.odd = *rlen; asm volatile( From e035389b73b11e787cb58999d31532971f9eb950 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Oct 2021 12:14:09 +0200 Subject: [PATCH 155/512] s390/setup: use virtual address for STSI instruction Provide a virtual memory pointer for the system-information block. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 191fc96a41b2..5f0b0dca41eb 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -879,14 +879,12 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE, - PAGE_SIZE); + vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!vmms) panic("Failed to allocate memory for sysinfo structure\n"); - if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); - memblock_free((unsigned long) vmms, PAGE_SIZE); + memblock_free_ptr(vmms, PAGE_SIZE); } /* From 04f11ed7d8e018e1f01ebda5814ddfeb3a1e6ae1 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 21 Jan 2021 13:06:02 +0100 Subject: [PATCH 156/512] s390/setup: use physical pointers for memblock_reserve() The memblock_reserve() function accepts the physical address of a memory block to be reserved, but is provided with virtual memory pointers. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 5f0b0dca41eb..9dbde3697bdf 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -805,13 +805,10 @@ static void __init check_initrd(void) */ static void __init reserve_kernel(void) { - unsigned long start_pfn = PFN_UP(__pa(_end)); - memblock_reserve(0, STARTUP_NORMAL_OFFSET); - memblock_reserve((unsigned long)sclp_early_sccb, EXT_SCCB_READ_SCP); memblock_reserve(__amode31_base, __eamode31 - __samode31); - memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - - (unsigned long)_stext); + memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); + memblock_reserve(__pa(_stext), _end - _stext); } static void __init setup_memory(void) From dd9089b65407756e3490ab2737373f957a650375 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 7 Oct 2021 15:01:39 +0200 Subject: [PATCH 157/512] s390/setup: convert start and end initrd pointers to virtual Variables initrd_start and initrd_end are expected to hold virtual memory pointers, not physical.
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 9dbde3697bdf..860a4e6ebaf9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -719,7 +719,7 @@ static void __init reserve_initrd(void) #ifdef CONFIG_BLK_DEV_INITRD if (!initrd_data.start || !initrd_data.size) return; - initrd_start = initrd_data.start; + initrd_start = (unsigned long)__va(initrd_data.start); initrd_end = initrd_start + initrd_data.size; memblock_reserve(initrd_data.start, initrd_data.size); #endif From ada1da31ce34248bc97ca8f801f2cf6efa378a81 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 8 Feb 2021 16:01:17 +0100 Subject: [PATCH 158/512] s390/sclp: sort out physical vs virtual pointers usage Provide physical addresses wherever the hardware interface expects one, or where a 32-bit value is used for tracking. The variable sclp_early_sccb gets initialized in the decompressor and points to an address in physical memory. Yet, it is used as a virtual memory pointer and therefore should be converted. Note, the other two __bootdata variables sclp_info_sccb and sclp_info_sccb_valid contain plain data, but no pointers, and do not need any special care. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/sclp.h | 1 + arch/s390/kernel/early.c | 1 + drivers/s390/char/sclp.c | 14 +++++++------- drivers/s390/char/sclp.h | 2 +- drivers/s390/char/sclp_early.c | 5 +++++ drivers/s390/char/sclp_sd.c | 2 +- 6 files changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index e3ae937bef1c..c68ea35de498 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -117,6 +117,7 @@ struct zpci_report_error_header { extern char *sclp_early_sccb; +void sclp_early_adjust_va(void); void sclp_early_set_buffer(void *sccb); int sclp_early_read_info(void); int sclp_early_read_storage_info(void); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 9857cb046726..ba3ace5ac73c 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -296,6 +296,7 @@ static void __init check_image_bootable(void) void __init startup_init(void) { + sclp_early_adjust_va(); reset_tod_clock(); check_image_bootable(); time_early_init(); diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 2cf7fe131ece..f0763e36b861 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -163,7 +163,7 @@ static inline void sclp_trace_req(int prio, char *id, struct sclp_req *req, summary.timeout = (u16)req->queue_timeout; summary.start_count = (u16)req->start_count; - sclp_trace(prio, id, (u32)(addr_t)sccb, summary.b, err); + sclp_trace(prio, id, __pa(sccb), summary.b, err); } static inline void sclp_trace_register(int prio, char *id, u32 a, u64 b, @@ -502,7 +502,7 @@ sclp_add_request(struct sclp_req *req) } /* RQAD: Request was added (a=sccb, b=caller) */ - sclp_trace(2, "RQAD", (u32)(addr_t)req->sccb, _RET_IP_, false); + sclp_trace(2, "RQAD", __pa(req->sccb), _RET_IP_, false); req->status = SCLP_REQ_QUEUED; req->start_count = 0; @@ -617,15 +617,15 @@ __sclp_find_req(u32 sccb) list_for_each(l, &sclp_req_queue) { req = list_entry(l, struct sclp_req, list); - if (sccb == (u32) (addr_t) req->sccb) - return req; + if (sccb == __pa(req->sccb)) + return req; } return NULL; } static bool
ok_response(u32 sccb_int, sclp_cmdw_t cmd) { - struct sccb_header *sccb = (struct sccb_header *)(addr_t)sccb_int; + struct sccb_header *sccb = (struct sccb_header *)__va(sccb_int); struct evbuf_header *evbuf; u16 response; @@ -664,7 +664,7 @@ static void sclp_interrupt_handler(struct ext_code ext_code, /* INT: Interrupt received (a=intparm, b=cmd) */ sclp_trace_sccb(0, "INT", param32, active_cmd, active_cmd, - (struct sccb_header *)(addr_t)finished_sccb, + (struct sccb_header *)__va(finished_sccb), !ok_response(finished_sccb, active_cmd)); if (finished_sccb) { @@ -1110,7 +1110,7 @@ static void sclp_check_handler(struct ext_code ext_code, /* Is this the interrupt we are waiting for? */ if (finished_sccb == 0) return; - if (finished_sccb != (u32) (addr_t) sclp_init_sccb) + if (finished_sccb != __pa(sclp_init_sccb)) panic("sclp: unsolicited interrupt for buffer at 0x%x\n", finished_sccb); spin_lock(&sclp_lock); diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 5e434108aae6..8a30e77db469 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -333,7 +333,7 @@ static inline int sclp_service_call(sclp_cmdw_t command, void *sccb) "2:\n" EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "+&d" (cc) : "d" (command), "a" ((unsigned long)sccb) + : "+&d" (cc) : "d" (command), "a" (__pa(sccb)) : "cc", "memory"); if (cc == 4) return -EINVAL; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index f3d5c7f4c13d..ab68684b5d83 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -155,6 +155,11 @@ static void __init sclp_early_console_detect(struct init_sccb *sccb) sclp.has_linemode = 1; } +void __init sclp_early_adjust_va(void) +{ + sclp_early_sccb = __va((unsigned long)sclp_early_sccb); +} + void __init sclp_early_detect(void) { void *sccb = sclp_early_sccb; diff --git a/drivers/s390/char/sclp_sd.c b/drivers/s390/char/sclp_sd.c index a5dd4e9f5b1b..25c2d760f6e6 100644 --- a/drivers/s390/char/sclp_sd.c +++ b/drivers/s390/char/sclp_sd.c @@ -194,7 +194,7 @@ static int sclp_sd_sync(unsigned long page, u8 eq, u8 di, u64 sat, u64 sa, struct sclp_sd_evbuf *evbuf; int rc; - sclp_sd_listener_init(&listener, (u32) (addr_t) sccb); + sclp_sd_listener_init(&listener, __pa(sccb)); sclp_sd_listener_add(&listener); /* Prepare SCCB */ From c8f573eccb7398d7c198e547c630351e69af5ca1 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 1 Oct 2021 13:42:08 +0200 Subject: [PATCH 159/512] s390/ptrace: add last_break member to pt_regs Instead of using args[0] for the value of the last breaking event address register, add a member to make things more obvious. 
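A dedicated, self-describing field instead of an overloaded array slot is the whole point of this change; a minimal stand-alone sketch of the idea (the structs and field names are made up for illustration, not the real pt_regs layout):

#include <stdio.h>

/* Before: a generic slot whose meaning depends on who last wrote it. */
struct regs_old {
	unsigned long args[1];		/* args[0] doubles as the last-break value */
};

/* After: the purpose is visible at every access site. */
struct regs_new {
	unsigned long args[1];
	unsigned long last_break;	/* dedicated member for the breaking-event address */
};

int main(void)
{
	struct regs_new regs = { .last_break = 0x10000 };

	printf("last breaking event at %#lx\n", regs.last_break);
	return 0;
}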
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ptrace.h | 1 + arch/s390/kernel/dumpstack.c | 2 +- arch/s390/kernel/traps.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 82fc11907451..4ffa8e7f0ed3 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -96,6 +96,7 @@ struct pt_regs { }; unsigned long flags; unsigned long cr1; + unsigned long last_break; }; /* diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index db1bc00229ca..85f326e258df 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -152,7 +152,7 @@ void show_stack(struct task_struct *task, unsigned long *stack, static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); - printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]); + printk(" [<%016lx>] %pSR\n", regs->last_break, (void *)regs->last_break); } void show_registers(struct pt_regs *regs) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index bcefc2173de4..d984f0b42604 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -314,7 +314,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) if (last_break < 4096) last_break = 1; current->thread.last_break = last_break; - regs->args[0] = last_break; + regs->last_break = last_break; } if (S390_lowcore.pgm_code & 0x0200) { From 26c21aa485841fecb8514a8261f759d34576eb6a Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 1 Oct 2021 17:08:09 +0200 Subject: [PATCH 160/512] s390: rename last_break to pgm_last_break With the upcoming BEAR enhancements last_break isn't really unique, so rename it to pgm_last_break. This way it should be more obvious that this is the last_break value that is written by the hardware when a program check occurs. 
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/boot/pgm_check_info.c | 4 ++-- arch/s390/include/asm/lowcore.h | 2 +- arch/s390/kernel/asm-offsets.c | 2 +- arch/s390/kernel/traps.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 75bcbfa27941..c2a1defc79da 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -175,6 +175,6 @@ void print_pgm_check_info(void) gpregs[12], gpregs[13], gpregs[14], gpregs[15]); print_stacktrace(); decompressor_printk("Last Breaking-Event-Address:\n"); - decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.breaking_event_addr, - (void *)S390_lowcore.breaking_event_addr); + decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break, + (void *)S390_lowcore.pgm_last_break); } diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 11213c8bfca5..1129a1e93e80 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -65,7 +65,7 @@ struct lowcore { __u32 external_damage_code; /* 0x00f4 */ __u64 failing_storage_address; /* 0x00f8 */ __u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */ - __u64 breaking_event_addr; /* 0x0110 */ + __u64 pgm_last_break; /* 0x0110 */ __u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */ psw_t restart_old_psw; /* 0x0120 */ psw_t external_old_psw; /* 0x0130 */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b6ee3fd7fe0c..44eb79e4299e 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -78,7 +78,7 @@ int main(void) OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); - OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); + OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break); OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index d984f0b42604..d32a6ee7b0dd 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -300,7 +300,7 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { - unsigned long last_break = S390_lowcore.breaking_event_addr; + unsigned long last_break = S390_lowcore.pgm_last_break; unsigned int trapnr; irqentry_state_t state; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 10722455fd02..80971e59c604 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -960,7 +960,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) /* bit 1+2 of the target are the ilc, so we can directly use ilen */ rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, - (u64 *) __LC_LAST_BREAK); + (u64 *) __LC_PGM_LAST_BREAK); rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_INT_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, From 5d17d4ed7e892be3670de7189c76009e12397fe7 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 4 Oct 2021 08:51:06 +0200 Subject: [PATCH 161/512] s390: introduce nospec_uses_trampoline() and replace all of the "__is_defined(CC_USING_EXPOLINE) && !nospec_disable" occurrences. 
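Folding a repeated compile-time-and-runtime condition into one static inline helper is a common cleanup; a stand-alone sketch of the pattern (the macro and variable below stand in for CC_USING_EXPOLINE and nospec_disable, they are not the kernel symbols):

#include <stdbool.h>
#include <stdio.h>

#define BUILT_WITH_TRAMPOLINES 1	/* stand-in for a build-time option */
static int trampolines_disabled;	/* stand-in for a runtime override */

static inline bool uses_trampolines(void)
{
	return BUILT_WITH_TRAMPOLINES && !trampolines_disabled;
}

int main(void)
{
	/* Every caller now asks the same question in the same way. */
	if (uses_trampolines())
		printf("Spectre V2 mitigation: execute trampolines\n");
	else
		printf("Spectre V2 mitigation: trampolines not used\n");
	return 0;
}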
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/nospec-branch.h | 5 +++++ arch/s390/kernel/nospec-branch.c | 2 +- arch/s390/kernel/nospec-sysfs.c | 2 +- arch/s390/net/bpf_jit_comp.c | 6 +++--- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index b4bd8c41e9d3..82725cf783c7 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -12,6 +12,11 @@ void nospec_init_branches(void); void nospec_auto_detect(void); void nospec_revert(s32 *start, s32 *end); +static inline bool nospec_uses_trampoline(void) +{ + return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 250e4dbf653c..60e6fec27bba 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -38,7 +38,7 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); if (__test_facility(82, alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index b4b5c8c21166..52d4353188ad 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,7 +15,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) return sprintf(buf, "Mitigation: execute trampolines\n"); if (__test_facility(82, alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 1a374d021e25..233cc9bcd652 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -567,7 +567,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { + if (nospec_uses_trampoline()) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ if (test_facility(35)) { @@ -585,7 +585,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) /* br %r14 */ _EMIT2(0x07fe); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && + if ((nospec_uses_trampoline()) && (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { jit->r1_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r1 thunk */ @@ -1332,7 +1332,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_FUNC; /* lgrl %w1,func */ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { + if (nospec_uses_trampoline()) { /* brasl %r14,__s390_indirect_jump_r1 */ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); } else { From 3b051e89da70d464a036a86d70ce2ed61c73f792 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 7 Apr 2021 09:20:17 +0200 Subject: [PATCH 162/512] s390: add support for BEAR enhancement facility The Breaking-Event-Address-Register 
(BEAR) stores the address of the last breaking event instruction. Breaking events are usually instructions that change the program flow - for example branches, and instructions that modify the address in the PSW like lpswe. This is useful for debugging wild branches, because one could easily figure out where the wild branch was originating from. What is problematic is that lpswe is considered a breaking event, and therefore overwrites BEAR on kernel exit. The BEAR enhancement facility adds new instructions that allow to save/restore BEAR and also an lpswey instruction that doesn't cause a breaking event. So we can save BEAR on kernel entry and restore it on exit to user space. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cpu.h | 3 +++ arch/s390/include/asm/lowcore.h | 7 ++--- arch/s390/kernel/asm-offsets.c | 3 +++ arch/s390/kernel/entry.S | 45 +++++++++++++++++++++++++++------ arch/s390/kernel/irq.c | 10 ++++++-- arch/s390/kernel/process.c | 2 +- arch/s390/kernel/setup.c | 5 ++++ arch/s390/kernel/syscall.c | 2 ++ arch/s390/kernel/traps.c | 10 ++++---- arch/s390/mm/dump_pagetables.c | 14 +++++++--- arch/s390/mm/vmem.c | 10 ++++++-- 11 files changed, 87 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index 62228a884e06..26c710cd3485 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLY__ #include +#include struct cpuid { @@ -21,5 +22,7 @@ struct cpuid unsigned int unused : 16; } __attribute__ ((packed, aligned(8))); +DECLARE_STATIC_KEY_FALSE(cpu_has_bear); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 1129a1e93e80..1262f5003acf 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -93,9 +93,10 @@ struct lowcore { psw_t return_psw; /* 0x0290 */ psw_t return_mcck_psw; /* 0x02a0 */ + __u64 last_break; /* 0x02b0 */ + /* CPU accounting and timing values. 
*/ - __u64 sys_enter_timer; /* 0x02b0 */ - __u8 pad_0x02b8[0x02c0-0x02b8]; /* 0x02b8 */ + __u64 sys_enter_timer; /* 0x02b8 */ __u64 mcck_enter_timer; /* 0x02c0 */ __u64 exit_timer; /* 0x02c8 */ __u64 user_timer; /* 0x02d0 */ @@ -188,7 +189,7 @@ struct lowcore { __u32 tod_progreg_save_area; /* 0x1324 */ __u32 cpu_timer_save_area[2]; /* 0x1328 */ __u32 clock_comp_save_area[2]; /* 0x1330 */ - __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */ + __u64 last_break_save_area; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ __u64 cregs_save_area[16]; /* 0x1380 */ __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 44eb79e4299e..28177e4f52cc 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -35,6 +35,7 @@ int main(void) OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); OFFSET(__PT_FLAGS, pt_regs, flags); OFFSET(__PT_CR1, pt_regs, cr1); + OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); /* stack_frame offsets */ @@ -127,6 +128,7 @@ int main(void) OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); + OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); /* hardware defined lowcore locations 0x1000 - 0x18ff */ @@ -140,6 +142,7 @@ int main(void) OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area); OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 4c9b967290ae..01bae1d51113 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -52,6 +52,22 @@ STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE _LPP_OFFSET = __LC_LPP + .macro STBEAR address + ALTERNATIVE "", ".insn s,0xb2010000,\address", 193 + .endm + + .macro LBEAR address + ALTERNATIVE "", ".insn s,0xb2000000,\address", 193 + .endm + + .macro LPSWEY address,lpswe + ALTERNATIVE "b \lpswe", ".insn siy,0xeb0000000071,\address,0", 193 + .endm + + .macro MBEAR reg + ALTERNATIVE "", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + .endm + .macro CHECK_STACK savearea #ifdef CONFIG_CHECK_STACK tml %r15,STACK_SIZE - CONFIG_STACK_GUARD @@ -302,6 +318,7 @@ ENTRY(system_call) BPOFF lghi %r14,0 .Lsysc_per: + STBEAR __LC_LAST_BREAK lctlg %c1,%c1,__LC_KERNEL_ASCE lg %r12,__LC_CURRENT lg %r15,__LC_KERNEL_STACK @@ -321,14 +338,16 @@ ENTRY(system_call) xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC + MBEAR %r2 lgr %r3,%r14 brasl %r14,__do_syscall lctlg %c1,%c1,__LC_USER_ASCE mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - b __LC_RETURN_LPSWE + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE ENDPROC(system_call) # @@ -340,9 +359,10 @@ ENTRY(ret_from_fork) lctlg %c1,%c1,__LC_USER_ASCE mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + LBEAR 
STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - b __LC_RETURN_LPSWE + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE ENDPROC(ret_from_fork) /* @@ -382,6 +402,7 @@ ENTRY(pgm_check_handler) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK stmg %r8,%r9,__PT_PSW(%r11) # clear user controlled registers to prevent speculative use @@ -401,8 +422,9 @@ ENTRY(pgm_check_handler) stpt __LC_EXIT_TIMER .Lpgm_exit_kernel: mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - b __LC_RETURN_LPSWE + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # # single stepped system call @@ -412,7 +434,8 @@ ENTRY(pgm_check_handler) larl %r14,.Lsysc_per stg %r14,__LC_RETURN_PSW+8 lghi %r14,1 - lpswe __LC_RETURN_PSW # branch to .Lsysc_per + LBEAR __LC_PGM_LAST_BREAK + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per ENDPROC(pgm_check_handler) /* @@ -422,6 +445,7 @@ ENDPROC(pgm_check_handler) ENTRY(\name) STCK __LC_INT_CLOCK stpt __LC_SYS_ENTER_TIMER + STBEAR __LC_LAST_BREAK BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT @@ -453,6 +477,7 @@ ENTRY(\name) xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + MBEAR %r11 stmg %r8,%r9,__PT_PSW(%r11) tm %r8,0x0001 # coming from user space? jno 1f @@ -465,8 +490,9 @@ ENTRY(\name) lctlg %c1,%c1,__LC_USER_ASCE BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -2: lmg %r0,%r15,__PT_R0(%r11) - b __LC_RETURN_LPSWE +2: LBEAR __PT_LAST_BREAK(%r11) + lmg %r0,%r15,__PT_R0(%r11) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE ENDPROC(\name) .endm @@ -505,6 +531,7 @@ ENTRY(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer + LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT lmg %r8,%r9,__LC_MCK_OLD_PSW @@ -591,8 +618,10 @@ ENTRY(mcck_int_handler) jno 0f BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -0: lmg %r11,%r15,__PT_R11(%r11) - b __LC_RETURN_MCCK_LPSWE +0: ALTERNATIVE "", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + LBEAR 0(%r12) + lmg %r11,%r15,__PT_R11(%r11) + LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE .Lmcck_panic: /* diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 3a3145c4a3ba..0df83ecaa2e0 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -140,8 +140,11 @@ void noinstr do_io_irq(struct pt_regs *regs) irq_enter(); - if (user_mode(regs)) + if (user_mode(regs)) { update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; if (from_idle) @@ -171,8 +174,11 @@ void noinstr do_ext_irq(struct pt_regs *regs) irq_enter(); - if (user_mode(regs)) + if (user_mode(regs)) { update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } regs->int_code = S390_lowcore.ext_int_code_addr; regs->int_parm = S390_lowcore.ext_params; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 350e94d0cac2..e6b9b4753fd3 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -141,7 +141,7 @@ int copy_thread(unsigned long 
clone_flags, unsigned long new_stackp, frame->childregs.gprs[10] = arg; frame->childregs.gprs[11] = (unsigned long)do_exit; frame->childregs.orig_gpr2 = -1; - + frame->childregs.last_break = 1; return 0; } frame->childregs = *current_pt_regs(); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 860a4e6ebaf9..e738a45057ac 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -174,6 +174,8 @@ unsigned long MODULES_END; struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); +DEFINE_STATIC_KEY_FALSE(cpu_has_bear); + /* * The Write Back bit position in the physaddr is given by the SLPC PCI. * Leaving the mask zero always uses write through which is safe @@ -1038,6 +1040,9 @@ void __init setup_arch(char **cmdline_p) smp_detect_cpus(); topology_init_early(); + if (test_facility(193)) + static_branch_enable(&cpu_has_bear); + /* * Create kernel page tables and switch to virtual addressing. */ diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 8fe2d23b64f4..dc2355c623d6 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -154,6 +154,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) regs->psw = S390_lowcore.svc_old_psw; regs->int_code = S390_lowcore.svc_int_code; update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; local_irq_enable(); regs->orig_gpr2 = regs->gprs[2]; diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index d32a6ee7b0dd..6c6f7dcce1a5 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -300,7 +300,6 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { - unsigned long last_break = S390_lowcore.pgm_last_break; unsigned int trapnr; irqentry_state_t state; @@ -311,10 +310,11 @@ void noinstr __do_pgm_check(struct pt_regs *regs) if (user_mode(regs)) { update_timer_sys(); - if (last_break < 4096) - last_break = 1; - current->thread.last_break = last_break; - regs->last_break = last_break; + if (!static_branch_likely(&cpu_has_bear)) { + if (regs->last_break < 4096) + regs->last_break = 1; + } + current->thread.last_break = regs->last_break; } if (S390_lowcore.pgm_code & 0x0200) { diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 0b0c8c284953..9f9af5298dd6 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -8,6 +8,7 @@ #include #include #include +#include #include static unsigned long max_addr; @@ -116,8 +117,13 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) return; if (st->current_prot & _PAGE_NOEXEC) return; - /* The first lowcore page is currently still W+X. */ - if (addr == PAGE_SIZE) + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. 
+ */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) return; WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); @@ -203,7 +209,9 @@ void ptdump_check_wx(void) if (st.wx_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); else - pr_info("Checked W+X mappings: passed, no unexpected W+X pages found\n"); + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + "unexpected " : ""); } #endif /* CONFIG_DEBUG_WX */ diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 2b1c6d916cf9..7d9705eeb02f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -584,8 +585,13 @@ void __init vmem_map_init(void) __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - /* we need lowcore executable for our LPSWE instructions */ - set_memory_x(0, 1); + if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) { + /* + * Lowcore must be executable for LPSWE + * and expoline trampoline branch instructions. + */ + set_memory_x(0, 1); + } pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); From ff7a1eefdff5867db4a4f678deed0d04f32cad15 Mon Sep 17 00:00:00 2001 From: Huilong Deng Date: Sun, 17 Oct 2021 17:20:57 +0800 Subject: [PATCH 163/512] s390/bitops: return true/false (not 1/0) from bool functions Signed-off-by: Huilong Deng Link: https://lore.kernel.org/r/20211017092057.24179-1-denghuilong@cdjrlc.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index fd149480b6e2..5a530c552c23 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -188,7 +188,7 @@ static inline bool arch_test_and_set_bit_lock(unsigned long nr, volatile unsigned long *ptr) { if (arch_test_bit(nr, ptr)) - return 1; + return true; return arch_test_and_set_bit(nr, ptr); } From 453380318edd523bc08bf4892d354b49cb85bb0b Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 27 Sep 2021 15:01:56 +0200 Subject: [PATCH 164/512] s390/cpumf: Allow multiple processes to access /dev/hwc Commit a029a4eab39e ("s390/cpumf: Allow concurrent access for CPU Measurement Counter Facility") added CPU Measurement counter facility access to multiple consumers. It allows concurrent access to the CPU Measurement counter facility via several perf_event_open() system call invocations and via ioctl() system call of device /dev/hwc. However the access via device /dev/hwc was exclusive, only one process was able to open this device. The patch removes this restriction. Now multiple invocations of lshwc can execute in parallel. They can access different CPUs and counter sets or CPUs and counter set can overlap. 
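For illustration, a minimal user-space sketch of the behavioural change, assuming the device node is /dev/hwctr (the misc device referred to as S390_HWCTR_DEVICE in the driver) and a caller with CAP_SYS_ADMIN: before this change the second open() failed with -EBUSY, afterwards both descriptors are valid and each keeps its own START/READ/STOP state in file->private_data.

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* Both opens need CAP_SYS_ADMIN (checked in cfset_open()). */
          int fd1 = open("/dev/hwctr", O_RDWR);  /* e.g. a first lshwc instance */
          int fd2 = open("/dev/hwctr", O_RDWR);  /* used to fail with -EBUSY   */

          if (fd1 < 0 || fd2 < 0)
                  perror("open /dev/hwctr");
          else
                  printf("both opens succeeded: fd1=%d fd2=%d\n", fd1, fd2);

          if (fd2 >= 0)
                  close(fd2);
          if (fd1 >= 0)
                  close(fd1);
          return 0;
  }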
Signed-off-by: Thomas Richter Suggested-by: Heiko Carstens Reviewed-by: Sumanth Korikkar Acked-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_cf.c | 236 +++++++++++++++++++++----------- 1 file changed, 154 insertions(+), 82 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 4a99154fe651..6f431fa9e4d7 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -773,22 +773,46 @@ static int __init cpumf_pmu_init(void) * counter set via normal file operations. */ -static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. access */ +static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Access count */ static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ unsigned int sets; /* Counter set bit mask */ atomic_t cpus_ack; /* # CPUs successfully executed func */ }; -static struct cfset_request { /* CPUs and counter set bit mask */ +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +struct cfset_request { /* CPUs and counter set bit mask */ unsigned long ctrset; /* Bit mask of counter set to read */ cpumask_t mask; /* CPU mask to read from */ -} cfset_request; + struct list_head node; /* Chain to cfset_session.head */ +}; -static void cfset_ctrset_clear(void) +static void cfset_session_init(void) { - cpumask_clear(&cfset_request.mask); - cfset_request.ctrset = 0; + INIT_LIST_HEAD(&cfset_session.head); +} + +/* Remove current request from global bookkeeping. Maintain a counter set bit + * mask on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_del(struct cfset_request *p) +{ + list_del(&p->node); +} + +/* Add current request to global bookkeeping. Maintain a counter set bit mask + * on a per CPU basis. + * Done in process context under mutex protection. 
+ */ +static void cfset_session_add(struct cfset_request *p) +{ + list_add(&p->node, &cfset_session.head); } /* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access @@ -827,15 +851,23 @@ static void cfset_ioctl_off(void *parm) struct cfset_call_on_cpu_parm *p = parm; int rc; - cpuhw->dev_state = 0; + /* Check if any counter set used by /dev/hwc */ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) - if ((p->sets & cpumf_ctr_ctl[rc])) - atomic_dec(&cpuhw->ctr_set[rc]); - rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if ((p->sets & cpumf_ctr_ctl[rc])) { + if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { + ctr_set_disable(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + ctr_set_stop(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + } + } + /* Keep perf_event_open counter sets */ + rc = lcctl(cpuhw->dev_state | cpuhw->state); if (rc) pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", cpuhw->state, S390_HWCTR_DEVICE, rc); - cpuhw->flags &= ~PMU_F_IN_USE; + if (!cpuhw->dev_state) + cpuhw->flags &= ~PMU_F_IN_USE; debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", __func__, rc, cpuhw->state, cpuhw->dev_state); } @@ -870,11 +902,26 @@ static void cfset_release_cpu(void *p) debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", __func__, cpuhw->state, cpuhw->dev_state); + cpuhw->dev_state = 0; rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ if (rc) pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", cpuhw->state, S390_HWCTR_DEVICE, rc); - cpuhw->dev_state = 0; +} + +/* This modifies the process CPU mask to adopt it to the currently online + * CPUs. Offline CPUs can not be addresses. This call terminates the access + * and is usually followed by close() or a new iotcl(..., START, ...) which + * creates a new request structure. 
+ */ +static void cfset_all_stop(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + }; + + cpumask_and(&req->mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); } /* Release function is also called when application gets terminated without @@ -882,10 +929,19 @@ static void cfset_release_cpu(void *p) */ static int cfset_release(struct inode *inode, struct file *file) { - on_each_cpu(cfset_release_cpu, NULL, 1); + mutex_lock(&cfset_ctrset_mutex); + /* Open followed by close/exit has no private_data */ + if (file->private_data) { + cfset_all_stop(file->private_data); + cfset_session_del(file->private_data); + kfree(file->private_data); + file->private_data = NULL; + } + if (!atomic_dec_return(&cfset_opencnt)) + on_each_cpu(cfset_release_cpu, NULL, 1); + mutex_unlock(&cfset_ctrset_mutex); + hw_perf_event_destroy(NULL); - cfset_ctrset_clear(); - atomic_set(&cfset_opencnt, 0); return 0; } @@ -893,9 +949,10 @@ static int cfset_open(struct inode *inode, struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* Only one user space program can open /dev/hwctr */ - if (atomic_xchg(&cfset_opencnt, 1)) - return -EBUSY; + mutex_lock(&cfset_ctrset_mutex); + if (atomic_inc_return(&cfset_opencnt) == 1) + cfset_session_init(); + mutex_unlock(&cfset_ctrset_mutex); cpumf_hw_inuse(); file->private_data = NULL; @@ -903,25 +960,10 @@ static int cfset_open(struct inode *inode, struct file *file) return nonseekable_open(inode, file); } -static int cfset_all_stop(void) +static int cfset_all_start(struct cfset_request *req) { struct cfset_call_on_cpu_parm p = { - .sets = cfset_request.ctrset, - }; - cpumask_var_t mask; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); - on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); - free_cpumask_var(mask); - return 0; -} - -static int cfset_all_start(void) -{ - struct cfset_call_on_cpu_parm p = { - .sets = cfset_request.ctrset, + .sets = req->ctrset, .cpus_ack = ATOMIC_INIT(0), }; cpumask_var_t mask; @@ -929,7 +971,7 @@ static int cfset_all_start(void) if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + cpumask_and(mask, &req->mask, cpu_online_mask); on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); @@ -1045,7 +1087,7 @@ static void cfset_cpu_read(void *parm) cpuhw->sets, cpuhw->used); } -static int cfset_all_read(unsigned long arg) +static int cfset_all_read(unsigned long arg, struct cfset_request *req) { struct cfset_call_on_cpu_parm p; cpumask_var_t mask; @@ -1054,46 +1096,53 @@ static int cfset_all_read(unsigned long arg) if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - p.sets = cfset_request.ctrset; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + p.sets = req->ctrset; + cpumask_and(mask, &req->mask, cpu_online_mask); on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); rc = cfset_all_copy(arg, mask); free_cpumask_var(mask); return rc; } -static long cfset_ioctl_read(unsigned long arg) +static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) { struct s390_ctrset_read read; - int ret = 0; + int ret = -ENODATA; - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; - ret = cfset_all_read(arg); - return ret; -} - -static long cfset_ioctl_stop(void) -{ - int ret = ENXIO; - - if 
(cfset_request.ctrset) { - ret = cfset_all_stop(); - cfset_ctrset_clear(); + if (req && req->ctrset) { + if (copy_from_user(&read, (char __user *)arg, sizeof(read))) + return -EFAULT; + ret = cfset_all_read(arg, req); } return ret; } -static long cfset_ioctl_start(unsigned long arg) +static long cfset_ioctl_stop(struct file *file) +{ + struct cfset_request *req = file->private_data; + int ret = -ENXIO; + + if (req) { + cfset_all_stop(req); + cfset_session_del(req); + kfree(req); + file->private_data = NULL; + ret = 0; + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg, struct file *file) { struct s390_ctrset_start __user *ustart; struct s390_ctrset_start start; + struct cfset_request *preq; void __user *umask; unsigned int len; int ret = 0; size_t need; - if (cfset_request.ctrset) + if (file->private_data) return -EBUSY; ustart = (struct s390_ctrset_start __user *)arg; if (copy_from_user(&start, ustart, sizeof(start))) @@ -1108,25 +1157,36 @@ static long cfset_ioctl_start(unsigned long arg) return -EINVAL; /* Invalid counter set */ if (!start.counter_sets) return -EINVAL; /* No counter set at all? */ - cpumask_clear(&cfset_request.mask); + + preq = kzalloc(sizeof(*preq), GFP_KERNEL); + if (!preq) + return -ENOMEM; + cpumask_clear(&preq->mask); len = min_t(u64, start.cpumask_len, cpumask_size()); umask = (void __user *)start.cpumask; - if (copy_from_user(&cfset_request.mask, umask, len)) + if (copy_from_user(&preq->mask, umask, len)) { + kfree(preq); return -EFAULT; - if (cpumask_empty(&cfset_request.mask)) + } + if (cpumask_empty(&preq->mask)) { + kfree(preq); return -EINVAL; + } need = cfset_needspace(start.counter_sets); - if (put_user(need, &ustart->data_bytes)) - ret = -EFAULT; - if (ret) - goto out; - cfset_request.ctrset = start.counter_sets; - ret = cfset_all_start(); -out: - if (ret) - cfset_ctrset_clear(); - debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n", - __func__, cfset_request.ctrset, need, ret); + if (put_user(need, &ustart->data_bytes)) { + kfree(preq); + return -EFAULT; + } + preq->ctrset = start.counter_sets; + ret = cfset_all_start(preq); + if (!ret) { + cfset_session_add(preq); + file->private_data = preq; + debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n", + __func__, preq->ctrset, need, ret); + } else { + kfree(preq); + } return ret; } @@ -1136,7 +1196,7 @@ out: * counter set keeps running until explicitly stopped. Returns the number * of bytes needed to store the counter values. If another S390_HWCTR_START * ioctl subcommand is called without a previous S390_HWCTR_STOP stop - * command, -EBUSY is returned. + * command on the same file descriptor, -EBUSY is returned. * S390_HWCTR_READ: Read the counter set values from specified CPU list given * with the S390_HWCTR_START command. * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the @@ -1150,13 +1210,13 @@ static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) mutex_lock(&cfset_ctrset_mutex); switch (cmd) { case S390_HWCTR_START: - ret = cfset_ioctl_start(arg); + ret = cfset_ioctl_start(arg, file); break; case S390_HWCTR_STOP: - ret = cfset_ioctl_stop(); + ret = cfset_ioctl_stop(file); break; case S390_HWCTR_READ: - ret = cfset_ioctl_read(arg); + ret = cfset_ioctl_read(arg, file->private_data); break; default: ret = -ENOTTY; @@ -1182,29 +1242,41 @@ static struct miscdevice cfset_dev = { .fops = &cfset_fops, }; +/* Hotplug add of a CPU. 
Scan through all active processes and add + * that CPU to the list of CPUs supplied with ioctl(..., START, ...). + */ int cfset_online_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; mutex_lock(&cfset_ctrset_mutex); - if (cfset_request.ctrset) { - p.sets = cfset_request.ctrset; - cfset_ioctl_on(&p); - cpumask_set_cpu(cpu, &cfset_request.mask); + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &rp->mask); + } } mutex_unlock(&cfset_ctrset_mutex); return 0; } +/* Hotplug remove of a CPU. Scan through all active processes and clear + * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + */ int cfset_offline_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; mutex_lock(&cfset_ctrset_mutex); - if (cfset_request.ctrset) { - p.sets = cfset_request.ctrset; - cfset_ioctl_off(&p); - cpumask_clear_cpu(cpu, &cfset_request.mask); + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &rp->mask); + } } mutex_unlock(&cfset_ctrset_mutex); return 0; From d0982725655721800878f3eb1cfd944ec3dc2107 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 19 Oct 2021 17:51:08 +0200 Subject: [PATCH 165/512] s390/ap: new module option ap.useirq This patch introduces a new AP module option to be able to control if the ap bus code is using interrupts or not. By default if the interrupt support is available it is used. This option makes it possible to disable interrupt use even when interrupt support is available. It should be obvious that this option can't magically enable interrupt support when the hardware or hypervisor layer does not support AP interrupts. On the kernel command line use ap.useirq=0 or ap.useirq=1 to disable or enable (that's the default) interrupt use. 
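As a rough sketch of the resulting behaviour, the check added at module init time boils down to the gate below; ap_want_irq() is a hypothetical helper name used only for illustration, while ap_useirq and ap_interrupts_available() are the names from the hunk further down.

  /* Interrupts are used only when the hardware/hypervisor layer offers
   * them and ap.useirq has not been set to 0; the option can disable
   * interrupt use but never enable what the machine does not provide. */
  static bool ap_want_irq(void)
  {
          return ap_interrupts_available() && ap_useirq;
  }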
Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 38d71e076048..806d184482a9 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -61,6 +61,10 @@ static char *aqm_str; module_param_named(aqmask, aqm_str, charp, 0440); MODULE_PARM_DESC(aqmask, "AP bus domain mask."); +static int ap_useirq = 1; +module_param_named(useirq, ap_useirq, int, 0440); +MODULE_PARM_DESC(useirq, "Use interrupt if available, default is 1 (on)."); + atomic_t ap_max_msg_size = ATOMIC_INIT(AP_DEFAULT_MAX_MSG_SIZE); EXPORT_SYMBOL(ap_max_msg_size); @@ -1899,7 +1903,7 @@ static int __init ap_module_init(void) } /* enable interrupts if available */ - if (ap_interrupts_available()) { + if (ap_interrupts_available() && ap_useirq) { rc = register_adapter_interrupt(&ap_airq); ap_irq_flag = (rc == 0); } From a4892f85c85dc279f48b38f091ae2d108750d45e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 19 Oct 2021 19:11:42 +0200 Subject: [PATCH 166/512] s390/hmcdrv: fix kernel doc comments Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- drivers/s390/char/sclp_ftp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/s390/char/sclp_ftp.c b/drivers/s390/char/sclp_ftp.c index 1e9de99dcd02..ec5a0e2b9255 100644 --- a/drivers/s390/char/sclp_ftp.c +++ b/drivers/s390/char/sclp_ftp.c @@ -31,6 +31,8 @@ static u64 sclp_ftp_length; /** * sclp_ftp_txcb() - Diagnostic Test FTP services SCLP command callback + * @req: sclp request + * @data: pointer to struct completion */ static void sclp_ftp_txcb(struct sclp_req *req, void *data) { @@ -45,6 +47,7 @@ static void sclp_ftp_txcb(struct sclp_req *req, void *data) /** * sclp_ftp_rxcb() - Diagnostic Test FTP services receiver event callback + * @evbuf: pointer to Diagnostic Test (ET7) event buffer */ static void sclp_ftp_rxcb(struct evbuf_header *evbuf) { From 5ef4f710065d30c57326fface6b68681a59776ba Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 19 Oct 2021 12:57:32 -0400 Subject: [PATCH 167/512] s390/vfio-ap: s390/crypto: fix all kernel-doc warnings Fixes the kernel-doc warnings in the following source files: * drivers/s390/crypto/vfio_ap_private.h * drivers/s390/crypto/vfio_ap_drv.c * drivers/s390/crypto/vfio_ap_ops.c Signed-off-by: Tony Krowiak Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/vfio_ap_drv.c | 16 ++++++---- drivers/s390/crypto/vfio_ap_ops.c | 5 ++-- drivers/s390/crypto/vfio_ap_private.h | 43 +++++++++++++++++++-------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index 4d2556bc7fe5..03311a476366 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -42,10 +42,13 @@ static struct ap_device_id ap_queue_ids[] = { MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids); /** - * vfio_ap_queue_dev_probe: + * vfio_ap_queue_dev_probe: Allocate a vfio_ap_queue structure and associate it + * with the device as driver_data. * - * Allocate a vfio_ap_queue structure and associate it - * with the device as driver_data. + * @apdev: the AP device being probed + * + * Return: returns 0 if the probe succeeded; otherwise, returns -ENOMEM if + * storage could not be allocated for a vfio_ap_queue object. 
*/ static int vfio_ap_queue_dev_probe(struct ap_device *apdev) { @@ -61,10 +64,11 @@ static int vfio_ap_queue_dev_probe(struct ap_device *apdev) } /** - * vfio_ap_queue_dev_remove: + * vfio_ap_queue_dev_remove: Free the associated vfio_ap_queue structure. * - * Takes the matrix lock to avoid actions on this device while removing - * Free the associated vfio_ap_queue structure + * @apdev: the AP device being removed + * + * Takes the matrix lock to avoid actions on this device while doing the remove. */ static void vfio_ap_queue_dev_remove(struct ap_device *apdev) { diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 623d5269a52c..94c1c9bd58ad 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -187,6 +187,8 @@ end_free: * vfio_ap_irq_enable - Enable Interruption for a APQN * * @q: the vfio_ap_queue holding AQIC parameters + * @isc: the guest ISC to register with the GIB interface + * @nib: the notification indicator byte to pin. * * Pin the NIB saved in *q * Register the guest ISC to GIB interface and retrieve the @@ -738,7 +740,6 @@ vfio_ap_mdev_verify_queues_reserved_for_apqi(struct ap_matrix_mdev *matrix_mdev, * assign_domain_store - parses the APQI from @buf and sets the * corresponding bit in the mediated matrix device's AQM * - * * @dev: the matrix device * @attr: the mediated matrix device's assign_domain attribute * @buf: a buffer containing the AP queue index (APQI) of the domain to @@ -866,7 +867,6 @@ static DEVICE_ATTR_WO(unassign_domain); * assign_control_domain_store - parses the domain ID from @buf and sets * the corresponding bit in the mediated matrix device's ADM * - * * @dev: the matrix device * @attr: the mediated matrix device's assign_control_domain attribute * @buf: a buffer containing the domain ID to be assigned @@ -1142,6 +1142,7 @@ static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb, * by @matrix_mdev. * * @matrix_mdev: a matrix mediated device + * @kvm: the pointer to the kvm structure being unset. * * Note: The matrix_dev->lock must be taken prior to calling * this function; however, the lock will be temporarily released while the diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 77760e2b546f..648fcaf8104a 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -26,16 +26,18 @@ #define VFIO_AP_DRV_NAME "vfio_ap" /** - * ap_matrix_dev - the AP matrix device structure + * struct ap_matrix_dev - Contains the data for the matrix device. + * * @device: generic device structure associated with the AP matrix device * @available_instances: number of mediated matrix devices that can be created * @info: the struct containing the output from the PQAP(QCI) instruction - * mdev_list: the list of mediated matrix devices created - * lock: mutex for locking the AP matrix device. This lock will be + * @mdev_list: the list of mediated matrix devices created + * @lock: mutex for locking the AP matrix device. This lock will be * taken every time we fiddle with state managed by the vfio_ap * driver, be it using @mdev_list or writing the state of a * single ap_matrix_mdev device. It's quite coarse but we don't * expect much contention. 
+ * @vfio_ap_drv: the vfio_ap device driver */ struct ap_matrix_dev { struct device device; @@ -49,17 +51,19 @@ struct ap_matrix_dev { extern struct ap_matrix_dev *matrix_dev; /** - * The AP matrix is comprised of three bit masks identifying the adapters, - * queues (domains) and control domains that belong to an AP matrix. The bits i - * each mask, from least significant to most significant bit, correspond to IDs - * 0 to 255. When a bit is set, the corresponding ID belongs to the matrix. + * struct ap_matrix - matrix of adapters, domains and control domains * * @apm_max: max adapter number in @apm - * @apm identifies the AP adapters in the matrix + * @apm: identifies the AP adapters in the matrix * @aqm_max: max domain number in @aqm - * @aqm identifies the AP queues (domains) in the matrix + * @aqm: identifies the AP queues (domains) in the matrix * @adm_max: max domain number in @adm - * @adm identifies the AP control domains in the matrix + * @adm: identifies the AP control domains in the matrix + * + * The AP matrix is comprised of three bit masks identifying the adapters, + * queues (domains) and control domains that belong to an AP matrix. The bits in + * each mask, from left to right, correspond to IDs 0 to 255. When a bit is set + * the corresponding ID belongs to the matrix. */ struct ap_matrix { unsigned long apm_max; @@ -71,13 +75,20 @@ struct ap_matrix { }; /** - * struct ap_matrix_mdev - the mediated matrix device structure - * @list: allows the ap_matrix_mdev struct to be added to a list + * struct ap_matrix_mdev - Contains the data associated with a matrix mediated + * device. + * @vdev: the vfio device + * @node: allows the ap_matrix_mdev struct to be added to a list * @matrix: the adapters, usage domains and control domains assigned to the * mediated matrix device. * @group_notifier: notifier block used for specifying callback function for * handling the VFIO_GROUP_NOTIFY_SET_KVM event + * @iommu_notifier: notifier block used for specifying callback function for + * handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP even * @kvm: the struct holding guest's state + * @pqap_hook: the function pointer to the interception handler for the + * PQAP(AQIC) instruction. + * @mdev: the mediated device */ struct ap_matrix_mdev { struct vfio_device vdev; @@ -90,6 +101,14 @@ struct ap_matrix_mdev { struct mdev_device *mdev; }; +/** + * struct vfio_ap_queue - contains the data associated with a queue bound to the + * vfio_ap device driver + * @matrix_mdev: the matrix mediated device + * @saved_pfn: the guest PFN pinned for the guest + * @apqn: the APQN of the AP queue device + * @saved_isc: the guest ISC registered with the GIB interface + */ struct vfio_ap_queue { struct ap_matrix_mdev *matrix_mdev; unsigned long saved_pfn; From ad9a14517263a16af040598c7920c09ca9670a31 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Wed, 8 Sep 2021 17:36:23 +0200 Subject: [PATCH 168/512] s390/cio: make ccw_device_dma_* more robust Since commit 48720ba56891 ("virtio/s390: use DMA memory for ccw I/O and classic notifiers") we were supposed to make sure that virtio_ccw_release_dev() completes before the ccw device and the attached dma pool are torn down, but unfortunately we did not. Before that commit it used to be OK to delay cleaning up the memory allocated by virtio-ccw indefinitely (which isn't really intuitive for guys used to destruction happens in reverse construction order), but now we trigger a BUG_ON if the genpool is destroyed before all memory allocated from it is deallocated. 
Which brings down the guest. We can observe this problem, when unregister_virtio_device() does not give up the last reference to the virtio_device (e.g. because a virtio-scsi attached scsi disk got removed without previously unmounting its previously mounted partition). To make sure that the genpool is only destroyed after all the necessary freeing is done let us take a reference on the ccw device on each ccw_device_dma_zalloc() and give it up on each ccw_device_dma_free(). Actually there are multiple approaches to fixing the problem at hand that can work. The upside of this one is that it is the safest one while remaining simple. We don't crash the guest even if the driver does not pair allocations and frees. The downside is the reference counting overhead, that the reference counting for ccw devices becomes more complex, in a sense that we need to pair the calls to the aforementioned functions for it to be correct, and that if we happen to leak, we leak more than necessary (the whole ccw device instead of just the genpool). Some alternatives to this approach are taking a reference in virtio_ccw_online() and giving it up in virtio_ccw_release_dev() or making sure virtio_ccw_release_dev() completes its work before virtio_ccw_remove() returns. The downside of these approaches is that these are less safe against programming errors. Cc: # v5.3 Signed-off-by: Halil Pasic Fixes: 48720ba56891 ("virtio/s390: use DMA memory for ccw I/O and classic notifiers") Reported-by: bfu@redhat.com Reviewed-by: Vineeth Vijayan Acked-by: Cornelia Huck Signed-off-by: Vasily Gorbik --- drivers/s390/cio/device_ops.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index 0fe7b2f2e7f5..c533d1dadc6b 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -825,13 +825,23 @@ EXPORT_SYMBOL_GPL(ccw_device_get_chid); */ void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size) { - return cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); + void *addr; + + if (!get_device(&cdev->dev)) + return NULL; + addr = cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); + if (IS_ERR_OR_NULL(addr)) + put_device(&cdev->dev); + return addr; } EXPORT_SYMBOL(ccw_device_dma_zalloc); void ccw_device_dma_free(struct ccw_device *cdev, void *cpu_addr, size_t size) { + if (!cpu_addr) + return; cio_gp_dma_free(cdev->private->dma_pool, cpu_addr, size); + put_device(&cdev->dev); } EXPORT_SYMBOL(ccw_device_dma_free); From 132c1e74aa7f8f9d33552645d2c35d3d8f9f0cf1 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 20 Oct 2021 16:14:11 +0200 Subject: [PATCH 169/512] s390/ap: function rework based on compiler warning Slight rework of function __ap_revise_reserved() because of unused variable warning when build with W=1. This patch introduces an additional debug feature warning message when device_reprobe() returns with failure. However, the return value of __ap_revise_reserved() is still hard coded to 0 as this is a callback function to be used together with bus_for_each_dev() and thus the return value indicates to go on with the bus_for_each_dev() loop and not apport on a failure of something within this function. 
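A sketch of the bus_for_each_dev() contract this refers to; example_revise_one() is a made-up name, while device_reprobe() and ap_bus_type are the real interfaces used by the function being reworked. Iteration stops as soon as a callback returns non-zero, so a helper that should only warn about a failed reprobe and keep scanning the bus must return 0.

  #include <linux/device.h>

  static int example_revise_one(struct device *dev, void *dummy)
  {
          int rc = device_reprobe(dev);

          if (rc)
                  dev_warn(dev, "reprobe failed, rc=%d\n", rc);
          return 0;    /* continue with the next device on the bus */
  }

  /* usage: bus_for_each_dev(&ap_bus_type, NULL, NULL, example_revise_one); */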
Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 806d184482a9..1986243f9cd3 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -793,6 +793,9 @@ static int __ap_revise_reserved(struct device *dev, void *dummy) AP_DBF_DBG("%s reprobing queue=%02x.%04x\n", __func__, card, queue); rc = device_reprobe(dev); + if (rc) + AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n", + __func__, card, queue); } } From eec013bbf66fd17fc5671de6744913cb68036a00 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Oct 2021 13:40:32 +0200 Subject: [PATCH 170/512] s390/string: use generic strrchr Use generic strrchr instead of an optimized architecture specific variant. Performance of strrchr is not relevant for real life workloads, since the only user which may call this more frequently would be kbasename(). Suggested-by: Linus Torvalds Link: https://lore.kernel.org/lkml/CAHk-=whoe211F8ND-9hZvfnib0UA4gga8DZJ+YaBZNbE4fubdg@mail.gmail.com/ Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/string.h | 2 -- arch/s390/lib/string.c | 19 ------------------- 2 files changed, 21 deletions(-) diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 4fd66c5e8934..4f418850bfd6 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -36,7 +36,6 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_STRNCAT /* arch function */ #define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ -#define __HAVE_ARCH_STRRCHR /* arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ /* Prototypes for non-inlined arch strings functions. */ @@ -46,7 +45,6 @@ size_t strlcat(char *dest, const char *src, size_t n); size_t strlcpy(char *dest, const char *src, size_t size); char *strncat(char *dest, const char *src, size_t n); char *strncpy(char *dest, const char *src, size_t n); -char *strrchr(const char *s, int c); char *strstr(const char *s1, const char *s2); #endif /* !CONFIG_KASAN */ diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index a95ca6df4e5e..4c2d1a8949cc 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -251,25 +251,6 @@ int strcmp(const char *s1, const char *s2) EXPORT_SYMBOL(strcmp); #endif -/** - * strrchr - Find the last occurrence of a character in a string - * @s: The string to be searched - * @c: The character to search for - */ -#ifdef __HAVE_ARCH_STRRCHR -char *strrchr(const char *s, int c) -{ - ssize_t len = __strend(s) - s; - - do { - if (s[len] == (char)c) - return (char *)s + len; - } while (--len >= 0); - return NULL; -} -EXPORT_SYMBOL(strrchr); -#endif - static inline int clcle(const char *s1, unsigned long l1, const char *s2, unsigned long l2) { From f492bac3b6c8f95c1b543bced9dc3818f342b6f0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Oct 2021 14:43:10 +0200 Subject: [PATCH 171/512] s390/string: use generic strlcpy The generic version of strlcpy is identical to the architecure specific variant. Therefore use the generic variant. 
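For reference, a sketch of the contract both variants provide; strlcpy_sketch() is an illustrative name, not the kernel's lib/string.c code. The return value is the full length of src, at most size - 1 bytes are copied, the result is always NUL-terminated when size is non-zero, and no padding is written (unlike strncpy()).

  #include <string.h>

  size_t strlcpy_sketch(char *dest, const char *src, size_t size)
  {
          size_t ret = strlen(src);

          if (size) {
                  size_t len = (ret >= size) ? size - 1 : ret;

                  memcpy(dest, src, len);
                  dest[len] = '\0';
          }
          return ret;
  }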
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/string.h | 2 -- arch/s390/lib/string.c | 26 -------------------------- 2 files changed, 28 deletions(-) diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 4f418850bfd6..3fae93ddb322 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -31,7 +31,6 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_STRCMP /* arch function */ #define __HAVE_ARCH_STRCPY /* inline & arch function */ #define __HAVE_ARCH_STRLCAT /* arch function */ -#define __HAVE_ARCH_STRLCPY /* arch function */ #define __HAVE_ARCH_STRLEN /* inline & arch function */ #define __HAVE_ARCH_STRNCAT /* arch function */ #define __HAVE_ARCH_STRNCPY /* arch function */ @@ -42,7 +41,6 @@ void *memmove(void *dest, const void *src, size_t n); int memcmp(const void *s1, const void *s2, size_t n); int strcmp(const char *s1, const char *s2); size_t strlcat(char *dest, const char *src, size_t n); -size_t strlcpy(char *dest, const char *src, size_t size); char *strncat(char *dest, const char *src, size_t n); char *strncpy(char *dest, const char *src, size_t n); char *strstr(const char *s1, const char *s2); diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 4c2d1a8949cc..d1955eabda28 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -97,32 +97,6 @@ char *strcpy(char *dest, const char *src) EXPORT_SYMBOL(strcpy); #endif -/** - * strlcpy - Copy a %NUL terminated string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @size: size of destination buffer - * - * Compatible with *BSD: the result is always a valid - * NUL-terminated string that fits in the buffer (unless, - * of course, the buffer size is zero). It does not pad - * out the result like strncpy() does. - */ -#ifdef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *dest, const char *src, size_t size) -{ - size_t ret = __strend(src) - src; - - if (size) { - size_t len = (ret >= size) ? size-1 : ret; - dest[len] = '\0'; - memcpy(dest, src, len); - } - return ret; -} -EXPORT_SYMBOL(strlcpy); -#endif - /** * strncpy - Copy a length-limited, %NUL-terminated string * @dest: Where to copy the string to From 74e74f9cb3dea0372bb317fd5a09c2762567f4f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Oct 2021 15:14:43 +0200 Subject: [PATCH 172/512] s390/spinlock: remove incorrect kernel doc indicator Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/lib/spinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 9b2dab5a69f9..692dc84cd19c 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -26,7 +26,7 @@ static int __init spin_retry_init(void) } early_initcall(spin_retry_init); -/** +/* * spin_retry= parameter */ static int __init spin_retry_setup(char *str) From 6aefbf1cdf0018c5af53a3fa300a61fef0f046b6 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 14 Sep 2021 09:26:49 +0200 Subject: [PATCH 173/512] s390/pci: add s390_iommu_aperture kernel parameter Some applications map the same memory area for DMA multiple times while also mapping significant amounts of memory. 
With our current DMA code these applications will run out of DMA addresses after mapping half of the available memory because the number of DMA mappings is constrained by the number of concurrently active DMA addresses we support which in turn is limited by the minimum of hardware constraints and high_memory. Limiting the number of active DMA addresses to high_memory is only a heuristic to save memory used by the iommu_bitmap and DMA page tables however. This was added under the assumption that it rarely makes sense to DMA map more than system memory. To accommodate special applications which insist on double mapping, which works on other platforms, allow specifying a factor of how many times installed memory is available as DMA address space. Use 0 as a special value to apply no constraints beyond what hardware dictates at the expense of significantly more memory use. Reviewed-by: Matthew Rosato Reviewed-by: Pierre Morel Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- .../admin-guide/kernel-parameters.txt | 12 +++++++++ arch/s390/pci/pci_dma.c | 25 +++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 91ba391f9b32..078a04890f69 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4982,6 +4982,18 @@ an IOTLB flush. Default is lazy flushing before reuse, which is faster. + s390_iommu_aperture= [KNL,S390] + Specifies the size of the per device DMA address space + accessible through the DMA and IOMMU APIs as a decimal + factor of the size of main memory. + The default is 1 meaning that one can concurrently use + as many DMA addresses as physical memory is installed, + if supported by hardware, and thus map all of memory + once. With a value of 2 one can map all of memory twice + and so on. As a special case a factor of 0 imposes no + restrictions other than those given by hardware at the + cost of significant additional memory use for tables. + sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 93223bd110c3..1f4540d6bd2d 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -18,6 +18,8 @@ static struct kmem_cache *dma_region_table_cache; static struct kmem_cache *dma_page_table_cache; static int s390_iommu_strict; +static u64 s390_iommu_aperture; +static u32 s390_iommu_aperture_factor = 1; static int zpci_refresh_global(struct zpci_dev *zdev) { @@ -565,15 +567,19 @@ int zpci_dma_init_device(struct zpci_dev *zdev) /* * Restrict the iommu bitmap size to the minimum of the following: - * - main memory size + * - s390_iommu_aperture which defaults to high_memory * - 3-level pagetable address limit minus start_dma offset * - DMA address range allowed by the hardware (clp query pci fn) * * Also set zdev->end_dma to the actual end address of the usable * range, instead of the theoretical maximum as reported by hardware. + * + * This limits the number of concurrently usable DMA mappings since + * for each DMA mapped memory address we need a DMA address including + * extra DMA addresses for multiple mappings of the same memory address. 
*/ zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - zdev->iommu_size = min3((u64) high_memory, + zdev->iommu_size = min3(s390_iommu_aperture, ZPCI_TABLE_SIZE_RT - zdev->start_dma, zdev->end_dma - zdev->start_dma + 1); zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; @@ -660,6 +666,12 @@ static int __init dma_alloc_cpu_table_caches(void) int __init zpci_dma_init(void) { + s390_iommu_aperture = (u64)high_memory; + if (!s390_iommu_aperture_factor) + s390_iommu_aperture = ULONG_MAX; + else + s390_iommu_aperture *= s390_iommu_aperture_factor; + return dma_alloc_cpu_table_caches(); } @@ -692,3 +704,12 @@ static int __init s390_iommu_setup(char *str) } __setup("s390_iommu=", s390_iommu_setup); + +static int __init s390_iommu_aperture_setup(char *str) +{ + if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) + s390_iommu_aperture_factor = 1; + return 1; +} + +__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); From 277c8389386e2ccb8417afe4e36f67fc5dcd735d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 30 Sep 2021 09:34:13 +0200 Subject: [PATCH 174/512] s390/kexec_file: move kernel image size check In preparation of adding support for command lines with variable sizes on s390, the check whether the new kernel image is at least HEAD_END bytes long isn't correct. Move the check to kexec_file_add_components() so we can get the size of the parm area and check the size there. The '.org HEAD_END' directive can now also be removed from head.S. This was used in the past to reserve space for the early sccb buffer, but with commit 9a5131b87cac1 ("s390/boot: move sclp early buffer from fixed address in asm to C") this is no longer required. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/boot/head.S | 2 -- arch/s390/include/asm/setup.h | 1 - arch/s390/kernel/machine_kexec_file.c | 17 ++--------------- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 7e843d8e794e..5d3fc69a505c 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -400,5 +400,3 @@ SYM_DATA_START(parmarea) .byte 0 .org PARMAREA+__PARMAREA_SIZE SYM_DATA_END(parmarea) - - .org HEAD_END diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index b6606ffd85d8..121e1a8c41d7 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -11,7 +11,6 @@ #include #define PARMAREA 0x10400 -#define HEAD_END 0x11000 /* * Machine features detected in early.c diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index f9e4baa64b67..55382ffde9d4 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -227,7 +227,8 @@ void *kexec_file_add_components(struct kimage *image, if (ret) goto out; - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { + if (image->kernel_buf_len < PARMAREA + sizeof(struct parmarea) || + image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { ret = -EINVAL; goto out; } @@ -307,17 +308,3 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, } return 0; } - -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - /* A kernel must be at least large enough to contain head.S. During - * load memory in head.S will be accessed, e.g. to register the next - * command line. If the next kernel were smaller the current kernel - * will panic at load. 
- */ - if (buf_len < HEAD_END) - return -ENOEXEC; - - return kexec_image_probe_default(image, buf, buf_len); -} From 5ecb2da660ab8eddafe059a6a8a708465db89ca2 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 23 Sep 2021 21:22:52 +0200 Subject: [PATCH 175/512] s390: support command lines longer than 896 bytes Currently s390 supports a fixed maximum command line length of 896 bytes. This isn't enough as some installers are trying to pass all configuration data via kernel command line, and even with zfcp alone it is easy to generate really long command lines. Therefore extend the command line to 4 kbytes. In the parm area where the command line is stored there is no indication of the maximum allowed length, so a new field which contains the maximum length is added. The parm area has always been initialized to zero, so with old kernels this field would read zero. This is important because tools like zipl could read this field. If it contains a number larger than zero zipl knows the maximum length that can be stored in the parm area, otherwise it must assume that it is booting a legacy kernel and only 896 bytes are available. The removing of trailing whitespace in head.S is also removed because code to do this is already present in setup_boot_command_line(). Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/boot/head.S | 35 +++++++++------------------ arch/s390/boot/ipl_parm.c | 4 +-- arch/s390/include/asm/setup.h | 7 ++++-- arch/s390/include/uapi/asm/setup.h | 2 -- arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/early.c | 2 +- arch/s390/kernel/machine_kexec_file.c | 22 ++++++++++++++--- 7 files changed, 39 insertions(+), 34 deletions(-) diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 5d3fc69a505c..3a252d140c55 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -184,35 +184,23 @@ iplstart: bas %r14,.Lloader # load parameter file ltr %r2,%r2 # got anything ? bz .Lnopf - chi %r2,895 - bnh .Lnotrunc - la %r2,895 + l %r3,MAX_COMMAND_LINE_SIZE+ARCH_OFFSET-PARMAREA(%r12) + ahi %r3,-1 + clr %r2,%r3 + bl .Lnotrunc + lr %r2,%r3 .Lnotrunc: l %r4,.Linitrd clc 0(3,%r4),.L_hdr # if it is HDRx bz .Lagain1 # skip dataset header clc 0(3,%r4),.L_eof # if it is EOFx bz .Lagain1 # skip dateset trailer - la %r5,0(%r4,%r2) - lr %r3,%r2 - la %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. of command line - mvc 0(256,%r3),0(%r4) - mvc 256(256,%r3),256(%r4) - mvc 512(256,%r3),512(%r4) - mvc 768(122,%r3),768(%r4) - slr %r0,%r0 - b .Lcntlp -.Ldelspc: - ic %r0,0(%r2,%r3) - chi %r0,0x20 # is it a space ? 
- be .Lcntlp - ahi %r2,1 - b .Leolp -.Lcntlp: - brct %r2,.Ldelspc -.Leolp: - slr %r0,%r0 - stc %r0,0(%r2,%r3) # terminate buffer + + lr %r5,%r2 + la %r6,COMMAND_LINE-PARMAREA(%r12) + lr %r7,%r2 + ahi %r7,1 + mvcl %r6,%r4 .Lnopf: # @@ -394,6 +382,7 @@ SYM_DATA_START(parmarea) .quad 0 # OLDMEM_BASE .quad 0 # OLDMEM_SIZE .quad kernel_version # points to kernel version string + .quad COMMAND_LINE_SIZE .org COMMAND_LINE .byte "root=/dev/ram0 ro" diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 0f84c072625e..9ed7e29c81d9 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -170,10 +170,10 @@ static inline int has_ebcdic_char(const char *str) void setup_boot_command_line(void) { - parmarea.command_line[ARCH_COMMAND_LINE_SIZE - 1] = 0; + parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0; /* convert arch command line to ascii if necessary */ if (has_ebcdic_char(parmarea.command_line)) - EBCASC(parmarea.command_line, ARCH_COMMAND_LINE_SIZE); + EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); /* copy arch command line */ strcpy(early_command_line, strim(parmarea.command_line)); diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 121e1a8c41d7..d718029794e2 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -42,6 +42,8 @@ #define STARTUP_NORMAL_OFFSET 0x10000 #define STARTUP_KDUMP_OFFSET 0x10010 +#define LEGACY_COMMAND_LINE_SIZE 896 + #ifndef __ASSEMBLY__ #include @@ -54,8 +56,9 @@ struct parmarea { unsigned long oldmem_base; /* 0x10418 */ unsigned long oldmem_size; /* 0x10420 */ unsigned long kernel_version; /* 0x10428 */ - char pad1[0x10480 - 0x10430]; /* 0x10430 - 0x10480 */ - char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ + unsigned long max_command_line_size; /* 0x10430 */ + char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */ + char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */ }; extern struct parmarea parmarea; diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h index 1f8803a31079..9b685536c31c 100644 --- a/arch/s390/include/uapi/asm/setup.h +++ b/arch/s390/include/uapi/asm/setup.h @@ -9,6 +9,4 @@ #define COMMAND_LINE_SIZE 4096 -#define ARCH_COMMAND_LINE_SIZE 896 - #endif /* _UAPI_ASM_S390_SETUP_H */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 28177e4f52cc..8e00bb228662 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -164,5 +164,6 @@ int main(void) DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base)); DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); + DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); return 0; } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index ba3ace5ac73c..3cdf68c53614 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -280,7 +280,7 @@ char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE); + strlcpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } static void __init check_image_bootable(void) diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 55382ffde9d4..528edff085d9 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ 
b/arch/s390/kernel/machine_kexec_file.c @@ -216,7 +216,9 @@ void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)) { + unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE; struct s390_load_data data = {0}; + unsigned long minsize; int ret; data.report = ipl_report_init(&ipl_block); @@ -227,11 +229,23 @@ void *kexec_file_add_components(struct kimage *image, if (ret) goto out; - if (image->kernel_buf_len < PARMAREA + sizeof(struct parmarea) || - image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { - ret = -EINVAL; + ret = -EINVAL; + minsize = PARMAREA + offsetof(struct parmarea, command_line); + if (image->kernel_buf_len < minsize) goto out; - } + + if (data.parm->max_command_line_size) + max_command_line_size = data.parm->max_command_line_size; + + if (minsize + max_command_line_size < minsize) + goto out; + + if (image->kernel_buf_len < minsize + max_command_line_size) + goto out; + + if (image->cmdline_buf_len >= max_command_line_size) + goto out; + memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); From 622021cd6c560ce7aaaf7294a732177a30c9d65f Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 29 Sep 2021 09:09:34 +0200 Subject: [PATCH 176/512] s390: make command line configurable Allow to configure the command line to an arbitrary length, with a default of 4096 bytes. Also remove COMMAND_LINE_SIZE from include/uapi/asm/setup.h as this is dynamic now and doesn't tell anything about the command line size limitations of a new kernel that might be loaded. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 8 ++++++++ arch/s390/include/asm/setup.h | 1 + arch/s390/include/uapi/asm/setup.h | 11 ----------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d0d42c5c4141..8857ec3b97eb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -438,6 +438,14 @@ endchoice config 64BIT def_bool y +config COMMAND_LINE_SIZE + int "Maximum size of kernel command line" + default 4096 + range 896 1048576 + help + This allows you to specify the maximum length of the kernel command + line. + config COMPAT def_bool y prompt "Kernel support for 31 bit emulation" diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index d718029794e2..77e6506898f5 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -12,6 +12,7 @@ #define PARMAREA 0x10400 +#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE /* * Machine features detected in early.c */ diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h index 9b685536c31c..598d769e76df 100644 --- a/arch/s390/include/uapi/asm/setup.h +++ b/arch/s390/include/uapi/asm/setup.h @@ -1,12 +1 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * Copyright IBM Corp. 1999, 2010 - */ - -#ifndef _UAPI_ASM_S390_SETUP_H -#define _UAPI_ASM_S390_SETUP_H - -#define COMMAND_LINE_SIZE 4096 - -#endif /* _UAPI_ASM_S390_SETUP_H */ From ff5d3bb6e16d644eabb675ec1c6ef242239ca5f1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 10 Sep 2021 17:14:17 +0100 Subject: [PATCH 177/512] PCI: Remove redundant 'rc' initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable 'rc' is being initialized with a value that is never read. Remove the redundant assignment. 
Addresses-Coverity: ("Unused value") Link: https://lore.kernel.org/r/20210910161417.91001-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7998b65e9ae5..d72c0281eb54 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5288,7 +5288,7 @@ const struct attribute_group pci_dev_reset_method_attr_group = { */ int __pci_reset_function_locked(struct pci_dev *dev) { - int i, m, rc = -ENOTTY; + int i, m, rc; might_sleep(); From 9baf93d68bcc3d0a6042283b82603c076e25e4f5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Oct 2021 16:27:16 -0300 Subject: [PATCH 178/512] fsnotify: pass data_type to fsnotify_name() Align the arguments of fsnotify_name() to those of fsnotify(). Link: https://lore.kernel.org/r/20211025192746.66445-2-krisman@collabora.com Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 12d3a7d308ab..d1144d7c3536 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -26,20 +26,21 @@ * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and not the parent. */ -static inline void fsnotify_name(struct inode *dir, __u32 mask, - struct inode *child, - const struct qstr *name, u32 cookie) +static inline int fsnotify_name(__u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) { if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) - return; + return 0; - fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie); + return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { - fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0); + fsnotify_name(mask, d_inode(dentry), FSNOTIFY_EVENT_INODE, + dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) @@ -154,8 +155,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_ISDIR; } - fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie); - fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie); + fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, + old_dir, old_name, fs_cookie); + fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, + new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); @@ -209,7 +212,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0); + fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, + dir, &new_dentry->d_name, 0); } /* From fd5a3ff49a19aa69e2bc1e26e98037c2d778e61a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Oct 2021 16:27:17 -0300 Subject: [PATCH 179/512] fsnotify: pass dentry instead of inode data Define a new data type to pass for event - FSNOTIFY_EVENT_DENTRY. Use it to pass the dentry instead of it's ->d_inode where available. 
This is needed in preparation to the refactor to retrieve the super block from the data field. In some cases (i.e. mkdir in kernfs), the data inode comes from a negative dentry, such that no super block information would be available. By receiving the dentry itself, instead of the inode, fsnotify can derive the super block even on these cases. Link: https://lore.kernel.org/r/20211025192746.66445-3-krisman@collabora.com Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein [Expand explanation in commit message] Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 5 ++--- include/linux/fsnotify_backend.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index d1144d7c3536..df0fa4687a18 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -39,8 +39,7 @@ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { - fsnotify_name(mask, d_inode(dentry), FSNOTIFY_EVENT_INODE, - dir, &dentry->d_name, 0); + fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) @@ -87,7 +86,7 @@ notify_child: */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { - fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE); + fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_file(struct file *file, __u32 mask) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 1ce66748a2d2..a2db821e8a8f 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -248,6 +248,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, + FSNOTIFY_EVENT_DENTRY, }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) @@ -255,6 +256,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; + case FSNOTIFY_EVENT_DENTRY: + return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); default: @@ -262,6 +265,19 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) } } +static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_DENTRY: + /* Non const is needed for dget() */ + return (struct dentry *)data; + case FSNOTIFY_EVENT_PATH: + return ((const struct path *)data)->dentry; + default: + return NULL; + } +} + static inline const struct path *fsnotify_data_path(const void *data, int data_type) { From dabe729dddca550446e9cc118c96d1f91703345b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Oct 2021 16:27:18 -0300 Subject: [PATCH 180/512] fsnotify: clarify contract for create event hooks Clarify argument names and contract for fsnotify_create() and fsnotify_mkdir() to reflect the anomaly of kernfs, which leaves dentries negavite after mkdir/create. Remove the WARN_ON(!inode) in audit code that were added by the Fixes commit under the wrong assumption that dentries cannot be negative after mkdir/create. 
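A hedged sketch of what this means for an event handler; example_handle_event() is a made-up name following the shape of the handle_inode_event() callback and is not part of this patch. A create/mkdir event may be reported while the dentry is still negative (as kernfs does), so a NULL inode is a legitimate state to check for rather than something to WARN about.

  #include <linux/fsnotify_backend.h>
  #include <linux/printk.h>

  static int example_handle_event(struct fsnotify_mark *mark, u32 mask,
                                  struct inode *inode, struct inode *dir,
                                  const struct qstr *name, u32 cookie)
  {
          if (mask & (FS_CREATE | FS_MOVED_TO) && inode)
                  pr_debug("%s: inode already instantiated\n", __func__);
          return 0;
  }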
Fixes: aa93bdc5500c ("fsnotify: use helpers to access data by data_type") Link: https://lore.kernel.org/linux-fsdevel/87mtp5yz0q.fsf@collabora.com/ Link: https://lore.kernel.org/r/20211025192746.66445-4-krisman@collabora.com Reviewed-by: Jan Kara Reported-by: Gabriel Krisman Bertazi Signed-off-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 22 ++++++++++++++++------ kernel/audit_fsnotify.c | 3 +-- kernel/audit_watch.c | 3 +-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index df0fa4687a18..1e5f7435a4b5 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -192,16 +192,22 @@ static inline void fsnotify_inoderemove(struct inode *inode) /* * fsnotify_create - 'name' was linked in + * + * Caller must make sure that dentry->d_name is stable. + * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate + * ->d_inode later */ -static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) +static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { - audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); + audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_dirent(inode, dentry, FS_CREATE); + fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory + * + * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ @@ -230,12 +236,16 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) /* * fsnotify_mkdir - directory 'name' was created + * + * Caller must make sure that dentry->d_name is stable. + * Note: some filesystems (e.g. 
kernfs) leave @dentry negative and instantiate + * ->d_inode later */ -static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) +static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { - audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); + audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR); + fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 60739d5e3373..02348b48447c 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); - if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) || - WARN_ON_ONCE(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2acf7ca49154..223eed7b39cd 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -472,8 +472,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, parent = container_of(inode_mark, struct audit_parent, mark); - if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) || - WARN_ON_ONCE(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_watch_group)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO) && inode) From cc53b55f697fe5aa98bdbfdfe67c6401da242155 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:19 -0300 Subject: [PATCH 181/512] fsnotify: Don't insert unmergeable events in hashtable Some events, like the overflow event, are not mergeable, so they are not hashed. But, when failing inside fsnotify_add_event for lack of space, fsnotify_add_event() still calls the insert hook, which adds the overflow event to the merge list. Add a check to prevent any kind of unmergeable event to be inserted in the hashtable. Fixes: 94e00d28a680 ("fsnotify: use hash table for faster events merge") Link: https://lore.kernel.org/r/20211025192746.66445-5-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 057abd2cf887..310246f8d3f1 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -702,6 +702,9 @@ static void fanotify_insert_event(struct fsnotify_group *group, assert_spin_locked(&group->notification_lock); + if (!fanotify_is_hashed_event(event->mask)) + return; + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, group, event, bucket); @@ -779,8 +782,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, fsn_event = &event->fse; ret = fsnotify_add_event(group, fsn_event, fanotify_merge, - fanotify_is_hashed_event(mask) ? 
- fanotify_insert_event : NULL); + fanotify_insert_event); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); From b9928e80dda84b349ba8de01780b9bef2fc36ffa Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:20 -0300 Subject: [PATCH 182/512] fanotify: Fold event size calculation to its own function Every time this function is invoked, it is immediately added to FAN_EVENT_METADATA_LEN, since there is no need to just calculate the length of info records. This minor clean up folds the rest of the calculation into the function, which now operates in terms of events, returning the size of the entire event, including metadata. Link: https://lore.kernel.org/r/20211025192746.66445-6-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 35 +++++++++++++++++------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6facdf476255..6895ec310b5d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -126,17 +126,24 @@ static int fanotify_fid_info_len(int fh_len, int name_len) FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int info_mode, - struct fanotify_event *event) +static size_t fanotify_event_len(unsigned int info_mode, + struct fanotify_event *event) { - struct fanotify_info *info = fanotify_event_info(event); - int dir_fh_len = fanotify_event_dir_fh_len(event); - int fh_len = fanotify_event_object_fh_len(event); - int info_len = 0; + size_t event_len = FAN_EVENT_METADATA_LEN; + struct fanotify_info *info; + int dir_fh_len; + int fh_len; int dot_len = 0; + if (!info_mode) + return event_len; + + info = fanotify_event_info(event); + dir_fh_len = fanotify_event_dir_fh_len(event); + fh_len = fanotify_event_object_fh_len(event); + if (dir_fh_len) { - info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + event_len += fanotify_fid_info_len(dir_fh_len, info->name_len); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* @@ -147,12 +154,12 @@ static int fanotify_event_info_len(unsigned int info_mode, } if (info_mode & FAN_REPORT_PIDFD) - info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; if (fh_len) - info_len += fanotify_fid_info_len(fh_len, dot_len); + event_len += fanotify_fid_info_len(fh_len, dot_len); - return info_len; + return event_len; } /* @@ -181,7 +188,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group, static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - size_t event_size = FAN_EVENT_METADATA_LEN; + size_t event_size; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); @@ -194,8 +201,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (info_mode) - event_size += fanotify_event_info_len(info_mode, event); + event_size = fanotify_event_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -537,8 +543,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(info_mode, event); + 
metadata.event_len = fanotify_event_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; From 8299212cbdb01a5867e230e961f82e5c02a6de34 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:21 -0300 Subject: [PATCH 183/512] fanotify: Split fsid check from other fid mode checks FAN_FS_ERROR will require fsid, but not necessarily require the filesystem to expose a file handle. Split those checks into different functions, so they can be used separately when setting up an event. While there, update a comment about tmpfs having 0 fsid, which is no longer true. Link: https://lore.kernel.org/r/20211025192746.66445-7-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6895ec310b5d..adeae6d65e35 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1300,16 +1300,15 @@ out_destroy_group: return fd; } -/* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) { __kernel_fsid_t root_fsid; int err; /* - * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). + * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(path->dentry, fsid); + err = vfs_get_fsid(dentry, fsid); if (err) return err; @@ -1317,10 +1316,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) return -ENODEV; /* - * Make sure path is not inside a filesystem subvolume (e.g. btrfs) + * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root. */ - err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); + err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err; @@ -1328,6 +1327,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) root_fsid.val[1] != fsid->val[1]) return -EXDEV; + return 0; +} + +/* Check if filesystem can encode a unique fid */ +static int fanotify_test_fid(struct dentry *dentry) +{ /* * We need to make sure that the file system supports at least * encoding a file handle so user can use name_to_handle_at() to @@ -1335,8 +1340,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) * objects. However, name_to_handle_at() requires that the * filesystem also supports decoding file handles. 
*/ - if (!path->dentry->d_sb->s_export_op || - !path->dentry->d_sb->s_export_op->fh_to_dentry) + if (!dentry->d_sb->s_export_op || + !dentry->d_sb->s_export_op->fh_to_dentry) return -EOPNOTSUPP; return 0; @@ -1487,7 +1492,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fid(&path, &__fsid); + ret = fanotify_test_fsid(path.dentry, &__fsid); + if (ret) + goto path_put_and_out; + + ret = fanotify_test_fid(path.dentry); if (ret) goto path_put_and_out; From e0462f91d24756916fded4313d508e0fc52f39c9 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:22 -0300 Subject: [PATCH 184/512] inotify: Don't force FS_IN_IGNORED According to Amir: "FS_IN_IGNORED is completely internal to inotify and there is no need to set it in i_fsnotify_mask at all, so if we remove the bit from the output of inotify_arg_to_mask() no functionality will change and we will be able to overload the event bit for FS_ERROR." This is done in preparation to overload FS_ERROR with the notification mechanism in fanotify. Link: https://lore.kernel.org/r/20211025192746.66445-8-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/inotify/inotify_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 62051247f6d2..29fca3284bb5 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -94,10 +94,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) __u32 mask; /* - * Everything should accept their own ignored and should receive events - * when the inode is unmounted. All directories care about children. + * Everything should receive events when the inode is unmounted. + * All directories care about children. */ - mask = (FS_IN_IGNORED | FS_UNMOUNT); + mask = (FS_UNMOUNT); if (S_ISDIR(inode->i_mode)) mask |= FS_EVENT_ON_CHILD; From 808967a0a4d2f4ce6a2005c5692fffbecaf018c1 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:23 -0300 Subject: [PATCH 185/512] fsnotify: Add helper to detect overflow_event Similarly to fanotify_is_perm_event and friends, provide a helper predicate to say whether a mask is of an overflow event. 
Link: https://lore.kernel.org/r/20211025192746.66445-9-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 3 ++- include/linux/fsnotify_backend.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4a5e555dc3d2..c42cf8fd7d79 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -315,7 +315,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) */ static inline bool fanotify_is_hashed_event(u32 mask) { - return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW); + return !(fanotify_is_perm_event(mask) || + fsnotify_is_overflow_event(mask)); } static inline unsigned int fanotify_event_hash_bucket( diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a2db821e8a8f..749bc85e1d1c 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -510,6 +510,11 @@ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) fsnotify_add_event(group, group->overflow_event, NULL, NULL); } +static inline bool fsnotify_is_overflow_event(u32 mask) +{ + return mask & FS_Q_OVERFLOW; +} + static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); From 1ad03c3a326a86e259389592117252c851873395 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:24 -0300 Subject: [PATCH 186/512] fsnotify: Add wrapper around fsnotify_add_event fsnotify_add_event is growing in number of parameters, which in most case are just passed a NULL pointer. So, split out a new fsnotify_insert_event function to clean things up for users who don't need an insert hook. Link: https://lore.kernel.org/r/20211025192746.66445-10-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 4 ++-- fs/notify/inotify/inotify_fsnotify.c | 2 +- fs/notify/notification.c | 12 ++++++------ include/linux/fsnotify_backend.h | 23 ++++++++++++++++------- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 310246f8d3f1..f82e20228999 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -781,8 +781,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } fsn_event = &event->fse; - ret = fsnotify_add_event(group, fsn_event, fanotify_merge, - fanotify_insert_event); + ret = fsnotify_insert_event(group, fsn_event, fanotify_merge, + fanotify_insert_event); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d1a64daa0171..a96582cbfad1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, if (len) strcpy(event->name, name->name); - ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 32f45543b9c6..44bb10f50715 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * 2 if the event was not queued - either the queue of events has overflown * or the group is shutting down. */ -int fsnotify_add_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct fsnotify_group *, - struct fsnotify_event *), - void (*insert)(struct fsnotify_group *, - struct fsnotify_event *)) +int fsnotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 749bc85e1d1c..b323d0c4b967 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -498,16 +498,25 @@ extern int fsnotify_fasync(int fd, struct file *file, int on); extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern int fsnotify_add_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct fsnotify_group *, - struct fsnotify_event *), - void (*insert)(struct fsnotify_group *, - struct fsnotify_event *)); +extern int fsnotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)); + +static inline int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *)) +{ + return fsnotify_insert_event(group, event, merge, NULL); +} + /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { - fsnotify_add_event(group, group->overflow_event, NULL, NULL); + fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) From 29335033c574a15334015d8c4e36862cff3d3384 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:25 -0300 Subject: [PATCH 187/512] fsnotify: Retrieve super block from the data field Some file system events (i.e. FS_ERROR) might not be associated with an inode or directory. For these, we can retrieve the super block from the data field. But, since the super_block is available in the data field on every event type, simplify the code to always retrieve it from there, through a new helper. 
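As an illustration (not part of this patch), the new helper lets an event that carries only a dentry resolve its super block even while that dentry is still negative; the wrapper below is a hypothetical caller, only fsnotify_data_sb() and FSNOTIFY_EVENT_DENTRY come from this series:

static struct super_block *example_event_sb(struct dentry *dentry)
{
        /*
         * Works even for a negative dentry (d_inode == NULL), which is
         * exactly the kernfs mkdir case described earlier in the series.
         */
        return fsnotify_data_sb(dentry, FSNOTIFY_EVENT_DENTRY);
}
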
Link: https://lore.kernel.org/r/20211025192746.66445-11-krisman@collabora.com Suggested-by: Jan Kara Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fsnotify.c | 7 +++---- include/linux/fsnotify_backend.h | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 963e6ce75b96..fde3a1115a17 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -455,16 +455,16 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - - * either @dir or @inode must be non-NULL. - * if both are non-NULL event may be reported to both. + * If @dir and @inode are both non-NULL, event may be + * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); + struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_iter_info iter_info = {}; - struct super_block *sb; struct mount *mnt = NULL; struct inode *parent = NULL; int ret = 0; @@ -483,7 +483,6 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, */ parent = dir; } - sb = inode->i_sb; /* * Optimization: srcu_read_lock() has a memory barrier which can diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b323d0c4b967..035438fe4a43 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -289,6 +289,21 @@ static inline const struct path *fsnotify_data_path(const void *data, } } +static inline struct super_block *fsnotify_data_sb(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_INODE: + return ((struct inode *)data)->i_sb; + case FSNOTIFY_EVENT_DENTRY: + return ((struct dentry *)data)->d_sb; + case FSNOTIFY_EVENT_PATH: + return ((const struct path *)data)->dentry->d_sb; + default: + return NULL; + } +} + enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_PARENT, From 24dca90590509a7a6cbe0650100c90c5b8a3468a Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:26 -0300 Subject: [PATCH 188/512] fsnotify: Protect fsnotify_handle_inode_event from no-inode events FAN_FS_ERROR allows events without inodes - i.e. for file system-wide errors. Even though fsnotify_handle_inode_event is not currently used by fanotify, this patch protects other backends from cases where neither inode or dir are provided. Also document the constraints of the interface (inode and dir cannot be both NULL). 
Link: https://lore.kernel.org/r/20211025192746.66445-12-krisman@collabora.com Suggested-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 3 +++ fs/notify/fsnotify.c | 3 +++ include/linux/fsnotify_backend.h | 1 + 3 files changed, 7 insertions(+) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index be3c1aad50ea..fdf89fcf1a0c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -602,6 +602,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { + if (WARN_ON_ONCE(!inode)) + return 0; + trace_nfsd_file_fsnotify_handle_event(inode, mask); /* Should be no marks on non-regular files */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index fde3a1115a17..4034ca566f95 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -252,6 +252,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group, if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; + if (WARN_ON_ONCE(!inode && !dir)) + return 0; + if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 035438fe4a43..b71dc788018e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -136,6 +136,7 @@ struct mem_cgroup; * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. + * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * From 330ae77d2a5b0af32c0f29e139bf28ec8591de59 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:27 -0300 Subject: [PATCH 189/512] fsnotify: Pass group argument to free_event For group-wide mempool backed events, like FS_ERROR, the free_event callback will need to reference the group's mempool to free the memory. Wire that argument into the current callers. Link: https://lore.kernel.org/r/20211025192746.66445-13-krisman@collabora.com Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 3 ++- fs/notify/group.c | 2 +- fs/notify/inotify/inotify_fsnotify.c | 3 ++- fs/notify/notification.c | 2 +- include/linux/fsnotify_backend.h | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f82e20228999..c620b4f6fe12 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -835,7 +835,8 @@ static void fanotify_free_name_event(struct fanotify_event *event) kfree(FANOTIFY_NE(event)); } -static void fanotify_free_event(struct fsnotify_event *fsn_event) +static void fanotify_free_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) { struct fanotify_event *event; diff --git a/fs/notify/group.c b/fs/notify/group.c index fb89c351295d..6a297efc4788 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) * that deliberately ignores overflow events. 
*/ if (group->overflow_event) - group->ops->free_event(group->overflow_event); + group->ops->free_event(group, group->overflow_event); fsnotify_put_group(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index a96582cbfad1..d92d7b0adc9a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) dec_inotify_instances(group->inotify_data.ucounts); } -static void inotify_free_event(struct fsnotify_event *fsn_event) +static void inotify_free_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) { kfree(INOTIFY_E(fsn_event)); } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 44bb10f50715..9022ae650cf8 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group, WARN_ON(!list_empty(&event->list)); spin_unlock(&group->notification_lock); } - group->ops->free_event(event); + group->ops->free_event(group, event); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b71dc788018e..3a7c31436182 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -156,7 +156,7 @@ struct fsnotify_ops { const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); - void (*free_event)(struct fsnotify_event *event); + void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; From 12f47bf0f0990933d95d021d13d31bda010648fd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:28 -0300 Subject: [PATCH 190/512] fanotify: Support null inode event in fanotify_dfid_inode FAN_FS_ERROR doesn't support DFID, but this function is still called for every event. The problem is that it is not capable of handling null inodes, which now can happen in case of superblock error events. For this case, just returning dir will be enough. Link: https://lore.kernel.org/r/20211025192746.66445-14-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c620b4f6fe12..397ee623ff1e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -452,7 +452,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) return dir; - if (S_ISDIR(inode->i_mode)) + if (inode && S_ISDIR(inode->i_mode)) return inode; return dir; From 74fe4734897a2da2ae2a665a5e622cd490d36eaf Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:29 -0300 Subject: [PATCH 191/512] fanotify: Allow file handle encoding for unhashed events Allow passing a NULL hash to fanotify_encode_fh and avoid calculating the hash if not needed. 
Link: https://lore.kernel.org/r/20211025192746.66445-15-krisman@collabora.com Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 397ee623ff1e..ec84fee7ad01 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -403,8 +403,12 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = type; fh->len = fh_len; - /* Mix fh into event merge key */ - *hash ^= fanotify_hash_fh(fh); + /* + * Mix fh into event merge key. Hash might be NULL in case of + * unhashed FID events (i.e. FAN_FS_ERROR). + */ + if (hash) + *hash ^= fanotify_hash_fh(fh); return FANOTIFY_FH_HDR_LEN + fh_len; From 272531ac619b374ab474e989eb387162fded553f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:30 -0300 Subject: [PATCH 192/512] fanotify: Encode empty file handle when no inode is provided Instead of failing, encode an invalid file handle in fanotify_encode_fh if no inode is provided. This bogus file handle will be reported by FAN_FS_ERROR for non-inode errors. Link: https://lore.kernel.org/r/20211025192746.66445-16-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index ec84fee7ad01..c64d61b673ca 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -370,8 +370,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = FILEID_ROOT; fh->len = 0; fh->flags = 0; + + /* + * Invalid FHs are used by FAN_FS_ERROR for errors not + * linked to any inode. The f_handle won't be reported + * back to userspace. + */ if (!inode) - return 0; + goto out; /* * !gpf means preallocated variable size fh, but fh_len could @@ -403,6 +409,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = type; fh->len = fh_len; +out: /* * Mix fh into event merge key. Hash might be NULL in case of * unhashed FID events (i.e. FAN_FS_ERROR). From 4fe595cf1c80e7a5af4d00c4da29def64aff57a2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:31 -0300 Subject: [PATCH 193/512] fanotify: Require fid_mode for any non-fd event Like inode events, FAN_FS_ERROR will require fid mode. Therefore, convert the verification during fanotify_mark(2) to require fid for any non-fd event. This means fid_mode will not only be required for inode events, but for any event that doesn't provide a descriptor. 
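For illustration only, the group setup this requirement implies looks roughly like the userspace sketch below; FAN_FS_ERROR itself is only reserved a couple of patches later in this series, and the mount point path is a placeholder:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

int main(void)
{
        /* An FID-reporting group is required for events without an fd. */
        int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);

        if (fd < 0) {
                perror("fanotify_init");
                return 1;
        }

        /* Filesystem-wide mark; a mount mark would be rejected here. */
        if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
                          FAN_FS_ERROR, AT_FDCWD, "/mnt/test") < 0)
                perror("fanotify_mark");

        return 0;
}
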
Link: https://lore.kernel.org/r/20211025192746.66445-17-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 12 ++++++------ include/linux/fanotify.h | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index adeae6d65e35..66ee3c2805c7 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1458,14 +1458,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* - * Events with data type inode do not carry enough information to report - * event->fd, so we do not allow setting a mask for inode events unless - * group supports reporting fid. - * inode events are not supported on a mount mark, because they do not - * carry enough information (i.e. path) to be filtered by mount point. + * Events that do not carry enough information to report + * event->fd require a group that supports reporting fid. Those + * events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount + * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & FANOTIFY_INODE_EVENTS && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) goto fput_and_out; diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index eec3b7c40811..52d464802d99 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -84,6 +84,9 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ */ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) +/* Events that can be reported with event->fd */ +#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) + /* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */ #define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) From 9daa811073fa19c08e8aad3b90f9235fed161acf Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:32 -0300 Subject: [PATCH 194/512] fsnotify: Support FS_ERROR event type Expose a new type of fsnotify event for filesystems to report errors for userspace monitoring tools. fanotify will send this type of notification for FAN_FS_ERROR events. This also introduce a helper for generating the new event. 
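As a rough sketch of the intended call site (the function below is hypothetical; only fsnotify_sb_error() is introduced here), a filesystem error path reports through the helper and passes a NULL inode when the error is not tied to one:

/* Hypothetical caller; err follows whatever convention the filesystem
 * already uses for its internal error reporting. */
static void example_fs_report_error(struct super_block *sb,
                                    struct inode *inode, int err)
{
        /* inode may be NULL for superblock-wide problems */
        fsnotify_sb_error(sb, inode, err);
}
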
Link: https://lore.kernel.org/r/20211025192746.66445-18-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 13 +++++++++++++ include/linux/fsnotify_backend.h | 32 +++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1e5f7435a4b5..787545e87eeb 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -339,4 +339,17 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) fsnotify_dentry(dentry, mask); } +static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, + int error) +{ + struct fs_error_report report = { + .error = error, + .inode = inode, + .sb = sb, + }; + + return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, + NULL, NULL, NULL, 0); +} + #endif /* _LINUX_FS_NOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3a7c31436182..00dbaafbcf95 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -42,6 +42,12 @@ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ +#define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ + +/* + * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify + * which does not support FS_ERROR. + */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ @@ -95,7 +101,8 @@ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \ - FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED) + FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ + FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ @@ -250,6 +257,13 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_ERROR, +}; + +struct fs_error_report { + int error; + struct inode *inode; + struct super_block *sb; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) @@ -261,6 +275,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_ERROR: + return ((struct fs_error_report *)data)->inode; default: return NULL; } @@ -300,6 +316,20 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_ERROR: + return ((struct fs_error_report *) data)->sb; + default: + return NULL; + } +} + +static inline struct fs_error_report *fsnotify_data_error_report( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_ERROR: + return (struct fs_error_report *) data; default: return NULL; } From 8d11a4f43ef4679be0908026907a7613b33d7127 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:33 -0300 Subject: [PATCH 195/512] fanotify: Reserve UAPI bits for FAN_FS_ERROR FAN_FS_ERROR allows reporting of event type FS_ERROR to userspace, which is a mechanism to report file system wide problems via fanotify. 
This commit preallocate userspace visible bits to match the FS_ERROR event. Link: https://lore.kernel.org/r/20211025192746.66445-19-krisman@collabora.com Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 1 + include/uapi/linux/fanotify.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c64d61b673ca..8f152445d75c 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -752,6 +752,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); + BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 64553df9d735..2990731ddc8b 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -20,6 +20,7 @@ #define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ +#define FAN_FS_ERROR 0x00008000 /* Filesystem error */ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ From 734a1a5eccc5f7473002b0669f788e135f1f64aa Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:34 -0300 Subject: [PATCH 196/512] fanotify: Pre-allocate pool of error events Pre-allocate slots for file system errors to have greater chances of succeeding, since error events can happen in GFP_NOFS context. This patch introduces a group-wide mempool of error events, shared by all FAN_FS_ERROR marks in this group. 
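As a condensed, illustrative view (the real calls are split between fanotify_user.c here and the following patches), the pool lifecycle is: initialise lazily on the first FAN_FS_ERROR mark, allocate at event time in NOFS context, return events to the pool when freed, and tear it down with the group:

/* Illustrative summary only; not a real function in this series. */
static int example_error_pool_lifecycle(struct fsnotify_group *group)
{
        struct fanotify_error_event *fee;
        int ret;

        /* Done once, when the first FAN_FS_ERROR mark is added. */
        ret = mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
                                        FANOTIFY_DEFAULT_FEE_POOL_SIZE,
                                        sizeof(struct fanotify_error_event));
        if (ret)
                return ret;

        /* At event time: error events can fire in GFP_NOFS context. */
        fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS);
        if (fee)
                mempool_free(fee, &group->fanotify_data.error_events_pool);

        /* On group teardown. */
        mempool_exit(&group->fanotify_data.error_events_pool);
        return 0;
}
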
Link: https://lore.kernel.org/r/20211025192746.66445-20-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 3 +++ fs/notify/fanotify/fanotify.h | 11 +++++++++++ fs/notify/fanotify/fanotify_user.c | 26 +++++++++++++++++++++++++- include/linux/fsnotify_backend.h | 2 ++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 8f152445d75c..01d68dfc74aa 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -819,6 +819,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) if (group->fanotify_data.ucounts) dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_GROUPS); + + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + mempool_exit(&group->fanotify_data.error_events_pool); } static void fanotify_free_path_event(struct fanotify_event *event) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index c42cf8fd7d79..a577e87fac2b 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -141,6 +141,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH, FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ + FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ __FANOTIFY_EVENT_TYPE_NUM }; @@ -196,6 +197,16 @@ FANOTIFY_NE(struct fanotify_event *event) return container_of(event, struct fanotify_name_event, fae); } +struct fanotify_error_event { + struct fanotify_event fae; +}; + +static inline struct fanotify_error_event * +FANOTIFY_EE(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_error_event, fae); +} + static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_FID) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 66ee3c2805c7..2f4182b754b2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -30,6 +30,7 @@ #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_GROUPS 128 +#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32 /* * Legacy fanotify marks limits (8192) is per group and we introduced a tunable @@ -1054,6 +1055,15 @@ out_dec_ucounts: return ERR_PTR(ret); } +static int fanotify_group_init_error_pool(struct fsnotify_group *group) +{ + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + return 0; + + return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, + FANOTIFY_DEFAULT_FEE_POOL_SIZE, + sizeof(struct fanotify_error_event)); +} static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int type, @@ -1062,6 +1072,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); @@ -1072,13 +1083,26 @@ static int fanotify_add_mark(struct fsnotify_group *group, return PTR_ERR(fsn_mark); } } + + /* + * Error events are pre-allocated per group, only if strictly + * needed (i.e. FAN_FS_ERROR was requested). 
+ */ + if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + ret = fanotify_group_init_error_pool(group); + if (ret) + goto out; + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); if (added & ~fsnotify_conn_mask(fsn_mark->connector)) fsnotify_recalc_mask(fsn_mark->connector); + +out: mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); - return 0; + return ret; } static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 00dbaafbcf95..51ef2b079bfa 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -19,6 +19,7 @@ #include #include #include +#include /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -246,6 +247,7 @@ struct fsnotify_group { int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; + mempool_t error_events_pool; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; From 83e9acbe13dc1b767f91b5c1350f7a65689b26f6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:35 -0300 Subject: [PATCH 197/512] fanotify: Support enqueueing of error events Once an error event is triggered, enqueue it in the notification group, similarly to what is done for other events. FAN_FS_ERROR is not handled specially, since the memory is now handled by a preallocated mempool. For now, make the event unhashed. A future patch implements merging of this kind of event. Link: https://lore.kernel.org/r/20211025192746.66445-21-krisman@collabora.com Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 35 +++++++++++++++++++++++++++++++++++ fs/notify/fanotify/fanotify.h | 6 ++++++ 2 files changed, 41 insertions(+) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 01d68dfc74aa..1f195c95dfcd 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -574,6 +574,27 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, return &fne->fae; } +static struct fanotify_event *fanotify_alloc_error_event( + struct fsnotify_group *group, + __kernel_fsid_t *fsid, + const void *data, int data_type) +{ + struct fs_error_report *report = + fsnotify_data_error_report(data, data_type); + struct fanotify_error_event *fee; + + if (WARN_ON_ONCE(!report)) + return NULL; + + fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS); + if (!fee) + return NULL; + + fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR; + + return &fee->fae; +} + static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, @@ -641,6 +662,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); + } else if (fanotify_is_error_event(mask)) { + event = fanotify_alloc_error_event(group, fsid, data, + data_type); } else if (name_event && (file_name || child)) { event = fanotify_alloc_name_event(id, fsid, file_name, child, &hash, gfp); @@ -850,6 +874,14 @@ static void fanotify_free_name_event(struct fanotify_event *event) kfree(FANOTIFY_NE(event)); } +static void fanotify_free_error_event(struct fsnotify_group *group, + struct fanotify_event *event) +{ + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + 
mempool_free(fee, &group->fanotify_data.error_events_pool); +} + static void fanotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { @@ -873,6 +905,9 @@ static void fanotify_free_event(struct fsnotify_group *group, case FANOTIFY_EVENT_TYPE_OVERFLOW: kfree(event); break; + case FANOTIFY_EVENT_TYPE_FS_ERROR: + fanotify_free_error_event(group, event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index a577e87fac2b..ebef952481fa 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -298,6 +298,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event, fse); } +static inline bool fanotify_is_error_event(u32 mask) +{ + return mask & FAN_FS_ERROR; +} + static inline bool fanotify_event_has_path(struct fanotify_event *event) { return event->type == FANOTIFY_EVENT_TYPE_PATH || @@ -327,6 +332,7 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) static inline bool fanotify_is_hashed_event(u32 mask) { return !(fanotify_is_perm_event(mask) || + fanotify_is_error_event(mask) || fsnotify_is_overflow_event(mask)); } From 8a6ae64132fd27a944faed7bc38484827609eb76 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:36 -0300 Subject: [PATCH 198/512] fanotify: Support merging of error events Error events (FAN_FS_ERROR) against the same file system can be merged by simply iterating the error count. The hash is taken from the fsid, without considering the FH. This means that only the first error object is reported. Link: https://lore.kernel.org/r/20211025192746.66445-22-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 26 ++++++++++++++++++++++++-- fs/notify/fanotify/fanotify.h | 4 +++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 1f195c95dfcd..cedcb1546804 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -111,6 +111,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, return fanotify_info_equal(info1, info2); } +static bool fanotify_error_event_equal(struct fanotify_error_event *fee1, + struct fanotify_error_event *fee2) +{ + /* Error events against the same file system are always merged. 
*/ + if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid)) + return false; + + return true; +} + static bool fanotify_should_merge(struct fanotify_event *old, struct fanotify_event *new) { @@ -141,6 +151,9 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FID_NAME: return fanotify_name_event_equal(FANOTIFY_NE(old), FANOTIFY_NE(new)); + case FANOTIFY_EVENT_TYPE_FS_ERROR: + return fanotify_error_event_equal(FANOTIFY_EE(old), + FANOTIFY_EE(new)); default: WARN_ON_ONCE(1); } @@ -176,6 +189,10 @@ static int fanotify_merge(struct fsnotify_group *group, break; if (fanotify_should_merge(old, new)) { old->mask |= new->mask; + + if (fanotify_is_error_event(old->mask)) + FANOTIFY_EE(old)->err_count++; + return 1; } } @@ -577,7 +594,8 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, static struct fanotify_event *fanotify_alloc_error_event( struct fsnotify_group *group, __kernel_fsid_t *fsid, - const void *data, int data_type) + const void *data, int data_type, + unsigned int *hash) { struct fs_error_report *report = fsnotify_data_error_report(data, data_type); @@ -591,6 +609,10 @@ static struct fanotify_event *fanotify_alloc_error_event( return NULL; fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR; + fee->err_count = 1; + fee->fsid = *fsid; + + *hash ^= fanotify_hash_fsid(fsid); return &fee->fae; } @@ -664,7 +686,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, event = fanotify_alloc_perm_event(path, gfp); } else if (fanotify_is_error_event(mask)) { event = fanotify_alloc_error_event(group, fsid, data, - data_type); + data_type, &hash); } else if (name_event && (file_name || child)) { event = fanotify_alloc_name_event(id, fsid, file_name, child, &hash, gfp); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index ebef952481fa..2b032b79d5b0 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -199,6 +199,9 @@ FANOTIFY_NE(struct fanotify_event *event) struct fanotify_error_event { struct fanotify_event fae; + u32 err_count; /* Suppressed errors count */ + + __kernel_fsid_t fsid; /* FSID this error refers to. */ }; static inline struct fanotify_error_event * @@ -332,7 +335,6 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) static inline bool fanotify_is_hashed_event(u32 mask) { return !(fanotify_is_perm_event(mask) || - fanotify_is_error_event(mask) || fsnotify_is_overflow_event(mask)); } From 2c5069433a3adc01ff9c5673567961bb7f138074 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:37 -0300 Subject: [PATCH 199/512] fanotify: Wrap object_fh inline space in a creator macro fanotify_error_event would duplicate this sequence of declarations that already exist elsewhere with a slight different size. Create a helper macro to avoid code duplication. 
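For reference, what FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN) produces inside struct fanotify_fid_event is, illustratively, an anonymous struct holding the handle plus its inline buffer, which member access treats transparently:

struct {
        struct fanotify_fh object_fh;
        /* Space for object_fh.buf[] - access with fanotify_fh_buf() */
        unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN];
};
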
Link: https://lore.kernel.org/r/20211025192746.66445-23-krisman@collabora.com Suggested-by: Jan Kara Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2b032b79d5b0..3510d06654ed 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -171,12 +171,18 @@ static inline void fanotify_init_event(struct fanotify_event *event, event->pid = NULL; } +#define FANOTIFY_INLINE_FH(name, size) \ +struct { \ + struct fanotify_fh (name); \ + /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \ + unsigned char _inline_fh_buf[(size)]; \ +} + struct fanotify_fid_event { struct fanotify_event fae; __kernel_fsid_t fsid; - struct fanotify_fh object_fh; - /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */ - unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN]; + + FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN); }; static inline struct fanotify_fid_event * From 4bd5a5c8e6e5cd964e9738e6ef87f6c2cb453edf Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:38 -0300 Subject: [PATCH 200/512] fanotify: Add helpers to decide whether to report FID/DFID Now that there is an event that reports FID records even for a zeroed file handle, wrap the logic that deides whether to issue the records into helper functions. This shouldn't have any impact on the code, but simplifies further patches. Link: https://lore.kernel.org/r/20211025192746.66445-24-krisman@collabora.com Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 10 ++++++++++ fs/notify/fanotify/fanotify_user.c | 13 +++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 3510d06654ed..80af269eebb8 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -264,6 +264,16 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) return info ? 
fanotify_info_dir_fh_len(info) : 0; } +static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) +{ + return fanotify_event_object_fh_len(event) > 0; +} + +static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event) +{ + return fanotify_event_dir_fh_len(event) > 0; +} + struct fanotify_path_event { struct fanotify_event fae; struct path path; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2f4182b754b2..a9b5c36ee49e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -140,10 +140,9 @@ static size_t fanotify_event_len(unsigned int info_mode, return event_len; info = fanotify_event_info(event); - dir_fh_len = fanotify_event_dir_fh_len(event); - fh_len = fanotify_event_object_fh_len(event); - if (dir_fh_len) { + if (fanotify_event_has_dir_fh(event)) { + dir_fh_len = fanotify_event_dir_fh_len(event); event_len += fanotify_fid_info_len(dir_fh_len, info->name_len); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { @@ -157,8 +156,10 @@ static size_t fanotify_event_len(unsigned int info_mode, if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; - if (fh_len) + if (fanotify_event_has_object_fh(event)) { + fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); + } return event_len; } @@ -451,7 +452,7 @@ static int copy_info_records_to_user(struct fanotify_event *event, /* * Event info records order is as follows: dir fid + name, child fid. */ - if (fanotify_event_dir_fh_len(event)) { + if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; ret = copy_fid_info_to_user(fanotify_event_fsid(event), @@ -467,7 +468,7 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } - if (fanotify_event_object_fh_len(event)) { + if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; From 572c28f27a269f88e2d8d7b6b1507f114d637337 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:40 -0300 Subject: [PATCH 201/512] fanotify: WARN_ON against too large file handles struct fanotify_error_event, at least, is preallocated and isn't able to to handle arbitrarily large file handles. Future-proof the code by complaining loudly if a handle larger than MAX_HANDLE_SZ is ever found. Link: https://lore.kernel.org/r/20211025192746.66445-26-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index cedcb1546804..45df610debbe 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -360,13 +360,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, static int fanotify_encode_fh_len(struct inode *inode) { int dwords = 0; + int fh_len; if (!inode) return 0; exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); + fh_len = dwords << 2; - return dwords << 2; + /* + * struct fanotify_error_event might be preallocated and is + * limited to MAX_HANDLE_SZ. This should never happen, but + * safeguard by forcing an invalid file handle. 
+ */ + if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ)) + return 0; + + return fh_len; } /* From 936d6a38be39177495af38497bf8da1c6128fa1b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:41 -0300 Subject: [PATCH 202/512] fanotify: Report fid info for file related file system errors Plumb the pieces to add a FID report to error records. Since all error event memory must be pre-allocated, we pre-allocate the maximum file handle size possible, such that it should always fit. For errors that don't expose a file handle, report it with an invalid FID. Internally we use zero-length FILEID_ROOT file handle for passing the information (which we report as zero-length FILEID_INVALID file handle to userspace) so we update the handle reporting code to deal with this case correctly. Link: https://lore.kernel.org/r/20211025192746.66445-27-krisman@collabora.com Link: https://lore.kernel.org/r/20211025192746.66445-25-krisman@collabora.com Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara [Folded two patches into 2 to make series bisectable] Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 11 +++++++++++ fs/notify/fanotify/fanotify.h | 9 +++++++++ fs/notify/fanotify/fanotify_user.c | 8 +++++--- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 45df610debbe..465f07e70e6d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -609,7 +609,9 @@ static struct fanotify_event *fanotify_alloc_error_event( { struct fs_error_report *report = fsnotify_data_error_report(data, data_type); + struct inode *inode; struct fanotify_error_event *fee; + int fh_len; if (WARN_ON_ONCE(!report)) return NULL; @@ -622,6 +624,15 @@ static struct fanotify_event *fanotify_alloc_error_event( fee->err_count = 1; fee->fsid = *fsid; + inode = report->inode; + fh_len = fanotify_encode_fh_len(inode); + + /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */ + if (!fh_len && inode) + inode = NULL; + + fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0); + *hash ^= fanotify_hash_fsid(fsid); return &fee->fae; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80af269eebb8..edd7587adcc5 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -208,6 +208,8 @@ struct fanotify_error_event { u32 err_count; /* Suppressed errors count */ __kernel_fsid_t fsid; /* FSID this error refers to. */ + + FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ); }; static inline struct fanotify_error_event * @@ -222,6 +224,8 @@ static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) return &FANOTIFY_FE(event)->fsid; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return &FANOTIFY_NE(event)->fsid; + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return &FANOTIFY_EE(event)->fsid; else return NULL; } @@ -233,6 +237,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh( return &FANOTIFY_FE(event)->object_fh; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return fanotify_info_file_fh(&FANOTIFY_NE(event)->info); + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return &FANOTIFY_EE(event)->object_fh; else return NULL; } @@ -266,6 +272,9 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) { + /* For error events, even zeroed fh are reported. 
*/ + if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return true; return fanotify_event_object_fh_len(event) > 0; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a9b5c36ee49e..ff4a7373f7a5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -339,9 +339,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); - if (!fh_len) - return 0; - if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; @@ -376,6 +373,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, handle.handle_type = fh->type; handle.handle_bytes = fh_len; + + /* Mangle handle_type for bad file_handle */ + if (!fh_len) + handle.handle_type = FILEID_INVALID; + if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; From 130a3c742107acff985541c28360c8b40203559c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:42 -0300 Subject: [PATCH 203/512] fanotify: Emit generic error info for error event The error info is a record sent to users on FAN_FS_ERROR events documenting the type of error. It also carries an error count, documenting how many errors were observed since the last reporting. Link: https://lore.kernel.org/r/20211025192746.66445-28-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 1 + fs/notify/fanotify/fanotify.h | 1 + fs/notify/fanotify/fanotify_user.c | 36 ++++++++++++++++++++++++++++++ include/uapi/linux/fanotify.h | 7 ++++++ 4 files changed, 45 insertions(+) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 465f07e70e6d..af61425e6e3b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -621,6 +621,7 @@ static struct fanotify_event *fanotify_alloc_error_event( return NULL; fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR; + fee->error = report->error; fee->err_count = 1; fee->fsid = *fsid; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index edd7587adcc5..d25f500bf7e7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -205,6 +205,7 @@ FANOTIFY_NE(struct fanotify_event *event) struct fanotify_error_event { struct fanotify_event fae; + s32 error; /* Error reported by the Filesystem. */ u32 err_count; /* Suppressed errors count */ __kernel_fsid_t fsid; /* FSID this error refers to. 
*/ diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ff4a7373f7a5..fb817028d0b6 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -115,6 +115,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) #define FANOTIFY_PIDFD_INFO_HDR_LEN \ sizeof(struct fanotify_event_info_pidfd) +#define FANOTIFY_ERROR_INFO_LEN \ + (sizeof(struct fanotify_event_info_error)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -139,6 +141,9 @@ static size_t fanotify_event_len(unsigned int info_mode, if (!info_mode) return event_len; + if (fanotify_is_error_event(event->mask)) + event_len += FANOTIFY_ERROR_INFO_LEN; + info = fanotify_event_info(event); if (fanotify_event_has_dir_fh(event)) { @@ -324,6 +329,28 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_error_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_error info; + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; + info.hdr.pad = 0; + info.hdr.len = FANOTIFY_ERROR_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.error = fee->error; + info.error_count = fee->err_count; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, int info_type, const char *name, size_t name_len, @@ -530,6 +557,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_error_event(event->mask)) { + ret = copy_error_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 2990731ddc8b..bd1932c2074d 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -126,6 +126,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 #define FAN_EVENT_INFO_TYPE_DFID 3 #define FAN_EVENT_INFO_TYPE_PIDFD 4 +#define FAN_EVENT_INFO_TYPE_ERROR 5 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -160,6 +161,12 @@ struct fanotify_event_info_pidfd { __s32 pidfd; }; +struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; +}; + struct fanotify_response { __s32 fd; __u32 response; From 9709bd548f11a092d124698118013f66e1740f9b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:43 -0300 Subject: [PATCH 204/512] fanotify: Allow users to request FAN_FS_ERROR events Wire up the FAN_FS_ERROR event in the fanotify_mark syscall, allowing user space to request the monitoring of FAN_FS_ERROR events. These events are limited to filesystem marks, so check it is the case in the syscall handler. 
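For reference, requesting these events from user space then looks roughly like
the snippet below (a minimal sketch in the spirit of the
samples/fanotify/fs-monitor.c example added later in this series; error
handling is omitted and "/mnt" is only a placeholder mount point):

	#include <fcntl.h>
	#include <sys/fanotify.h>

	/*
	 * FAN_FS_ERROR may need to be defined by hand when building against
	 * uapi headers older than this series, as the sample program does.
	 */
	int main(void)
	{
		int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);

		/* FAN_FS_ERROR is only accepted together with FAN_MARK_FILESYSTEM */
		fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
			      FAN_FS_ERROR, AT_FDCWD, "/mnt");

		return fd < 0 ? 1 : 0;
	}
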
Link: https://lore.kernel.org/r/20211025192746.66445-29-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 2 +- fs/notify/fanotify/fanotify_user.c | 4 ++++ include/linux/fanotify.h | 6 +++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index af61425e6e3b..b6091775aa6e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -822,7 +822,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type, dir); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index fb817028d0b6..559bc1e9926d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1520,6 +1520,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + if (mask & FAN_FS_ERROR && + mark_type != FAN_MARK_FILESYSTEM) + goto fput_and_out; + /* * Events that do not carry enough information to report * event->fd require a group that supports reporting fid. Those diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 52d464802d99..616af2ea20f3 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -91,9 +91,13 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) +/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ +#define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ - FANOTIFY_INODE_EVENTS) + FANOTIFY_INODE_EVENTS | \ + FANOTIFY_ERROR_EVENTS) /* Events that require a permission response from user */ #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ From 9a089b21f79b47eed240d4da7ea0d049de7c9b4d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:44 -0300 Subject: [PATCH 205/512] ext4: Send notifications on error Send a FS_ERROR message via fsnotify to a userspace monitoring tool whenever a ext4 error condition is triggered. This follows the existing error conditions in ext4, so it is hooked to the ext4_error* functions. Link: https://lore.kernel.org/r/20211025192746.66445-30-krisman@collabora.com Signed-off-by: Gabriel Krisman Bertazi Acked-by: Theodore Ts'o Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Jan Kara --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88d5d274a868..1a766c68a55e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -759,6 +760,8 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } + fsnotify_sb_error(sb, NULL, error ? 
error : EFSCORRUPTED); + ext4_handle_error(sb, force_ro, error, 0, block, function, line); } @@ -789,6 +792,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } + fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); + ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } @@ -827,6 +832,8 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } + fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); + ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } @@ -894,6 +901,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } + fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } From 5451093081db6ca1a708d149e11cfd219800bc4c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:45 -0300 Subject: [PATCH 206/512] samples: Add fs error monitoring example Introduce an example of a FAN_FS_ERROR fanotify user to track filesystem errors. Link: https://lore.kernel.org/r/20211025192746.66445-31-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- samples/Kconfig | 9 +++ samples/Makefile | 1 + samples/fanotify/Makefile | 5 ++ samples/fanotify/fs-monitor.c | 142 ++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 samples/fanotify/Makefile create mode 100644 samples/fanotify/fs-monitor.c diff --git a/samples/Kconfig b/samples/Kconfig index b0503ef058d3..88353b8eac0b 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -120,6 +120,15 @@ config SAMPLE_CONNECTOR with it. See also Documentation/driver-api/connector.rst +config SAMPLE_FANOTIFY_ERROR + bool "Build fanotify error monitoring sample" + depends on FANOTIFY + help + When enabled, this builds an example code that uses the + FAN_FS_ERROR fanotify mechanism to monitor filesystem + errors. + See also Documentation/admin-guide/filesystem-monitoring.rst. + config SAMPLE_HIDRAW bool "hidraw sample" depends on CC_CAN_LINK && HEADERS_INSTALL diff --git a/samples/Makefile b/samples/Makefile index 087e0988ccc5..931a81847c48 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -5,6 +5,7 @@ subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ +obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/ subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/ obj-$(CONFIG_SAMPLE_KDB) += kdb/ diff --git a/samples/fanotify/Makefile b/samples/fanotify/Makefile new file mode 100644 index 000000000000..e20db1bdde3b --- /dev/null +++ b/samples/fanotify/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += fs-monitor + +userccflags += -I usr/include -Wall + diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c new file mode 100644 index 000000000000..a0e44cd31e6f --- /dev/null +++ b/samples/fanotify/fs-monitor.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021, Collabora Ltd. 
+ */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef FAN_FS_ERROR +#define FAN_FS_ERROR 0x00008000 +#define FAN_EVENT_INFO_TYPE_ERROR 5 + +struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; +}; +#endif + +#ifndef FILEID_INO32_GEN +#define FILEID_INO32_GEN 1 +#endif + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif + +static void print_fh(struct file_handle *fh) +{ + int i; + uint32_t *h = (uint32_t *) fh->f_handle; + + printf("\tfh: "); + for (i = 0; i < fh->handle_bytes; i++) + printf("%hhx", fh->f_handle[i]); + printf("\n"); + + printf("\tdecoded fh: "); + if (fh->handle_type == FILEID_INO32_GEN) + printf("inode=%u gen=%u\n", h[0], h[1]); + else if (fh->handle_type == FILEID_INVALID && !fh->handle_bytes) + printf("Type %d (Superblock error)\n", fh->handle_type); + else + printf("Type %d (Unknown)\n", fh->handle_type); + +} + +static void handle_notifications(char *buffer, int len) +{ + struct fanotify_event_metadata *event = + (struct fanotify_event_metadata *) buffer; + struct fanotify_event_info_header *info; + struct fanotify_event_info_error *err; + struct fanotify_event_info_fid *fid; + int off; + + for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) { + + if (event->mask != FAN_FS_ERROR) { + printf("unexpected FAN MARK: %llx\n", event->mask); + goto next_event; + } + + if (event->fd != FAN_NOFD) { + printf("Unexpected fd (!= FAN_NOFD)\n"); + goto next_event; + } + + printf("FAN_FS_ERROR (len=%d)\n", event->event_len); + + for (off = sizeof(*event) ; off < event->event_len; + off += info->len) { + info = (struct fanotify_event_info_header *) + ((char *) event + off); + + switch (info->info_type) { + case FAN_EVENT_INFO_TYPE_ERROR: + err = (struct fanotify_event_info_error *) info; + + printf("\tGeneric Error Record: len=%d\n", + err->hdr.len); + printf("\terror: %d\n", err->error); + printf("\terror_count: %d\n", err->error_count); + break; + + case FAN_EVENT_INFO_TYPE_FID: + fid = (struct fanotify_event_info_fid *) info; + + printf("\tfsid: %x%x\n", + fid->fsid.val[0], fid->fsid.val[1]); + print_fh((struct file_handle *) &fid->handle); + break; + + default: + printf("\tUnknown info type=%d len=%d:\n", + info->info_type, info->len); + } + } +next_event: + printf("---\n\n"); + } +} + +int main(int argc, char **argv) +{ + int fd; + + char buffer[BUFSIZ]; + + if (argc < 2) { + printf("Missing path argument\n"); + return 1; + } + + fd = fanotify_init(FAN_CLASS_NOTIF|FAN_REPORT_FID, O_RDONLY); + if (fd < 0) + errx(1, "fanotify_init"); + + if (fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, + FAN_FS_ERROR, AT_FDCWD, argv[1])) { + errx(1, "fanotify_mark"); + } + + while (1) { + int n = read(fd, buffer, BUFSIZ); + + if (n < 0) + errx(1, "read"); + + handle_notifications(buffer, n); + } + + return 0; +} From c0baf9ac0b05d53dfe0436661dbdc5e43c01c5e0 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:46 -0300 Subject: [PATCH 207/512] docs: Document the FAN_FS_ERROR event Document the FAN_FS_ERROR event for user administrators and user space developers. 
Link: https://lore.kernel.org/r/20211025192746.66445-32-krisman@collabora.com Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Jan Kara Signed-off-by: Jan Kara --- .../admin-guide/filesystem-monitoring.rst | 74 +++++++++++++++++++ Documentation/admin-guide/index.rst | 1 + 2 files changed, 75 insertions(+) create mode 100644 Documentation/admin-guide/filesystem-monitoring.rst diff --git a/Documentation/admin-guide/filesystem-monitoring.rst b/Documentation/admin-guide/filesystem-monitoring.rst new file mode 100644 index 000000000000..5a3c84e60095 --- /dev/null +++ b/Documentation/admin-guide/filesystem-monitoring.rst @@ -0,0 +1,74 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================== +File system Monitoring with fanotify +==================================== + +File system Error Reporting +=========================== + +Fanotify supports the FAN_FS_ERROR event type for file system-wide error +reporting. It is meant to be used by file system health monitoring +daemons, which listen for these events and take actions (notify +sysadmin, start recovery) when a file system problem is detected. + +By design, a FAN_FS_ERROR notification exposes sufficient information +for a monitoring tool to know a problem in the file system has happened. +It doesn't necessarily provide a user space application with semantics +to verify an IO operation was successfully executed. That is out of +scope for this feature. Instead, it is only meant as a framework for +early file system problem detection and reporting recovery tools. + +When a file system operation fails, it is common for dozens of kernel +errors to cascade after the initial failure, hiding the original failure +log, which is usually the most useful debug data to troubleshoot the +problem. For this reason, FAN_FS_ERROR tries to report only the first +error that occurred for a file system since the last notification, and +it simply counts additional errors. This ensures that the most +important pieces of information are never lost. + +FAN_FS_ERROR requires the fanotify group to be setup with the +FAN_REPORT_FID flag. + +At the time of this writing, the only file system that emits FAN_FS_ERROR +notifications is Ext4. + +A FAN_FS_ERROR Notification has the following format:: + + [ Notification Metadata (Mandatory) ] + [ Generic Error Record (Mandatory) ] + [ FID record (Mandatory) ] + +The order of records is not guaranteed, and new records might be added +in the future. Therefore, applications must not rely on the order and +must be prepared to skip over unknown records. Please refer to +``samples/fanotify/fs-monitor.c`` for an example parser. + +Generic error record +-------------------- + +The generic error record provides enough information for a file system +agnostic tool to learn about a problem in the file system, without +providing any additional details about the problem. This record is +identified by ``struct fanotify_event_info_header.info_type`` being set +to FAN_EVENT_INFO_TYPE_ERROR. + + struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; + }; + +The `error` field identifies the type of error using errno values. +`error_count` tracks the number of errors that occurred and were +suppressed to preserve the original error information, since the last +notification. + +FID record +---------- + +The FID record can be used to uniquely identify the inode that triggered +the error through the combination of fsid and file handle. 
A file system +specific application can use that information to attempt a recovery +procedure. Errors that are not related to an inode are reported with an +empty file handle of type FILEID_INVALID. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index dc00afcabb95..1bedab498104 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -82,6 +82,7 @@ configure specific aspects of kernel behavior to your liking. edid efi-stub ext4 + filesystem-monitoring nfs/index gpio/index highuid From 81dedaf10c20959bdf5624f9783f408df26ba7a4 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 27 Oct 2021 22:34:41 +0800 Subject: [PATCH 208/512] fs: reiserfs: remove useless new_opts in reiserfs_remount Since the commit c3d98ea08291 ("VFS: Don't use save/replace_mount_options if not using generic_show_options") eliminates replace_mount_options in reiserfs_remount, but does not handle the allocated new_opts, it will cause memory leak in the reiserfs_remount. Because new_opts is useless in reiserfs_mount, so we fix this bug by removing the useless new_opts in reiserfs_remount. Fixes: c3d98ea08291 ("VFS: Don't use save/replace_mount_options if not using generic_show_options") Link: https://lore.kernel.org/r/20211027143445.4156459-1-mudongliangabcd@gmail.com Signed-off-by: Dongliang Mu Signed-off-by: Jan Kara --- fs/reiserfs/super.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 58481f8d63d5..f7b05c6b3dcf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1437,7 +1437,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) unsigned long safe_mask = 0; unsigned int commit_max_age = (unsigned int)-1; struct reiserfs_journal *journal = SB_JOURNAL(s); - char *new_opts; int err; char *qf_names[REISERFS_MAXQUOTAS]; unsigned int qfmt = 0; @@ -1445,10 +1444,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif - new_opts = kstrdup(arg, GFP_KERNEL); - if (arg && !new_opts) - return -ENOMEM; - sync_filesystem(s); reiserfs_write_lock(s); @@ -1599,7 +1594,6 @@ out_ok_unlocked: out_err_unlock: reiserfs_write_unlock(s); out_err: - kfree(new_opts); return err; } From fd1ae23b495b3a8a9975e49705b7678f6e2ab67b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 13 Oct 2021 01:41:36 +0000 Subject: [PATCH 209/512] PCI: Prefer 'unsigned int' over bare 'unsigned' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bare "unsigned" type implicitly means "unsigned int", but the preferred coding style is to use the complete type name. Update the bare use of "unsigned" to the preferred "unsigned int". No change to functionality intended. See a1ce18e4f941 ("checkpatch: warn on bare unsigned or signed declarations without int"). 
Link: https://lore.kernel.org/r/20211013014136.1117543-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-thunder-ecam.c | 4 ++-- drivers/pci/msi.c | 3 ++- drivers/pci/pci.c | 5 +++-- drivers/pci/probe.c | 7 ++++--- drivers/pci/quirks.c | 12 ++++++------ drivers/pci/rom.c | 2 +- drivers/pci/setup-bus.c | 2 +- 7 files changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index ffd84656544f..e9d5ca245f5e 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -17,7 +17,7 @@ static void set_val(u32 v, int where, int size, u32 *val) { int shift = (where & 3) * 8; - pr_debug("set_val %04x: %08x\n", (unsigned)(where & ~3), v); + pr_debug("set_val %04x: %08x\n", (unsigned int)(where & ~3), v); v >>= shift; if (size == 1) v &= 0xff; @@ -187,7 +187,7 @@ static int thunder_ecam_config_read(struct pci_bus *bus, unsigned int devfn, pr_debug("%04x:%04x - Fix pass#: %08x, where: %03x, devfn: %03x\n", vendor_device & 0xffff, vendor_device >> 16, class_rev, - (unsigned) where, devfn); + (unsigned int)where, devfn); /* Check for non type-00 header */ if (cfg_type == 0) { diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 0099a00af361..bdc6ba7f39f0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -579,7 +579,8 @@ err: return ret; } -static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) +static void __iomem *msix_map_region(struct pci_dev *dev, + unsigned int nr_entries) { resource_size_t phys_addr; u32 table_offset; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d72c0281eb54..a3ec83f554c5 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6324,11 +6324,12 @@ EXPORT_SYMBOL_GPL(pci_pr3_present); * cannot be left as a userspace activity). DMA aliases should therefore * be configured via quirks, such as the PCI fixup header quirk. 
*/ -void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from, unsigned nr_devfns) +void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from, + unsigned int nr_devfns) { int devfn_to; - nr_devfns = min(nr_devfns, (unsigned) MAX_NR_DEVFNS - devfn_from); + nr_devfns = min(nr_devfns, (unsigned int)MAX_NR_DEVFNS - devfn_from); devfn_to = devfn_from + nr_devfns - 1; if (!dev->dma_alias_mask) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d9fc02a71baa..51c0a33640e6 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2550,11 +2550,12 @@ struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn) } EXPORT_SYMBOL(pci_scan_single_device); -static unsigned next_fn(struct pci_bus *bus, struct pci_dev *dev, unsigned fn) +static unsigned int next_fn(struct pci_bus *bus, struct pci_dev *dev, + unsigned int fn) { int pos; u16 cap = 0; - unsigned next_fn; + unsigned int next_fn; if (pci_ari_enabled(bus)) { if (!dev) @@ -2613,7 +2614,7 @@ static int only_one_child(struct pci_bus *bus) */ int pci_scan_slot(struct pci_bus *bus, int devfn) { - unsigned fn, nr = 0; + unsigned int fn, nr = 0; struct pci_dev *dev; if (only_one_child(bus) && (devfn > 0)) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 881c5d7c3d02..058bc9bfa296 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -501,7 +501,7 @@ static void quirk_s3_64M(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_868, quirk_s3_64M); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_968, quirk_s3_64M); -static void quirk_io(struct pci_dev *dev, int pos, unsigned size, +static void quirk_io(struct pci_dev *dev, int pos, unsigned int size, const char *name) { u32 region; @@ -552,7 +552,7 @@ static void quirk_cs5536_vsa(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, quirk_cs5536_vsa); static void quirk_io_region(struct pci_dev *dev, int port, - unsigned size, int nr, const char *name) + unsigned int size, int nr, const char *name) { u16 region; struct pci_bus_region bus_region; @@ -666,7 +666,7 @@ static void piix4_io_quirk(struct pci_dev *dev, const char *name, unsigned int p base = devres & 0xffff; size = 16; for (;;) { - unsigned bit = size >> 1; + unsigned int bit = size >> 1; if ((bit & mask) == bit) break; size = bit; @@ -692,7 +692,7 @@ static void piix4_mem_quirk(struct pci_dev *dev, const char *name, unsigned int mask = (devres & 0x3f) << 16; size = 128 << 16; for (;;) { - unsigned bit = size >> 1; + unsigned int bit = size >> 1; if ((bit & mask) == bit) break; size = bit; @@ -806,7 +806,7 @@ static void ich6_lpc_acpi_gpio(struct pci_dev *dev) "ICH6 GPIO"); } -static void ich6_lpc_generic_decode(struct pci_dev *dev, unsigned reg, +static void ich6_lpc_generic_decode(struct pci_dev *dev, unsigned int reg, const char *name, int dynsize) { u32 val; @@ -850,7 +850,7 @@ static void quirk_ich6_lpc(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, quirk_ich6_lpc); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, quirk_ich6_lpc); -static void ich7_lpc_generic_decode(struct pci_dev *dev, unsigned reg, +static void ich7_lpc_generic_decode(struct pci_dev *dev, unsigned int reg, const char *name) { u32 val; diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index 8fc9a4e911e3..e18d3a4383ba 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -85,7 +85,7 @@ static size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, { void __iomem 
*image; int last_image; - unsigned length; + unsigned int length; image = rom; do { diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 2ce636937c6e..547396ec50b5 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1525,7 +1525,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus, { struct pci_dev *dev = bus->self; struct resource *r; - unsigned old_flags = 0; + unsigned int old_flags = 0; struct resource *b_res; int idx = 1; From fb2099960d46c486c552807a216aae33819155c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 27 Oct 2021 10:50:41 +0000 Subject: [PATCH 210/512] MAINTAINERS: Update PCI subsystem information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the following information related to the PCI subsystem which includes the PCI drivers, PCI native host bridge and endpoint drivers, and the PCI endpoint sub-system: - Sort fields as per preferred order - Sort files in the alphabetical order - Update old Patchwork URLs - Update Git repository for the PCI endpoint subsystem - Add Bugzilla link - Add link to the official IRC channel - Add files "drivers/pci/pci-bridge-emul.{c,h}" to the right section so that proper ownership is returned for both files from the get_maintainer.pl script Link: https://lore.kernel.org/r/20211027105041.24087-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- MAINTAINERS | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ca6d6fde85cf..3ec2fdb015c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14440,9 +14440,12 @@ M: Lorenzo Pieralisi R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org S: Supported +Q: https://patchwork.kernel.org/project/linux-pci/list/ +B: https://bugzilla.kernel.org +C: irc://irc.oftc.net/linux-pci +T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git F: Documentation/PCI/endpoint/* F: Documentation/misc-devices/pci-endpoint-test.rst -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kishon/pci-endpoint.git F: drivers/misc/pci_endpoint_test.c F: drivers/pci/endpoint/ F: tools/pci/ @@ -14488,15 +14491,21 @@ R: Rob Herring R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org S: Supported -Q: http://patchwork.ozlabs.org/project/linux-pci/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ +Q: https://patchwork.kernel.org/project/linux-pci/list/ +B: https://bugzilla.kernel.org +C: irc://irc.oftc.net/linux-pci +T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git F: drivers/pci/controller/ +F: drivers/pci/pci-bridge-emul.c +F: drivers/pci/pci-bridge-emul.h PCI SUBSYSTEM M: Bjorn Helgaas L: linux-pci@vger.kernel.org S: Supported -Q: http://patchwork.ozlabs.org/project/linux-pci/list/ +Q: https://patchwork.kernel.org/project/linux-pci/list/ +B: https://bugzilla.kernel.org +C: irc://irc.oftc.net/linux-pci T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git F: Documentation/PCI/ F: Documentation/devicetree/bindings/pci/ From 7a41ae80bdcb17e14dd7d83239b8a0cf368f18be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 28 Oct 2021 20:56:53 +0200 Subject: [PATCH 211/512] PCI: pci-bridge-emul: Fix emulation of W1C bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pci_bridge_emul_conf_write() function correctly clears W1C bits in cfgspace cache, but it does not inform the 
underlying implementation about the clear request: the .write_op() method is given the value with these bits cleared. This is wrong if the .write_op() needs to know which bits were requested to be cleared. Fix the value to be passed into the .write_op() method to have requested W1C bits set, so that it can clear them. Both pci-bridge-emul users (mvebu and aardvark) are compatible with this change. Link: https://lore.kernel.org/r/20211028185659.20329-2-kabel@kernel.org Fixes: 23a5fba4d941 ("PCI: Introduce PCI bridge emulated config space common logic") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Cc: Russell King --- drivers/pci/pci-bridge-emul.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index fdaf86a888b7..db97cddfc85e 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -431,8 +431,21 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, /* Clear the W1C bits */ new &= ~((value << shift) & (behavior[reg / 4].w1c & mask)); + /* Save the new value with the cleared W1C bits into the cfgspace */ cfgspace[reg / 4] = cpu_to_le32(new); + /* + * Clear the W1C bits not specified by the write mask, so that the + * write_op() does not clear them. + */ + new &= ~(behavior[reg / 4].w1c & ~mask); + + /* + * Set the W1C bits specified by the write mask, so that write_op() + * knows about that they are to be cleared. + */ + new |= (value << shift) & (behavior[reg / 4].w1c & mask); + if (write_op) write_op(bridge, reg, old, new, mask); From e4313be1599d397625c14fb7826996813622decf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 28 Oct 2021 20:56:54 +0200 Subject: [PATCH 212/512] PCI: aardvark: Fix return value of MSI domain .alloc() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSI domain callback .alloc() (implemented by advk_msi_irq_domain_alloc() function) should return zero on success, since non-zero value indicates failure. When the driver was converted to generic MSI API in commit f21a8b1b6837 ("PCI: aardvark: Move to MSI handling using generic MSI support"), it was converted so that it returns hwirq number. Fix this. Link: https://lore.kernel.org/r/20211028185659.20329-3-kabel@kernel.org Fixes: f21a8b1b6837 ("PCI: aardvark: Move to MSI handling using generic MSI support") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 10476c00b312..b45ff2911c80 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -1138,7 +1138,7 @@ static int advk_msi_irq_domain_alloc(struct irq_domain *domain, domain->host_data, handle_simple_irq, NULL, NULL); - return hwirq; + return 0; } static void advk_msi_irq_domain_free(struct irq_domain *domain, From 95997723b6402cd6c53e0f9e7ac640ec64eaaff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Thu, 28 Oct 2021 20:56:55 +0200 Subject: [PATCH 213/512] PCI: aardvark: Read all 16-bits from PCIE_MSI_PAYLOAD_REG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCIE_MSI_PAYLOAD_REG contains 16-bit MSI number, not only lower 8 bits. 
Fix reading content of this register and add a comment describing the access to this register. Link: https://lore.kernel.org/r/20211028185659.20329-4-kabel@kernel.org Fixes: 8c39d710363c ("PCI: aardvark: Add Aardvark PCI host controller driver") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index b45ff2911c80..389ebba1dd9b 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -119,6 +119,7 @@ #define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58) #define PCIE_MSI_MASK_REG (CONTROL_BASE_ADDR + 0x5C) #define PCIE_MSI_PAYLOAD_REG (CONTROL_BASE_ADDR + 0x9C) +#define PCIE_MSI_DATA_MASK GENMASK(15, 0) /* PCIe window configuration */ #define OB_WIN_BASE_ADDR 0x4c00 @@ -1319,8 +1320,12 @@ static void advk_pcie_handle_msi(struct advk_pcie *pcie) if (!(BIT(msi_idx) & msi_status)) continue; + /* + * msi_idx contains bits [4:0] of the msi_data and msi_data + * contains 16bit MSI interrupt number + */ advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG); - msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF; + msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & PCIE_MSI_DATA_MASK; generic_handle_irq(msi_data); } From 771153fc884f566a89af2d30033b7f3bc6e24e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 28 Oct 2021 20:56:56 +0200 Subject: [PATCH 214/512] PCI: aardvark: Fix support for bus mastering and PCI_COMMAND on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From very vague, ambiguous and incomplete information from Marvell we deduced that the 32-bit Aardvark register at address 0x4 (PCIE_CORE_CMD_STATUS_REG), which is not documented for Root Complex mode in the Functional Specification (only for Endpoint mode), controls two 16-bit PCIe registers: Command Register and Status Registers of PCIe Root Port. This means that bit 2 controls bus mastering and forwarding of memory and I/O requests in the upstream direction. According to PCI specifications bits [0:2] of Command Register, this should be by default disabled on reset. So explicitly disable these bits at early setup of the Aardvark driver. Remove code which unconditionally enables all 3 bits and let kernel code (via pci_set_master() function) to handle bus mastering of Root PCIe Bridge via emulated PCI_COMMAND on emulated bridge. 
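For context, a rough sketch of how bus mastering on the Root Port is enabled
after this change (not code from this patch; "bridge" stands for the Root
Port's pci_dev, and the PCI_COMMAND handler mentioned in the comment is the
.write_base hook added below):

	u16 cmd;

	/* This is essentially what pci_set_master() does for the bridge. */
	pci_read_config_word(bridge, PCI_COMMAND, &cmd);
	pci_write_config_word(bridge, PCI_COMMAND, cmd | PCI_COMMAND_MASTER);

	/*
	 * The config write is routed through the emulated bridge
	 * (pci_bridge_emul_conf_write()), which hands PCI_COMMAND to the
	 * driver and finally lands in PCIE_CORE_CMD_STATUS_REG.
	 */
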
Link: https://lore.kernel.org/r/20211028185659.20329-5-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # b2a56469d550 ("PCI: aardvark: Add FIXME comment for PCIE_CORE_CMD_STATUS_REG access") --- drivers/pci/controller/pci-aardvark.c | 54 +++++++++++++++++++-------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 389ebba1dd9b..d7db03da4d1c 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -31,9 +31,6 @@ /* PCIe core registers */ #define PCIE_CORE_DEV_ID_REG 0x0 #define PCIE_CORE_CMD_STATUS_REG 0x4 -#define PCIE_CORE_CMD_IO_ACCESS_EN BIT(0) -#define PCIE_CORE_CMD_MEM_ACCESS_EN BIT(1) -#define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2) #define PCIE_CORE_DEV_REV_REG 0x8 #define PCIE_CORE_PCIEXP_CAP 0xc0 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 @@ -514,6 +511,11 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = (PCI_VENDOR_ID_MARVELL << 16) | PCI_VENDOR_ID_MARVELL; advk_writel(pcie, reg, VENDOR_ID_REG); + /* Disable Root Bridge I/O space, memory space and bus mastering */ + reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + reg &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); + /* Set Advanced Error Capabilities and Control PF0 register */ reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX | PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN | @@ -612,19 +614,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_pcie_disable_ob_win(pcie, i); advk_pcie_train_link(pcie); - - /* - * FIXME: The following register update is suspicious. This register is - * applicable only when the PCI controller is configured for Endpoint - * mode, not as a Root Complex. But apparently when this code is - * removed, some cards stop working. This should be investigated and - * a comment explaining this should be put here. 
- */ - reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); - reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | - PCIE_CORE_CMD_IO_ACCESS_EN | - PCIE_CORE_CMD_MEM_IO_REQ_EN; - advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); } static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u32 *val) @@ -753,6 +742,37 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie) return -ETIMEDOUT; } +static pci_bridge_emul_read_status_t +advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge, + int reg, u32 *value) +{ + struct advk_pcie *pcie = bridge->data; + + switch (reg) { + case PCI_COMMAND: + *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + return PCI_BRIDGE_EMUL_HANDLED; + + default: + return PCI_BRIDGE_EMUL_NOT_HANDLED; + } +} + +static void +advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge, + int reg, u32 old, u32 new, u32 mask) +{ + struct advk_pcie *pcie = bridge->data; + + switch (reg) { + case PCI_COMMAND: + advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG); + break; + + default: + break; + } +} static pci_bridge_emul_read_status_t advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, @@ -854,6 +874,8 @@ advk_pci_bridge_emul_pcie_conf_write(struct pci_bridge_emul *bridge, } static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { + .read_base = advk_pci_bridge_emul_base_conf_read, + .write_base = advk_pci_bridge_emul_base_conf_write, .read_pcie = advk_pci_bridge_emul_pcie_conf_read, .write_pcie = advk_pci_bridge_emul_pcie_conf_write, }; From 84e1b4045dc887b78bdc87d92927093dc3a465aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 28 Oct 2021 20:56:57 +0200 Subject: [PATCH 215/512] PCI: aardvark: Set PCI Bridge Class Code to PCI Bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aardvark controller has something like config space of a Root Port available at offset 0x0 of internal registers - these registers are used for implementation of the emulated bridge. The default value of Class Code of this bridge corresponds to a RAID Mass storage controller, though. (This is probably intended for when the controller is used as Endpoint.) Change the Class Code to correspond to a PCI Bridge. Add comment explaining this change. Link: https://lore.kernel.org/r/20211028185659.20329-6-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index d7db03da4d1c..ddca45415c65 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -511,6 +511,26 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = (PCI_VENDOR_ID_MARVELL << 16) | PCI_VENDOR_ID_MARVELL; advk_writel(pcie, reg, VENDOR_ID_REG); + /* + * Change Class Code of PCI Bridge device to PCI Bridge (0x600400), + * because the default value is Mass storage controller (0x010400). + * + * Note that this Aardvark PCI Bridge does not have compliant Type 1 + * Configuration Space and it even cannot be accessed via Aardvark's + * PCI config space access method. Something like config space is + * available in internal Aardvark registers starting at offset 0x0 + * and is reported as Type 0. 
In range 0x10 - 0x34 it has totally + * different registers. + * + * Therefore driver uses emulation of PCI Bridge which emulates + * access to configuration space via internal Aardvark registers or + * emulated configuration buffer. + */ + reg = advk_readl(pcie, PCIE_CORE_DEV_REV_REG); + reg &= ~0xffffff00; + reg |= (PCI_CLASS_BRIDGE_PCI << 8) << 8; + advk_writel(pcie, reg, PCIE_CORE_DEV_REV_REG); + /* Disable Root Bridge I/O space, memory space and bus mastering */ reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); reg &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); From bc4fac42e5f8460af09c0a7f2f1915be09e20c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 28 Oct 2021 20:56:58 +0200 Subject: [PATCH 216/512] PCI: aardvark: Fix support for PCI_BRIDGE_CTL_BUS_RESET on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aardvark supports PCIe Hot Reset via PCIE_CORE_CTRL1_REG. Use it for implementing PCI_BRIDGE_CTL_BUS_RESET bit of PCI_BRIDGE_CONTROL register on emulated bridge. With this, the function pci_reset_secondary_bus() starts working and can reset connected PCIe card. Custom userspace script [1] which uses setpci can trigger PCIe Hot Reset and reset the card manually. [1] https://alexforencich.com/wiki/en/pcie/hot-reset-linux Link: https://lore.kernel.org/r/20211028185659.20329-7-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index ddca45415c65..c3b725afa11f 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -773,6 +773,22 @@ advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge, *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); return PCI_BRIDGE_EMUL_HANDLED; + case PCI_INTERRUPT_LINE: { + /* + * From the whole 32bit register we support reading from HW only + * one bit: PCI_BRIDGE_CTL_BUS_RESET. + * Other bits are retrieved only from emulated config buffer. + */ + __le32 *cfgspace = (__le32 *)&bridge->conf; + u32 val = le32_to_cpu(cfgspace[PCI_INTERRUPT_LINE / 4]); + if (advk_readl(pcie, PCIE_CORE_CTRL1_REG) & HOT_RESET_GEN) + val |= PCI_BRIDGE_CTL_BUS_RESET << 16; + else + val &= ~(PCI_BRIDGE_CTL_BUS_RESET << 16); + *value = val; + return PCI_BRIDGE_EMUL_HANDLED; + } + default: return PCI_BRIDGE_EMUL_NOT_HANDLED; } @@ -789,6 +805,17 @@ advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge, advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG); break; + case PCI_INTERRUPT_LINE: + if (mask & (PCI_BRIDGE_CTL_BUS_RESET << 16)) { + u32 val = advk_readl(pcie, PCIE_CORE_CTRL1_REG); + if (new & (PCI_BRIDGE_CTL_BUS_RESET << 16)) + val |= HOT_RESET_GEN; + else + val &= ~HOT_RESET_GEN; + advk_writel(pcie, val, PCIE_CORE_CTRL1_REG); + } + break; + default: break; } From 239edf686c14a9ff926dec2f350289ed7adfefe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 28 Oct 2021 20:56:59 +0200 Subject: [PATCH 217/512] PCI: aardvark: Fix support for PCI_ROM_ADDRESS1 on emulated bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This register is exported at address offset 0x30. 
Link: https://lore.kernel.org/r/20211028185659.20329-8-kabel@kernel.org Fixes: 8a3ebd8de328 ("PCI: aardvark: Implement emulated root PCI bridge config space") Signed-off-by: Pali Rohár Signed-off-by: Marek Behún Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-aardvark.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index c3b725afa11f..c5300d49807a 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -32,6 +32,7 @@ #define PCIE_CORE_DEV_ID_REG 0x0 #define PCIE_CORE_CMD_STATUS_REG 0x4 #define PCIE_CORE_DEV_REV_REG 0x8 +#define PCIE_CORE_EXP_ROM_BAR_REG 0x30 #define PCIE_CORE_PCIEXP_CAP 0xc0 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) @@ -773,6 +774,10 @@ advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge, *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); return PCI_BRIDGE_EMUL_HANDLED; + case PCI_ROM_ADDRESS1: + *value = advk_readl(pcie, PCIE_CORE_EXP_ROM_BAR_REG); + return PCI_BRIDGE_EMUL_HANDLED; + case PCI_INTERRUPT_LINE: { /* * From the whole 32bit register we support reading from HW only @@ -805,6 +810,10 @@ advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge, advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG); break; + case PCI_ROM_ADDRESS1: + advk_writel(pcie, new, PCIE_CORE_EXP_ROM_BAR_REG); + break; + case PCI_INTERRUPT_LINE: if (mask & (PCI_BRIDGE_CTL_BUS_RESET << 16)) { u32 val = advk_readl(pcie, PCIE_CORE_CTRL1_REG); From 8fc70b3a142f97f7859bf052151df896933d2586 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 28 Oct 2021 15:56:28 -0300 Subject: [PATCH 218/512] samples: Make fs-monitor depend on libc and headers Prevent build errors when headers or libc are not available, such as on kernel build bots, like the below: samples/fanotify/fs-monitor.c:7:10: fatal error: errno.h: No such file or directory 7 | #include | ^~~~~~~~~ Link: https://lore.kernel.org/r/87fsslasgz.fsf@collabora.com Suggested-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- samples/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/Kconfig b/samples/Kconfig index 88353b8eac0b..56539b21f2c7 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -122,7 +122,7 @@ config SAMPLE_CONNECTOR config SAMPLE_FANOTIFY_ERROR bool "Build fanotify error monitoring sample" - depends on FANOTIFY + depends on FANOTIFY && CC_CAN_LINK && HEADERS_INSTALL help When enabled, this builds an example code that uses the FAN_FS_ERROR fanotify mechanism to monitor filesystem From 9abeae5d4458326e16df7ea237104b58c27dfd77 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 28 Oct 2021 18:05:49 -0300 Subject: [PATCH 219/512] docs: Fix formatting of literal sections in fanotify docs Stephen Rothwell reported the following warning was introduced by commit c0baf9ac0b05 ("docs: Document the FAN_FS_ERROR event"). Documentation/admin-guide/filesystem-monitoring.rst:60: WARNING: Definition list ends without a blank line; unexpected unindent. 
Link: https://lore.kernel.org/r/87y26camhe.fsf@collabora.com
Reported-by: Stephen Rothwell
Signed-off-by: Gabriel Krisman Bertazi
Signed-off-by: Jan Kara
---
 .../admin-guide/filesystem-monitoring.rst | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/filesystem-monitoring.rst b/Documentation/admin-guide/filesystem-monitoring.rst
index 5a3c84e60095..ab8dba76283c 100644
--- a/Documentation/admin-guide/filesystem-monitoring.rst
+++ b/Documentation/admin-guide/filesystem-monitoring.rst
@@ -35,9 +35,11 @@ notifications is Ext4.
 
 A FAN_FS_ERROR Notification has the following format::
 
-  [ Notification Metadata (Mandatory) ]
-  [ Generic Error Record (Mandatory) ]
-  [ FID record (Mandatory) ]
+  ::
+
+     [ Notification Metadata (Mandatory) ]
+     [ Generic Error Record (Mandatory) ]
+     [ FID record (Mandatory) ]
 
 The order of records is not guaranteed, and new records might be added
 in the future. Therefore, applications must not rely on the order and
@@ -53,11 +55,13 @@ providing any additional details about the problem. This record is
 identified by ``struct fanotify_event_info_header.info_type`` being set
 to FAN_EVENT_INFO_TYPE_ERROR.
 
-  struct fanotify_event_info_error {
-	struct fanotify_event_info_header hdr;
-	__s32 error;
-	__u32 error_count;
-  };
+  ::
+
+     struct fanotify_event_info_error {
+	  struct fanotify_event_info_header hdr;
+	  __s32 error;
+	  __u32 error_count;
+     };
 
 The `error` field identifies the type of error using errno values.
 `error_count` tracks the number of errors that occurred and were

From b7eccf75c28e5469bb4685a03310dbb66ee323f9 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 1 Nov 2021 12:47:32 +0100
Subject: [PATCH 220/512] samples: Fix warning in fsnotify sample

The fsnotify sample code generates the following warning on powerpc:

samples/fanotify/fs-monitor.c: In function 'handle_notifications':
samples/fanotify/fs-monitor.c:68:36: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 2 has type '__u64' {aka 'long unsigned int'} [-Wformat=]
   68 |   printf("unexpected FAN MARK: %llx\n", event->mask);
      |                                ~~~^     ~~~~~~~~~~~
      |                                   |     |
      |                                   |     __u64 {aka long unsigned int}
      |                                   long long unsigned int
      |                                %lx

Fix the problem by explicitly casting the argument to the proper type.

Reported-by: Stephen Rothwell
Signed-off-by: Jan Kara
---
 samples/fanotify/fs-monitor.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c
index a0e44cd31e6f..2e08a1807db7 100644
--- a/samples/fanotify/fs-monitor.c
+++ b/samples/fanotify/fs-monitor.c
@@ -65,7 +65,8 @@ static void handle_notifications(char *buffer, int len)
 	for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) {
 
 		if (event->mask != FAN_FS_ERROR) {
-			printf("unexpected FAN MARK: %llx\n", event->mask);
+			printf("unexpected FAN MARK: %llx\n",
+				(unsigned long long)event->mask);
 			goto next_event;
 		}

From 15c72660fe9a3fddb301ac90175860b14c63ff03 Mon Sep 17 00:00:00 2001
From: Zhang Mingyu
Date: Mon, 1 Nov 2021 07:51:52 +0000
Subject: [PATCH 221/512] samples: remove duplicate include in fs-monitor.c

'sys/types.h' included in 'samples/fanotify/fs-monitor.c' is duplicated.
It is also included on line 15.
Link: https://lore.kernel.org/r/20211101075152.35780-1-zhang.mingyu@zte.com.cn Reported-by: Zeal Robot Signed-off-by: Zhang Mingyu Signed-off-by: Jan Kara --- samples/fanotify/fs-monitor.c | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c index 2e08a1807db7..608db24c471e 100644 --- a/samples/fanotify/fs-monitor.c +++ b/samples/fanotify/fs-monitor.c @@ -12,7 +12,6 @@ #include #include #include -#include #ifndef FAN_FS_ERROR #define FAN_FS_ERROR 0x00008000 From 61d37547436dce45e3590db72360df0e230ca969 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:08 +0100 Subject: [PATCH 222/512] PCI: kirin: Reorganize the PHY logic inside the driver The pcie-kirin PCIe driver contains internally a PHY interface for Kirin 960. As the next patches will add support for using an external PHY driver, reorganize the driver in a way that the PHY part will be self-contained. This could be moved to a separate PHY driver, but a change like that would mean a non-backward-compatible DT schema change. Link: https://lore.kernel.org/r/ad2f4aa6bbb71d5c9af0139704672f75f12644fc.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song Cc: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pcie-kirin.c | 474 +++++++++++++----------- 1 file changed, 261 insertions(+), 213 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 026fd1e42a55..b4063a3434df 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -28,26 +28,16 @@ #define to_kirin_pcie(x) dev_get_drvdata((x)->dev) -#define REF_CLK_FREQ 100000000 - /* PCIe ELBI registers */ #define SOC_PCIECTRL_CTRL0_ADDR 0x000 #define SOC_PCIECTRL_CTRL1_ADDR 0x004 -#define SOC_PCIEPHY_CTRL2_ADDR 0x008 -#define SOC_PCIEPHY_CTRL3_ADDR 0x00c #define PCIE_ELBI_SLV_DBI_ENABLE (0x1 << 21) /* info located in APB */ #define PCIE_APP_LTSSM_ENABLE 0x01c -#define PCIE_APB_PHY_CTRL0 0x0 -#define PCIE_APB_PHY_CTRL1 0x4 #define PCIE_APB_PHY_STATUS0 0x400 #define PCIE_LINKUP_ENABLE (0x8020) #define PCIE_LTSSM_ENABLE_BIT (0x1 << 11) -#define PIPE_CLK_STABLE (0x1 << 19) -#define PHY_REF_PAD_BIT (0x1 << 8) -#define PHY_PWR_DOWN_BIT (0x1 << 22) -#define PHY_RST_ACK_BIT (0x1 << 16) /* info located in sysctrl */ #define SCTRL_PCIE_CMOS_OFFSET 0x60 @@ -60,6 +50,29 @@ #define PCIE_DEBOUNCE_PARAM 0xF0F400 #define PCIE_OE_BYPASS (0x3 << 28) +struct kirin_pcie { + struct dw_pcie *pci; + struct phy *phy; + void __iomem *apb_base; + void *phy_priv; /* Needed for Kirin 960 PHY */ +}; + +/* + * Kirin 960 PHY. Can't be split into a PHY driver without changing the + * DT schema. 
+ */ + +#define REF_CLK_FREQ 100000000 + +/* PHY info located in APB */ +#define PCIE_APB_PHY_CTRL0 0x0 +#define PCIE_APB_PHY_CTRL1 0x4 +#define PCIE_APB_PHY_STATUS0 0x400 +#define PIPE_CLK_STABLE BIT(19) +#define PHY_REF_PAD_BIT BIT(8) +#define PHY_PWR_DOWN_BIT BIT(22) +#define PHY_RST_ACK_BIT BIT(16) + /* peri_crg ctrl */ #define CRGCTRL_PCIE_ASSERT_OFFSET 0x88 #define CRGCTRL_PCIE_ASSERT_BIT 0x8c000000 @@ -69,8 +82,6 @@ #define REF_2_PERST_MAX 25000 #define PERST_2_ACCESS_MIN 10000 #define PERST_2_ACCESS_MAX 12000 -#define LINK_WAIT_MIN 900 -#define LINK_WAIT_MAX 1000 #define PIPE_CLK_WAIT_MIN 550 #define PIPE_CLK_WAIT_MAX 600 #define TIME_CMOS_MIN 100 @@ -78,20 +89,248 @@ #define TIME_PHY_PD_MIN 10 #define TIME_PHY_PD_MAX 11 -struct kirin_pcie { - struct dw_pcie *pci; - void __iomem *apb_base; - void __iomem *phy_base; +struct hi3660_pcie_phy { + struct device *dev; + void __iomem *base; struct regmap *crgctrl; struct regmap *sysctrl; struct clk *apb_sys_clk; struct clk *apb_phy_clk; struct clk *phy_ref_clk; - struct clk *pcie_aclk; - struct clk *pcie_aux_clk; + struct clk *aclk; + struct clk *aux_clk; int gpio_id_reset; }; +/* Registers in PCIePHY */ +static inline void kirin_apb_phy_writel(struct hi3660_pcie_phy *hi3660_pcie_phy, + u32 val, u32 reg) +{ + writel(val, hi3660_pcie_phy->base + reg); +} + +static inline u32 kirin_apb_phy_readl(struct hi3660_pcie_phy *hi3660_pcie_phy, + u32 reg) +{ + return readl(hi3660_pcie_phy->base + reg); +} + +static int hi3660_pcie_phy_get_clk(struct hi3660_pcie_phy *phy) +{ + struct device *dev = phy->dev; + + phy->phy_ref_clk = devm_clk_get(dev, "pcie_phy_ref"); + if (IS_ERR(phy->phy_ref_clk)) + return PTR_ERR(phy->phy_ref_clk); + + phy->aux_clk = devm_clk_get(dev, "pcie_aux"); + if (IS_ERR(phy->aux_clk)) + return PTR_ERR(phy->aux_clk); + + phy->apb_phy_clk = devm_clk_get(dev, "pcie_apb_phy"); + if (IS_ERR(phy->apb_phy_clk)) + return PTR_ERR(phy->apb_phy_clk); + + phy->apb_sys_clk = devm_clk_get(dev, "pcie_apb_sys"); + if (IS_ERR(phy->apb_sys_clk)) + return PTR_ERR(phy->apb_sys_clk); + + phy->aclk = devm_clk_get(dev, "pcie_aclk"); + if (IS_ERR(phy->aclk)) + return PTR_ERR(phy->aclk); + + return 0; +} + +static int hi3660_pcie_phy_get_resource(struct hi3660_pcie_phy *phy) +{ + struct device *dev = phy->dev; + struct platform_device *pdev; + + /* registers */ + pdev = container_of(dev, struct platform_device, dev); + + phy->base = devm_platform_ioremap_resource_byname(pdev, "phy"); + if (IS_ERR(phy->base)) + return PTR_ERR(phy->base); + + phy->crgctrl = syscon_regmap_lookup_by_compatible("hisilicon,hi3660-crgctrl"); + if (IS_ERR(phy->crgctrl)) + return PTR_ERR(phy->crgctrl); + + phy->sysctrl = syscon_regmap_lookup_by_compatible("hisilicon,hi3660-sctrl"); + if (IS_ERR(phy->sysctrl)) + return PTR_ERR(phy->sysctrl); + + /* gpios */ + phy->gpio_id_reset = of_get_named_gpio(dev->of_node, + "reset-gpios", 0); + if (phy->gpio_id_reset == -EPROBE_DEFER) { + return -EPROBE_DEFER; + } else if (!gpio_is_valid(phy->gpio_id_reset)) { + dev_err(phy->dev, "unable to get a valid gpio pin\n"); + return -ENODEV; + } + + return 0; +} + +static int hi3660_pcie_phy_start(struct hi3660_pcie_phy *phy) +{ + struct device *dev = phy->dev; + u32 reg_val; + + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL1); + reg_val &= ~PHY_REF_PAD_BIT; + kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL1); + + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL0); + reg_val &= ~PHY_PWR_DOWN_BIT; + kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL0); + 
usleep_range(TIME_PHY_PD_MIN, TIME_PHY_PD_MAX); + + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL1); + reg_val &= ~PHY_RST_ACK_BIT; + kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL1); + + usleep_range(PIPE_CLK_WAIT_MIN, PIPE_CLK_WAIT_MAX); + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_STATUS0); + if (reg_val & PIPE_CLK_STABLE) { + dev_err(dev, "PIPE clk is not stable\n"); + return -EINVAL; + } + + return 0; +} + +static void hi3660_pcie_phy_oe_enable(struct hi3660_pcie_phy *phy) +{ + u32 val; + + regmap_read(phy->sysctrl, SCTRL_PCIE_OE_OFFSET, &val); + val |= PCIE_DEBOUNCE_PARAM; + val &= ~PCIE_OE_BYPASS; + regmap_write(phy->sysctrl, SCTRL_PCIE_OE_OFFSET, val); +} + +static int hi3660_pcie_phy_clk_ctrl(struct hi3660_pcie_phy *phy, bool enable) +{ + int ret = 0; + + if (!enable) + goto close_clk; + + ret = clk_set_rate(phy->phy_ref_clk, REF_CLK_FREQ); + if (ret) + return ret; + + ret = clk_prepare_enable(phy->phy_ref_clk); + if (ret) + return ret; + + ret = clk_prepare_enable(phy->apb_sys_clk); + if (ret) + goto apb_sys_fail; + + ret = clk_prepare_enable(phy->apb_phy_clk); + if (ret) + goto apb_phy_fail; + + ret = clk_prepare_enable(phy->aclk); + if (ret) + goto aclk_fail; + + ret = clk_prepare_enable(phy->aux_clk); + if (ret) + goto aux_clk_fail; + + return 0; + +close_clk: + clk_disable_unprepare(phy->aux_clk); +aux_clk_fail: + clk_disable_unprepare(phy->aclk); +aclk_fail: + clk_disable_unprepare(phy->apb_phy_clk); +apb_phy_fail: + clk_disable_unprepare(phy->apb_sys_clk); +apb_sys_fail: + clk_disable_unprepare(phy->phy_ref_clk); + + return ret; +} + +static int hi3660_pcie_phy_power_on(struct kirin_pcie *pcie) +{ + struct hi3660_pcie_phy *phy = pcie->phy_priv; + int ret; + + /* Power supply for Host */ + regmap_write(phy->sysctrl, + SCTRL_PCIE_CMOS_OFFSET, SCTRL_PCIE_CMOS_BIT); + usleep_range(TIME_CMOS_MIN, TIME_CMOS_MAX); + + hi3660_pcie_phy_oe_enable(phy); + + ret = hi3660_pcie_phy_clk_ctrl(phy, true); + if (ret) + return ret; + + /* ISO disable, PCIeCtrl, PHY assert and clk gate clear */ + regmap_write(phy->sysctrl, + SCTRL_PCIE_ISO_OFFSET, SCTRL_PCIE_ISO_BIT); + regmap_write(phy->crgctrl, + CRGCTRL_PCIE_ASSERT_OFFSET, CRGCTRL_PCIE_ASSERT_BIT); + regmap_write(phy->sysctrl, + SCTRL_PCIE_HPCLK_OFFSET, SCTRL_PCIE_HPCLK_BIT); + + ret = hi3660_pcie_phy_start(phy); + if (ret) + goto disable_clks; + + /* perst assert Endpoint */ + if (!gpio_request(phy->gpio_id_reset, "pcie_perst")) { + usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX); + ret = gpio_direction_output(phy->gpio_id_reset, 1); + if (ret) + goto disable_clks; + usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); + return 0; + } + +disable_clks: + hi3660_pcie_phy_clk_ctrl(phy, false); + return ret; +} + +static int hi3660_pcie_phy_init(struct platform_device *pdev, + struct kirin_pcie *pcie) +{ + struct device *dev = &pdev->dev; + struct hi3660_pcie_phy *phy; + int ret; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) + return -ENOMEM; + + pcie->phy_priv = phy; + phy->dev = dev; + + /* registers */ + pdev = container_of(dev, struct platform_device, dev); + + ret = hi3660_pcie_phy_get_clk(phy); + if (ret) + return ret; + + return hi3660_pcie_phy_get_resource(phy); +} + +/* + * The non-PHY part starts here + */ + /* Registers in PCIeCTRL */ static inline void kirin_apb_ctrl_writel(struct kirin_pcie *kirin_pcie, u32 val, u32 reg) @@ -104,46 +343,6 @@ static inline u32 kirin_apb_ctrl_readl(struct kirin_pcie *kirin_pcie, u32 reg) return readl(kirin_pcie->apb_base + reg); } -/* Registers in PCIePHY */ 
-static inline void kirin_apb_phy_writel(struct kirin_pcie *kirin_pcie, - u32 val, u32 reg) -{ - writel(val, kirin_pcie->phy_base + reg); -} - -static inline u32 kirin_apb_phy_readl(struct kirin_pcie *kirin_pcie, u32 reg) -{ - return readl(kirin_pcie->phy_base + reg); -} - -static long kirin_pcie_get_clk(struct kirin_pcie *kirin_pcie, - struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - - kirin_pcie->phy_ref_clk = devm_clk_get(dev, "pcie_phy_ref"); - if (IS_ERR(kirin_pcie->phy_ref_clk)) - return PTR_ERR(kirin_pcie->phy_ref_clk); - - kirin_pcie->pcie_aux_clk = devm_clk_get(dev, "pcie_aux"); - if (IS_ERR(kirin_pcie->pcie_aux_clk)) - return PTR_ERR(kirin_pcie->pcie_aux_clk); - - kirin_pcie->apb_phy_clk = devm_clk_get(dev, "pcie_apb_phy"); - if (IS_ERR(kirin_pcie->apb_phy_clk)) - return PTR_ERR(kirin_pcie->apb_phy_clk); - - kirin_pcie->apb_sys_clk = devm_clk_get(dev, "pcie_apb_sys"); - if (IS_ERR(kirin_pcie->apb_sys_clk)) - return PTR_ERR(kirin_pcie->apb_sys_clk); - - kirin_pcie->pcie_aclk = devm_clk_get(dev, "pcie_aclk"); - if (IS_ERR(kirin_pcie->pcie_aclk)) - return PTR_ERR(kirin_pcie->pcie_aclk); - - return 0; -} - static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, struct platform_device *pdev) { @@ -152,151 +351,9 @@ static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, if (IS_ERR(kirin_pcie->apb_base)) return PTR_ERR(kirin_pcie->apb_base); - kirin_pcie->phy_base = - devm_platform_ioremap_resource_byname(pdev, "phy"); - if (IS_ERR(kirin_pcie->phy_base)) - return PTR_ERR(kirin_pcie->phy_base); - - kirin_pcie->crgctrl = - syscon_regmap_lookup_by_compatible("hisilicon,hi3660-crgctrl"); - if (IS_ERR(kirin_pcie->crgctrl)) - return PTR_ERR(kirin_pcie->crgctrl); - - kirin_pcie->sysctrl = - syscon_regmap_lookup_by_compatible("hisilicon,hi3660-sctrl"); - if (IS_ERR(kirin_pcie->sysctrl)) - return PTR_ERR(kirin_pcie->sysctrl); - return 0; } -static int kirin_pcie_phy_init(struct kirin_pcie *kirin_pcie) -{ - struct device *dev = kirin_pcie->pci->dev; - u32 reg_val; - - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL1); - reg_val &= ~PHY_REF_PAD_BIT; - kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL1); - - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL0); - reg_val &= ~PHY_PWR_DOWN_BIT; - kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL0); - usleep_range(TIME_PHY_PD_MIN, TIME_PHY_PD_MAX); - - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL1); - reg_val &= ~PHY_RST_ACK_BIT; - kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL1); - - usleep_range(PIPE_CLK_WAIT_MIN, PIPE_CLK_WAIT_MAX); - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_STATUS0); - if (reg_val & PIPE_CLK_STABLE) { - dev_err(dev, "PIPE clk is not stable\n"); - return -EINVAL; - } - - return 0; -} - -static void kirin_pcie_oe_enable(struct kirin_pcie *kirin_pcie) -{ - u32 val; - - regmap_read(kirin_pcie->sysctrl, SCTRL_PCIE_OE_OFFSET, &val); - val |= PCIE_DEBOUNCE_PARAM; - val &= ~PCIE_OE_BYPASS; - regmap_write(kirin_pcie->sysctrl, SCTRL_PCIE_OE_OFFSET, val); -} - -static int kirin_pcie_clk_ctrl(struct kirin_pcie *kirin_pcie, bool enable) -{ - int ret = 0; - - if (!enable) - goto close_clk; - - ret = clk_set_rate(kirin_pcie->phy_ref_clk, REF_CLK_FREQ); - if (ret) - return ret; - - ret = clk_prepare_enable(kirin_pcie->phy_ref_clk); - if (ret) - return ret; - - ret = clk_prepare_enable(kirin_pcie->apb_sys_clk); - if (ret) - goto apb_sys_fail; - - ret = clk_prepare_enable(kirin_pcie->apb_phy_clk); - if (ret) - goto 
apb_phy_fail; - - ret = clk_prepare_enable(kirin_pcie->pcie_aclk); - if (ret) - goto aclk_fail; - - ret = clk_prepare_enable(kirin_pcie->pcie_aux_clk); - if (ret) - goto aux_clk_fail; - - return 0; - -close_clk: - clk_disable_unprepare(kirin_pcie->pcie_aux_clk); -aux_clk_fail: - clk_disable_unprepare(kirin_pcie->pcie_aclk); -aclk_fail: - clk_disable_unprepare(kirin_pcie->apb_phy_clk); -apb_phy_fail: - clk_disable_unprepare(kirin_pcie->apb_sys_clk); -apb_sys_fail: - clk_disable_unprepare(kirin_pcie->phy_ref_clk); - - return ret; -} - -static int kirin_pcie_power_on(struct kirin_pcie *kirin_pcie) -{ - int ret; - - /* Power supply for Host */ - regmap_write(kirin_pcie->sysctrl, - SCTRL_PCIE_CMOS_OFFSET, SCTRL_PCIE_CMOS_BIT); - usleep_range(TIME_CMOS_MIN, TIME_CMOS_MAX); - kirin_pcie_oe_enable(kirin_pcie); - - ret = kirin_pcie_clk_ctrl(kirin_pcie, true); - if (ret) - return ret; - - /* ISO disable, PCIeCtrl, PHY assert and clk gate clear */ - regmap_write(kirin_pcie->sysctrl, - SCTRL_PCIE_ISO_OFFSET, SCTRL_PCIE_ISO_BIT); - regmap_write(kirin_pcie->crgctrl, - CRGCTRL_PCIE_ASSERT_OFFSET, CRGCTRL_PCIE_ASSERT_BIT); - regmap_write(kirin_pcie->sysctrl, - SCTRL_PCIE_HPCLK_OFFSET, SCTRL_PCIE_HPCLK_BIT); - - ret = kirin_pcie_phy_init(kirin_pcie); - if (ret) - goto close_clk; - - /* perst assert Endpoint */ - if (!gpio_request(kirin_pcie->gpio_id_reset, "pcie_perst")) { - usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX); - ret = gpio_direction_output(kirin_pcie->gpio_id_reset, 1); - if (ret) - goto close_clk; - usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); - - return 0; - } - -close_clk: - kirin_pcie_clk_ctrl(kirin_pcie, false); - return ret; -} - static void kirin_pcie_sideband_dbi_w_mode(struct kirin_pcie *kirin_pcie, bool on) { @@ -444,7 +501,7 @@ static int kirin_pcie_probe(struct platform_device *pdev) pci->pp.ops = &kirin_pcie_host_ops; kirin_pcie->pci = pci; - ret = kirin_pcie_get_clk(kirin_pcie, pdev); + ret = hi3660_pcie_phy_init(pdev, kirin_pcie); if (ret) return ret; @@ -452,16 +509,7 @@ static int kirin_pcie_probe(struct platform_device *pdev) if (ret) return ret; - kirin_pcie->gpio_id_reset = of_get_named_gpio(dev->of_node, - "reset-gpios", 0); - if (kirin_pcie->gpio_id_reset == -EPROBE_DEFER) { - return -EPROBE_DEFER; - } else if (!gpio_is_valid(kirin_pcie->gpio_id_reset)) { - dev_err(dev, "unable to get a valid gpio pin\n"); - return -ENODEV; - } - - ret = kirin_pcie_power_on(kirin_pcie); + ret = hi3660_pcie_phy_power_on(kirin_pcie); if (ret) return ret; @@ -479,8 +527,8 @@ static struct platform_driver kirin_pcie_driver = { .probe = kirin_pcie_probe, .driver = { .name = "kirin-pcie", - .of_match_table = kirin_pcie_match, - .suppress_bind_attrs = true, + .of_match_table = kirin_pcie_match, + .suppress_bind_attrs = true, }, }; builtin_platform_driver(kirin_pcie_driver); From 000f60db784b6461b2e6c1b1bdda8699225b5524 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:09 +0100 Subject: [PATCH 223/512] PCI: kirin: Add support for a PHY layer The pcie-kirin driver contains both PHY and generic PCI driver. The best would be, instead, to support a PCI PHY driver, making the driver more generic. However, it is too late to remove the Kirin 960 PHY, as a change like that would make the DT schema incompatible with past versions. So, add support for an external PHY driver without removing the existing Kirin 960 PHY from it. 
Link: https://lore.kernel.org/r/f38361df2e9d0dc5a38ff942b631f7fef64cdc12.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Kishon Vijay Abraham I Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 93 +++++++++++++++++++++---- 1 file changed, 79 insertions(+), 14 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index b4063a3434df..91a7c096bf8f 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -8,16 +8,18 @@ * Author: Xiaowei Song */ -#include #include +#include #include #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -50,11 +52,18 @@ #define PCIE_DEBOUNCE_PARAM 0xF0F400 #define PCIE_OE_BYPASS (0x3 << 28) +enum pcie_kirin_phy_type { + PCIE_KIRIN_INTERNAL_PHY, + PCIE_KIRIN_EXTERNAL_PHY +}; + struct kirin_pcie { + enum pcie_kirin_phy_type type; + struct dw_pcie *pci; struct phy *phy; void __iomem *apb_base; - void *phy_priv; /* Needed for Kirin 960 PHY */ + void *phy_priv; /* only for PCIE_KIRIN_INTERNAL_PHY */ }; /* @@ -476,8 +485,63 @@ static const struct dw_pcie_host_ops kirin_pcie_host_ops = { .host_init = kirin_pcie_host_init, }; +static int kirin_pcie_power_on(struct platform_device *pdev, + struct kirin_pcie *kirin_pcie) +{ + struct device *dev = &pdev->dev; + int ret; + + if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) { + ret = hi3660_pcie_phy_init(pdev, kirin_pcie); + if (ret) + return ret; + + return hi3660_pcie_phy_power_on(kirin_pcie); + } + + kirin_pcie->phy = devm_of_phy_get(dev, dev->of_node, NULL); + if (IS_ERR(kirin_pcie->phy)) + return PTR_ERR(kirin_pcie->phy); + + ret = phy_init(kirin_pcie->phy); + if (ret) + goto err; + + ret = phy_power_on(kirin_pcie->phy); + if (ret) + goto err; + + return 0; +err: + phy_exit(kirin_pcie->phy); + return ret; +} + +static int __exit kirin_pcie_remove(struct platform_device *pdev) +{ + struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev); + + if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) + return 0; + + phy_power_off(kirin_pcie->phy); + phy_exit(kirin_pcie->phy); + + return 0; +} + +static const struct of_device_id kirin_pcie_match[] = { + { + .compatible = "hisilicon,kirin960-pcie", + .data = (void *)PCIE_KIRIN_INTERNAL_PHY + }, + {}, +}; + static int kirin_pcie_probe(struct platform_device *pdev) { + enum pcie_kirin_phy_type phy_type; + const struct of_device_id *of_id; struct device *dev = &pdev->dev; struct kirin_pcie *kirin_pcie; struct dw_pcie *pci; @@ -488,6 +552,14 @@ static int kirin_pcie_probe(struct platform_device *pdev) return -EINVAL; } + of_id = of_match_device(kirin_pcie_match, dev); + if (!of_id) { + dev_err(dev, "OF data missing\n"); + return -EINVAL; + } + + phy_type = (long)of_id->data; + kirin_pcie = devm_kzalloc(dev, sizeof(struct kirin_pcie), GFP_KERNEL); if (!kirin_pcie) return -ENOMEM; @@ -500,31 +572,24 @@ static int kirin_pcie_probe(struct platform_device *pdev) pci->ops = &kirin_dw_pcie_ops; pci->pp.ops = &kirin_pcie_host_ops; kirin_pcie->pci = pci; - - ret = hi3660_pcie_phy_init(pdev, kirin_pcie); - if (ret) - return ret; + kirin_pcie->type = phy_type; ret = kirin_pcie_get_resource(kirin_pcie, pdev); if (ret) return ret; - ret = hi3660_pcie_phy_power_on(kirin_pcie); + platform_set_drvdata(pdev, kirin_pcie); + + ret = kirin_pcie_power_on(pdev, kirin_pcie); if (ret) return ret; - platform_set_drvdata(pdev, 
kirin_pcie); - return dw_pcie_host_init(&pci->pp); } -static const struct of_device_id kirin_pcie_match[] = { - { .compatible = "hisilicon,kirin960-pcie" }, - {}, -}; - static struct platform_driver kirin_pcie_driver = { .probe = kirin_pcie_probe, + .remove = __exit_p(kirin_pcie_remove), .driver = { .name = "kirin-pcie", .of_match_table = kirin_pcie_match, From d19afe7be12689bb9ca245122ec5118c3f976e2e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:10 +0100 Subject: [PATCH 224/512] PCI: kirin: Use regmap for APB registers The PHY layer need to access APB registers too, for Kirin 970. So place them into a named regmap. Link: https://lore.kernel.org/r/daf0e4bda5a69a5ac8484e70f09351a959805c8c.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 49 +++++++++++++------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 91a7c096bf8f..86c13661e02d 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -61,8 +61,8 @@ struct kirin_pcie { enum pcie_kirin_phy_type type; struct dw_pcie *pci; + struct regmap *apb; struct phy *phy; - void __iomem *apb_base; void *phy_priv; /* only for PCIE_KIRIN_INTERNAL_PHY */ }; @@ -340,25 +340,27 @@ static int hi3660_pcie_phy_init(struct platform_device *pdev, * The non-PHY part starts here */ -/* Registers in PCIeCTRL */ -static inline void kirin_apb_ctrl_writel(struct kirin_pcie *kirin_pcie, - u32 val, u32 reg) -{ - writel(val, kirin_pcie->apb_base + reg); -} - -static inline u32 kirin_apb_ctrl_readl(struct kirin_pcie *kirin_pcie, u32 reg) -{ - return readl(kirin_pcie->apb_base + reg); -} +static const struct regmap_config pcie_kirin_regmap_conf = { + .name = "kirin_pcie_apb", + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, +}; static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, struct platform_device *pdev) { - kirin_pcie->apb_base = - devm_platform_ioremap_resource_byname(pdev, "apb"); - if (IS_ERR(kirin_pcie->apb_base)) - return PTR_ERR(kirin_pcie->apb_base); + struct device *dev = &pdev->dev; + void __iomem *apb_base; + + apb_base = devm_platform_ioremap_resource_byname(pdev, "apb"); + if (IS_ERR(apb_base)) + return PTR_ERR(apb_base); + + kirin_pcie->apb = devm_regmap_init_mmio(dev, apb_base, + &pcie_kirin_regmap_conf); + if (IS_ERR(kirin_pcie->apb)) + return PTR_ERR(kirin_pcie->apb); return 0; } @@ -368,13 +370,13 @@ static void kirin_pcie_sideband_dbi_w_mode(struct kirin_pcie *kirin_pcie, { u32 val; - val = kirin_apb_ctrl_readl(kirin_pcie, SOC_PCIECTRL_CTRL0_ADDR); + regmap_read(kirin_pcie->apb, SOC_PCIECTRL_CTRL0_ADDR, &val); if (on) val = val | PCIE_ELBI_SLV_DBI_ENABLE; else val = val & ~PCIE_ELBI_SLV_DBI_ENABLE; - kirin_apb_ctrl_writel(kirin_pcie, val, SOC_PCIECTRL_CTRL0_ADDR); + regmap_write(kirin_pcie->apb, SOC_PCIECTRL_CTRL0_ADDR, val); } static void kirin_pcie_sideband_dbi_r_mode(struct kirin_pcie *kirin_pcie, @@ -382,13 +384,13 @@ static void kirin_pcie_sideband_dbi_r_mode(struct kirin_pcie *kirin_pcie, { u32 val; - val = kirin_apb_ctrl_readl(kirin_pcie, SOC_PCIECTRL_CTRL1_ADDR); + regmap_read(kirin_pcie->apb, SOC_PCIECTRL_CTRL1_ADDR, &val); if (on) val = val | PCIE_ELBI_SLV_DBI_ENABLE; else val = val & ~PCIE_ELBI_SLV_DBI_ENABLE; - kirin_apb_ctrl_writel(kirin_pcie, val, 
SOC_PCIECTRL_CTRL1_ADDR); + regmap_write(kirin_pcie->apb, SOC_PCIECTRL_CTRL1_ADDR, val); } static int kirin_pcie_rd_own_conf(struct pci_bus *bus, unsigned int devfn, @@ -448,8 +450,9 @@ static void kirin_pcie_write_dbi(struct dw_pcie *pci, void __iomem *base, static int kirin_pcie_link_up(struct dw_pcie *pci) { struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci); - u32 val = kirin_apb_ctrl_readl(kirin_pcie, PCIE_APB_PHY_STATUS0); + u32 val; + regmap_read(kirin_pcie->apb, PCIE_APB_PHY_STATUS0, &val); if ((val & PCIE_LINKUP_ENABLE) == PCIE_LINKUP_ENABLE) return 1; @@ -461,8 +464,8 @@ static int kirin_pcie_start_link(struct dw_pcie *pci) struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci); /* assert LTSSM enable */ - kirin_apb_ctrl_writel(kirin_pcie, PCIE_LTSSM_ENABLE_BIT, - PCIE_APP_LTSSM_ENABLE); + regmap_write(kirin_pcie->apb, PCIE_APP_LTSSM_ENABLE, + PCIE_LTSSM_ENABLE_BIT); return 0; } From 7be3248f313930ff3d3436d4e9ddbe9fccc1f541 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Thu, 14 Oct 2021 11:52:39 +0000 Subject: [PATCH 225/512] cifs: To match file servers, make sure the server hostname matches We generally rely on a bunch of factors to differentiate between servers. For example, IP address, port etc. For certain server types (like Azure), it is important to make sure that the server hostname matches too, even if the both hostnames currently resolve to the same IP address. Signed-off-by: Shyam Prasad N Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/connect.c | 19 +++++++++++-------- fs/cifs/fs_context.c | 8 ++++++++ fs/cifs/fs_context.h | 1 + 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c3b94c1e4591..e6e261dfd107 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -794,7 +794,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) */ } - kfree(server->hostname); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -1235,6 +1234,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context * if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) return 0; + if (strcasecmp(server->hostname, ctx->server_hostname)) + return 0; + if (!match_address(server, addr, (struct sockaddr *)&ctx->srcaddr)) return 0; @@ -1336,6 +1338,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + kfree(server->hostname); task = xchg(&server->tsk, NULL); if (task) @@ -1361,14 +1364,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx) goto out_err; } + tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL); + if (!tcp_ses->hostname) { + rc = -ENOMEM; + goto out_err; + } + tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); - tcp_ses->hostname = extract_hostname(ctx->UNC); - if (IS_ERR(tcp_ses->hostname)) { - rc = PTR_ERR(tcp_ses->hostname); - goto out_err_crypto_release; - } tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; @@ -1497,8 +1501,7 @@ out_err_crypto_release: out_err: if (tcp_ses) { - if (!IS_ERR(tcp_ses->hostname)) - kfree(tcp_ses->hostname); + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 3109def8e199..4130908af8d7 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -318,6 +318,7 @@ 
smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(mount_options); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); @@ -456,6 +457,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (!pos) return -EINVAL; + /* record the server hostname */ + ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL); + if (!ctx->server_hostname) + return -ENOMEM; + /* skip past delimiter */ ++pos; @@ -1496,6 +1502,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree(ctx->server_hostname); + ctx->server_hostname = NULL; kfree(ctx->UNC); ctx->UNC = NULL; kfree(ctx->source); diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index a42ba71d7a81..29601a4eb411 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -166,6 +166,7 @@ struct smb3_fs_context { char *password; char *domainname; char *source; + char *server_hostname; char *UNC; char *nodename; char *iocharset; /* local code page for mapping to and from Unicode */ From 7ae5e588b0a53a72819e661106cbe99dde83b41d Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Oct 2021 15:54:26 -0500 Subject: [PATCH 226/512] cifs: add mount parameter tcpnodelay Although corking and uncorking the socket (which cifs.ko already does) should usually have the desired benefit, using the new tcpnodelay mount option causes tcp_sock_set_nodelay() to be set on the socket which may be useful in order to ensure that we don't ever have cases where the network stack is waiting on sending an SMB request until multiple SMB requests have been added to the send queue (since this could lead to long latencies). 
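For context, a rough sketch (not part of this patch; the variable names approximate the cifs socket-setup path and the exact plumbing is simplified) of how the parsed flag is expected to reach the TCP socket once the connection is established:

	/* illustrative only */
	if (ctx->sockopt_tcp_nodelay)
		tcp_sock_set_nodelay(sock->sk);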
To enable it, simply append "tcpnodelay" to the mount options. Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/fs_context.c | 8 ++++++++ fs/cifs/fs_context.h | 1 + 2 files changed, 9 insertions(+) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 4130908af8d7..38d96a480745 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -116,6 +116,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("nosharesock", Opt_nosharesock), fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), + fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), @@ -1389,6 +1390,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } } break; + case Opt_tcp_nodelay: + /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */ + if (result.negated) + ctx->sockopt_tcp_nodelay = false; + else + ctx->sockopt_tcp_nodelay = true; + break; case Opt_domainauto: ctx->domainauto = true; break; diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 29601a4eb411..b2d22cf9cb18 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -98,6 +98,7 @@ enum cifs_param { Opt_nosharesock, Opt_persistent, Opt_resilient, + Opt_tcp_nodelay, Opt_domainauto, Opt_rdma, Opt_modesid, From 31dedb8ed11e0d9aa266bb33e46d827006c4a72f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 13 Oct 2021 00:31:45 +0000 Subject: [PATCH 227/512] PCI: cpqphp: Use <linux/io.h> instead of <asm/io.h> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the preferred generic header file <linux/io.h> that already includes the corresponding <asm/io.h> file. Link: https://lore.kernel.org/r/20211013003145.1107148-2-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Derrick --- drivers/pci/hotplug/cpqphp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/cpqphp.h b/drivers/pci/hotplug/cpqphp.h index 77e4e0142fbc..2f7b49ea96e2 100644 --- a/drivers/pci/hotplug/cpqphp.h +++ b/drivers/pci/hotplug/cpqphp.h @@ -15,7 +15,7 @@ #define _CPQPHP_H #include -#include <asm/io.h> /* for read? and write? functions */ +#include <linux/io.h> /* for read? and write? functions */ #include /* for delays */ #include #include /* for signal_pending() */ From 496bb18483cc0474913e81e18a6b313aaea4c120 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 27 Jun 2021 13:46:24 +0200 Subject: [PATCH 228/512] PCI: j721e: Fix j721e_pcie_probe() error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an error occurs after a successful cdns_pcie_init_phy() call, it must be undone by a cdns_pcie_disable_phy() call, as already done above and below. Update the goto to branch at the correct place of the error handling path.
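As a generic illustration of the idiom (hypothetical function and label names, not the actual j721e code): each error label must undo everything that was set up before the jump to it, so a failure that happens after the PHY has been initialized has to enter the unwind chain at the label that disables the PHY:

	ret = init_phy(priv);
	if (ret)
		goto err_put_pm;		/* PHY not initialized yet */

	ret = enable_refclk(priv);
	if (ret)
		goto err_disable_phy;		/* PHY is up and must be torn down */

	return 0;

err_disable_phy:
	disable_phy(priv);
err_put_pm:
	pm_runtime_put_sync(dev);
	return ret;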
Link: https://lore.kernel.org/r/db477b0cb444891a17c4bb424467667dc30d0bab.1624794264.git.christophe.jaillet@wanadoo.fr Fixes: 49e0efdce791 ("PCI: j721e: Add support to provide refclk to PCIe connector") Signed-off-by: Christophe JAILLET Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/cadence/pci-j721e.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index ffb176d288cd..918e11082e6a 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -474,7 +474,7 @@ static int j721e_pcie_probe(struct platform_device *pdev) ret = clk_prepare_enable(clk); if (ret) { dev_err(dev, "failed to enable pcie_refclk\n"); - goto err_get_sync; + goto err_pcie_setup; } pcie->refclk = clk; From 27cd7e3c9bb1ae13bc16f08138edd6e4df3cd211 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 21 Oct 2021 02:50:19 +0000 Subject: [PATCH 229/512] PCI: cadence: Add cdns_plat_pcie_probe() missing return When cdns_plat_pcie_probe() succeeds, return success instead of falling into the error handling code. Fixes: bd22885aa188 ("PCI: cadence: Refactor driver to use as a core library") Link: https://lore.kernel.org/r/DM6PR19MB40271B93057D949310F0B0EDA0BF9@DM6PR19MB4027.namprd19.prod.outlook.com Signed-off-by: Xuliang Zhang Signed-off-by: Li Chen Signed-off-by: Bjorn Helgaas Reviewed-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- drivers/pci/controller/cadence/pcie-cadence-plat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/controller/cadence/pcie-cadence-plat.c b/drivers/pci/controller/cadence/pcie-cadence-plat.c index 5fee0f89ab59..a224afadbcc0 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-plat.c +++ b/drivers/pci/controller/cadence/pcie-cadence-plat.c @@ -127,6 +127,8 @@ static int cdns_plat_pcie_probe(struct platform_device *pdev) goto err_init; } + return 0; + err_init: err_get_sync: pm_runtime_put_sync(dev); From ca25c63779ca966ff3dd4ea20af1401452dbbe75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 13 Oct 2021 00:31:44 +0000 Subject: [PATCH 230/512] PCI: vmd: Drop redundant includes of , MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already include and , which include and . Drop the redundant includes of and . [bhelgaas: squash in fix from Wan Jiabing : https://lore.kernel.org/r/20211104063720.29375-1-wanjiabing@vivo.com] Link: https://lore.kernel.org/r/20211013003145.1107148-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Derrick --- drivers/pci/controller/vmd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a5987e52700e..1ed2667069ab 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -18,8 +18,6 @@ #include #include -#include -#include #define VMD_CFGBAR 0 #define VMD_MEMBAR1 2 From 5ec0a6fcb60ea430f8ee7e0bec22db9b22f856d3 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sat, 11 Sep 2021 03:03:05 -0700 Subject: [PATCH 231/512] PCI: Do not enable AtomicOps on VFs Host crashes when pci_enable_atomic_ops_to_root() is called for VFs with virtual buses. The virtual buses added to SR-IOV have bus->self set to NULL and host crashes due to this. PID: 4481 TASK: ffff89c6941b0000 CPU: 53 COMMAND: "bash" ... 
#3 [ffff9a9481713808] oops_end at ffffffffb9025cd6 #4 [ffff9a9481713828] page_fault_oops at ffffffffb906e417 #5 [ffff9a9481713888] exc_page_fault at ffffffffb9a0ad14 #6 [ffff9a94817138b0] asm_exc_page_fault at ffffffffb9c00ace [exception RIP: pcie_capability_read_dword+28] RIP: ffffffffb952fd5c RSP: ffff9a9481713960 RFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff89c6b1096000 RCX: 0000000000000000 RDX: ffff9a9481713990 RSI: 0000000000000024 RDI: 0000000000000000 RBP: 0000000000000080 R8: 0000000000000008 R9: ffff89c64341a2f8 R10: 0000000000000002 R11: 0000000000000000 R12: ffff89c648bab000 R13: 0000000000000000 R14: 0000000000000000 R15: ffff89c648bab0c8 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff9a9481713988] pci_enable_atomic_ops_to_root at ffffffffb95359a6 #8 [ffff9a94817139c0] bnxt_qplib_determine_atomics at ffffffffc08c1a33 [bnxt_re] #9 [ffff9a94817139d0] bnxt_re_dev_init at ffffffffc08ba2d1 [bnxt_re] Per PCIe r5.0, sec 9.3.5.10, the AtomicOp Requester Enable bit in Device Control 2 is reserved for VFs. The PF value applies to all associated VFs. Return -EINVAL if pci_enable_atomic_ops_to_root() is called for a VF. Link: https://lore.kernel.org/r/1631354585-16597-1-git-send-email-selvin.xavier@broadcom.com Fixes: 35f5ace5dea4 ("RDMA/bnxt_re: Enable global atomic ops if platform supports") Fixes: 430a23689dea ("PCI: Add pci_enable_atomic_ops_to_root()") Signed-off-by: Selvin Xavier Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Gospodarek --- drivers/pci/pci.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c63598c1cdd8..e55d944ff30f 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3719,6 +3719,14 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) struct pci_dev *bridge; u32 cap, ctl2; + /* + * Per PCIe r5.0, sec 9.3.5.10, the AtomicOp Requester Enable bit + * in Device Control 2 is reserved in VFs and the PF value applies + * to all associated VFs. + */ + if (dev->is_virtfn) + return -EINVAL; + if (!pci_is_pcie(dev)) return -EINVAL; From 0ab8d0f6ae3f1234e38eaedc3ff7c22bfdf7f78c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:34 +0100 Subject: [PATCH 232/512] irqdomain: Make of_phandle_args_to_fwspec() generally available of_phandle_args_to_fwspec() can be generally useful to code extracting a DT of_phandle and using an irq_fwspec to use the hierarchical irqdomain API. Make it visible to the rest of the kernel, including modules. 
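As an illustration of the intended use (a hedged sketch, not taken from this patch; the surrounding driver context is hypothetical), a caller that has already parsed an interrupt specifier from DT could now do:

	static int example_create_mapping(struct device_node *np, int index)
	{
		struct of_phandle_args args;
		struct irq_fwspec fwspec;
		int ret;

		/* extract the phandle + specifier for the index-th interrupt */
		ret = of_irq_parse_one(np, index, &args);
		if (ret)
			return ret;

		/* convert it and create a mapping via the hierarchical API */
		of_phandle_args_to_fwspec(args.np, args.args, args.args_count,
					  &fwspec);
		return irq_create_fwspec_mapping(&fwspec);
	}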
Link: https://lore.kernel.org/r/20210929163847.2807812-2-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- include/linux/irqdomain.h | 4 ++++ kernel/irq/irqdomain.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 23e4ee523576..cfd442316f39 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -64,6 +64,10 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/* Conversion function from of_phandle_args fields to fwspec */ +void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec); + /* * Should several domains have the same device node, but serve * different purposes (for example one domain is for PCI/MSI, and the diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 19e83e9b723c..5a698c1f6cc6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -744,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d, return 0; } -static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, - unsigned int count, - struct irq_fwspec *fwspec) +void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec) { int i; @@ -756,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, for (i = 0; i < count; i++) fwspec->param[i] = args[i]; } +EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { From 0412841812265734c306ba5ef8088bcb64d5d3bd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:35 +0100 Subject: [PATCH 233/512] of/irq: Allow matching of an interrupt-map local to an interrupt controller of_irq_parse_raw() has a baked assumption that if a node has an interrupt-controller property, it cannot possibly also have an interrupt-map property (the latter being ignored). This seems to be an odd behaviour, and there is no reason why we should avoid supporting this use case. This is specially useful when a PCI root port acts as an interrupt controller for PCI endpoints, such as this: pcie0: pcie@690000000 { [...] port00: pci@0,0 { device_type = "pci"; [...] #address-cells = <3>; interrupt-controller; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0 0 0 1 &port00 0 0 0 0>, <0 0 0 2 &port00 0 0 0 1>, <0 0 0 3 &port00 0 0 0 2>, <0 0 0 4 &port00 0 0 0 3>; }; }; Handle it by detecting that we have an interrupt-map early in the parsing, and special case the situation where the phandle in the interrupt map refers to the current node (which is the interesting case here). 
Link: https://lore.kernel.org/r/20210929163847.2807812-3-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring --- drivers/of/irq.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 352e14b007e7..32be5a03951f 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -156,10 +156,14 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) /* Now start the actual "proper" walk of the interrupt tree */ while (ipar != NULL) { - /* Now check if cursor is an interrupt-controller and if it is - * then we are done + /* + * Now check if cursor is an interrupt-controller and + * if it is then we are done, unless there is an + * interrupt-map which takes precedence. */ - if (of_property_read_bool(ipar, "interrupt-controller")) { + imap = of_get_property(ipar, "interrupt-map", &imaplen); + if (imap == NULL && + of_property_read_bool(ipar, "interrupt-controller")) { pr_debug(" -> got it !\n"); return 0; } @@ -173,8 +177,6 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) goto fail; } - /* Now look for an interrupt-map */ - imap = of_get_property(ipar, "interrupt-map", &imaplen); /* No interrupt map, check for an interrupt parent */ if (imap == NULL) { pr_debug(" -> no map, getting parent\n"); @@ -255,6 +257,11 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) out_irq->args_count = intsize = newintsize; addrsize = newaddrsize; + if (ipar == newpar) { + pr_debug("%pOF interrupt-map entry to self\n", ipar); + return 0; + } + skiplevel: /* Iterate again with new parent */ out_irq->np = newpar; From 978fd0056e19defc406208556e6eee133784b605 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:36 +0100 Subject: [PATCH 234/512] PCI: of: Allow matching of an interrupt-map local to a PCI device Just as we now allow an interrupt map to be parsed when part of an interrupt controller, there is no reason to ignore an interrupt map that would be part of a pci device node such as a root port since we already allow interrupt specifiers. Allow the matching of such property when local to the node of a PCI device, which allows the device itself to use the interrupt map for for its own purpose. Link: https://lore.kernel.org/r/20210929163847.2807812-4-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring --- drivers/pci/of.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pci/of.c b/drivers/pci/of.c index d84381ce82b5..0b1237cff239 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -423,7 +423,7 @@ failed: */ static int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args *out_irq) { - struct device_node *dn, *ppnode; + struct device_node *dn, *ppnode = NULL; struct pci_dev *ppdev; __be32 laddr[3]; u8 pin; @@ -452,8 +452,14 @@ static int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args * if (pin == 0) return -ENODEV; + /* Local interrupt-map in the device node? Use it! 
*/ + if (of_get_property(dn, "interrupt-map", NULL)) { + pin = pci_swizzle_interrupt_pin(pdev, pin); + ppnode = dn; + } + /* Now we walk up the PCI tree */ - for (;;) { + while (!ppnode) { /* Get the pci_dev of our parent */ ppdev = pdev->bus->self; From 1e33888fbe44ade6e9d54eb7c6c5e92d1455ff08 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 29 Sep 2021 17:38:37 +0100 Subject: [PATCH 235/512] PCI: apple: Add initial hardware bring-up Add a minimal driver to bring up the PCIe bus on Apple system-on-chips, particularly the Apple M1. This driver exposes the internal bus used for the USB type-A ports, Ethernet, Wi-Fi, and Bluetooth. Bringing up the radios requires additional drivers beyond what's necessary for PCIe itself. Co-developed-by: Stan Skowronek Link: https://lore.kernel.org/r/20210929163847.2807812-5-maz@kernel.org Signed-off-by: Stan Skowronek Signed-off-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring Reviewed-by: Sven Peter --- MAINTAINERS | 7 + drivers/pci/controller/Kconfig | 13 ++ drivers/pci/controller/Makefile | 1 + drivers/pci/controller/pcie-apple.c | 238 ++++++++++++++++++++++++++++ 4 files changed, 259 insertions(+) create mode 100644 drivers/pci/controller/pcie-apple.c diff --git a/MAINTAINERS b/MAINTAINERS index ca6d6fde85cf..af7b775fd54f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1280,6 +1280,13 @@ S: Maintained F: Documentation/devicetree/bindings/iommu/apple,dart.yaml F: drivers/iommu/apple-dart.c +APPLE PCIE CONTROLLER DRIVER +M: Alyssa Rosenzweig +M: Marc Zyngier +L: linux-pci@vger.kernel.org +S: Maintained +F: drivers/pci/controller/pcie-apple.c + APPLE SMC DRIVER M: Henrik Rydberg L: linux-hwmon@vger.kernel.org diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 326f7d13024f..953d9acc031f 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -312,6 +312,19 @@ config PCIE_HISI_ERR Say Y here if you want error handling support for the PCIe controller's errors on HiSilicon HIP SoCs +config PCIE_APPLE + tristate "Apple PCIe controller" + depends on ARCH_APPLE || COMPILE_TEST + depends on OF + depends on PCI_MSI_IRQ_DOMAIN + select PCI_HOST_COMMON + help + Say Y here if you want to enable PCIe controller support on Apple + system-on-chips, like the Apple M1. This is required for the USB + type-A ports, Ethernet, Wi-Fi, and Bluetooth. + + If unsure, say Y if you have an Apple Silicon system. + source "drivers/pci/controller/dwc/Kconfig" source "drivers/pci/controller/mobiveil/Kconfig" source "drivers/pci/controller/cadence/Kconfig" diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index aaf30b3dcc14..f9d40bad932c 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_VMD) += vmd.o obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o obj-$(CONFIG_PCI_LOONGSON) += pci-loongson.o obj-$(CONFIG_PCIE_HISI_ERR) += pcie-hisi-error.o +obj-$(CONFIG_PCIE_APPLE) += pcie-apple.o # pcie-hisi.o quirks are needed even without CONFIG_PCIE_DW obj-y += dwc/ obj-y += mobiveil/ diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c new file mode 100644 index 000000000000..ed78af70dbe7 --- /dev/null +++ b/drivers/pci/controller/pcie-apple.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe host bridge driver for Apple system-on-chips. 
+ * + * The HW is ECAM compliant, so once the controller is initialized, + * the driver mostly deals MSI mapping and handling of per-port + * interrupts (INTx, management and error signals). + * + * Initialization requires enabling power and clocks, along with a + * number of register pokes. + * + * Copyright (C) 2021 Alyssa Rosenzweig + * Copyright (C) 2021 Google LLC + * Copyright (C) 2021 Corellium LLC + * Copyright (C) 2021 Mark Kettenis + * + * Author: Alyssa Rosenzweig + * Author: Marc Zyngier + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define CORE_RC_PHYIF_CTL 0x00024 +#define CORE_RC_PHYIF_CTL_RUN BIT(0) +#define CORE_RC_PHYIF_STAT 0x00028 +#define CORE_RC_PHYIF_STAT_REFCLK BIT(4) +#define CORE_RC_CTL 0x00050 +#define CORE_RC_CTL_RUN BIT(0) +#define CORE_RC_STAT 0x00058 +#define CORE_RC_STAT_READY BIT(0) +#define CORE_FABRIC_STAT 0x04000 +#define CORE_FABRIC_STAT_MASK 0x001F001F +#define CORE_LANE_CFG(port) (0x84000 + 0x4000 * (port)) +#define CORE_LANE_CFG_REFCLK0REQ BIT(0) +#define CORE_LANE_CFG_REFCLK1 BIT(1) +#define CORE_LANE_CFG_REFCLK0ACK BIT(2) +#define CORE_LANE_CFG_REFCLKEN (BIT(9) | BIT(10)) +#define CORE_LANE_CTL(port) (0x84004 + 0x4000 * (port)) +#define CORE_LANE_CTL_CFGACC BIT(15) + +#define PORT_LTSSMCTL 0x00080 +#define PORT_LTSSMCTL_START BIT(0) +#define PORT_INTSTAT 0x00100 +#define PORT_INT_TUNNEL_ERR 31 +#define PORT_INT_CPL_TIMEOUT 23 +#define PORT_INT_RID2SID_MAPERR 22 +#define PORT_INT_CPL_ABORT 21 +#define PORT_INT_MSI_BAD_DATA 19 +#define PORT_INT_MSI_ERR 18 +#define PORT_INT_REQADDR_GT32 17 +#define PORT_INT_AF_TIMEOUT 15 +#define PORT_INT_LINK_DOWN 14 +#define PORT_INT_LINK_UP 12 +#define PORT_INT_LINK_BWMGMT 11 +#define PORT_INT_AER_MASK (15 << 4) +#define PORT_INT_PORT_ERR 4 +#define PORT_INT_INTx(i) i +#define PORT_INT_INTx_MASK 15 +#define PORT_INTMSK 0x00104 +#define PORT_INTMSKSET 0x00108 +#define PORT_INTMSKCLR 0x0010c +#define PORT_MSICFG 0x00124 +#define PORT_MSICFG_EN BIT(0) +#define PORT_MSICFG_L2MSINUM_SHIFT 4 +#define PORT_MSIBASE 0x00128 +#define PORT_MSIBASE_1_SHIFT 16 +#define PORT_MSIADDR 0x00168 +#define PORT_LINKSTS 0x00208 +#define PORT_LINKSTS_UP BIT(0) +#define PORT_LINKSTS_BUSY BIT(2) +#define PORT_LINKCMDSTS 0x00210 +#define PORT_OUTS_NPREQS 0x00284 +#define PORT_OUTS_NPREQS_REQ BIT(24) +#define PORT_OUTS_NPREQS_CPL BIT(16) +#define PORT_RXWR_FIFO 0x00288 +#define PORT_RXWR_FIFO_HDR GENMASK(15, 10) +#define PORT_RXWR_FIFO_DATA GENMASK(9, 0) +#define PORT_RXRD_FIFO 0x0028C +#define PORT_RXRD_FIFO_REQ GENMASK(6, 0) +#define PORT_OUTS_CPLS 0x00290 +#define PORT_OUTS_CPLS_SHRD GENMASK(14, 8) +#define PORT_OUTS_CPLS_WAIT GENMASK(6, 0) +#define PORT_APPCLK 0x00800 +#define PORT_APPCLK_EN BIT(0) +#define PORT_APPCLK_CGDIS BIT(8) +#define PORT_STATUS 0x00804 +#define PORT_STATUS_READY BIT(0) +#define PORT_REFCLK 0x00810 +#define PORT_REFCLK_EN BIT(0) +#define PORT_REFCLK_CGDIS BIT(8) +#define PORT_PERST 0x00814 +#define PORT_PERST_OFF BIT(0) +#define PORT_RID2SID(i16) (0x00828 + 4 * (i16)) +#define PORT_RID2SID_VALID BIT(31) +#define PORT_RID2SID_SID_SHIFT 16 +#define PORT_RID2SID_BUS_SHIFT 8 +#define PORT_RID2SID_DEV_SHIFT 3 +#define PORT_RID2SID_FUNC_SHIFT 0 +#define PORT_OUTS_PREQS_HDR 0x00980 +#define PORT_OUTS_PREQS_HDR_MASK GENMASK(9, 0) +#define PORT_OUTS_PREQS_DATA 0x00984 +#define PORT_OUTS_PREQS_DATA_MASK GENMASK(15, 0) +#define PORT_TUNCTRL 0x00988 +#define PORT_TUNCTRL_PERST_ON BIT(0) +#define PORT_TUNCTRL_PERST_ACK_REQ BIT(1) +#define PORT_TUNSTAT 0x0098c +#define 
PORT_TUNSTAT_PERST_ON BIT(0) +#define PORT_TUNSTAT_PERST_ACK_PEND BIT(1) +#define PORT_PREFMEM_ENABLE 0x00994 + +struct apple_pcie { + struct device *dev; + void __iomem *base; +}; + +struct apple_pcie_port { + struct apple_pcie *pcie; + struct device_node *np; + void __iomem *base; + int idx; +}; + +static void rmw_set(u32 set, void __iomem *addr) +{ + writel_relaxed(readl_relaxed(addr) | set, addr); +} + +static int apple_pcie_setup_port(struct apple_pcie *pcie, + struct device_node *np) +{ + struct platform_device *platform = to_platform_device(pcie->dev); + struct apple_pcie_port *port; + struct gpio_desc *reset; + u32 stat, idx; + int ret; + + reset = gpiod_get_from_of_node(np, "reset-gpios", 0, + GPIOD_OUT_LOW, "#PERST"); + if (IS_ERR(reset)) + return PTR_ERR(reset); + + port = devm_kzalloc(pcie->dev, sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + ret = of_property_read_u32_index(np, "reg", 0, &idx); + if (ret) + return ret; + + /* Use the first reg entry to work out the port index */ + port->idx = idx >> 11; + port->pcie = pcie; + port->np = np; + + port->base = devm_platform_ioremap_resource(platform, port->idx + 2); + if (IS_ERR(port->base)) + return PTR_ERR(port->base); + + rmw_set(PORT_APPCLK_EN, port->base + PORT_APPCLK); + + rmw_set(PORT_PERST_OFF, port->base + PORT_PERST); + gpiod_set_value(reset, 1); + + ret = readl_relaxed_poll_timeout(port->base + PORT_STATUS, stat, + stat & PORT_STATUS_READY, 100, 250000); + if (ret < 0) { + dev_err(pcie->dev, "port %pOF ready wait timeout\n", np); + return ret; + } + + writel_relaxed(PORT_LTSSMCTL_START, port->base + PORT_LTSSMCTL); + + return 0; +} + +static int apple_pcie_init(struct pci_config_window *cfg) +{ + struct device *dev = cfg->parent; + struct platform_device *platform = to_platform_device(dev); + struct device_node *of_port; + struct apple_pcie *pcie; + int ret; + + pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + pcie->dev = dev; + + pcie->base = devm_platform_ioremap_resource(platform, 1); + if (IS_ERR(pcie->base)) + return PTR_ERR(pcie->base); + + for_each_child_of_node(dev->of_node, of_port) { + ret = apple_pcie_setup_port(pcie, of_port); + if (ret) { + dev_err(pcie->dev, "Port %pOF setup fail: %d\n", of_port, ret); + of_node_put(of_port); + return ret; + } + } + + return 0; +} + +static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = { + .init = apple_pcie_init, + .pci_ops = { + .map_bus = pci_ecam_map_bus, + .read = pci_generic_config_read, + .write = pci_generic_config_write, + } +}; + +static const struct of_device_id apple_pcie_of_match[] = { + { .compatible = "apple,pcie", .data = &apple_pcie_cfg_ecam_ops }, + { } +}; +MODULE_DEVICE_TABLE(of, apple_pcie_of_match); + +static struct platform_driver apple_pcie_driver = { + .probe = pci_host_common_probe, + .driver = { + .name = "pcie-apple", + .of_match_table = apple_pcie_of_match, + .suppress_bind_attrs = true, + }, +}; +module_platform_driver(apple_pcie_driver); + +MODULE_LICENSE("GPL v2"); From 1512f908f3809a2e95c1cd7eca11445445b4a5b1 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 29 Sep 2021 17:38:38 +0100 Subject: [PATCH 236/512] PCI: apple: Set up reference clocks when probing Apple's PCIe controller requires clocks to be configured in order to bring up the hardware. Add the register pokes required to do so. Adapted from Corellium's driver via Mark Kettenis's U-Boot patches. 
Co-developed-by: Stan Skowronek Link: https://lore.kernel.org/r/20210929163847.2807812-6-maz@kernel.org Signed-off-by: Stan Skowronek Signed-off-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-apple.c | 46 +++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index ed78af70dbe7..66d4bc2f72e1 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -132,6 +132,48 @@ static void rmw_set(u32 set, void __iomem *addr) writel_relaxed(readl_relaxed(addr) | set, addr); } +static void rmw_clear(u32 clr, void __iomem *addr) +{ + writel_relaxed(readl_relaxed(addr) & ~clr, addr); +} + +static int apple_pcie_setup_refclk(struct apple_pcie *pcie, + struct apple_pcie_port *port) +{ + u32 stat; + int res; + + res = readl_relaxed_poll_timeout(pcie->base + CORE_RC_PHYIF_STAT, stat, + stat & CORE_RC_PHYIF_STAT_REFCLK, + 100, 50000); + if (res < 0) + return res; + + rmw_set(CORE_LANE_CTL_CFGACC, pcie->base + CORE_LANE_CTL(port->idx)); + rmw_set(CORE_LANE_CFG_REFCLK0REQ, pcie->base + CORE_LANE_CFG(port->idx)); + + res = readl_relaxed_poll_timeout(pcie->base + CORE_LANE_CFG(port->idx), + stat, stat & CORE_LANE_CFG_REFCLK0ACK, + 100, 50000); + if (res < 0) + return res; + + rmw_set(CORE_LANE_CFG_REFCLK1, pcie->base + CORE_LANE_CFG(port->idx)); + res = readl_relaxed_poll_timeout(pcie->base + CORE_LANE_CFG(port->idx), + stat, stat & CORE_LANE_CFG_REFCLK1, + 100, 50000); + + if (res < 0) + return res; + + rmw_clear(CORE_LANE_CTL_CFGACC, pcie->base + CORE_LANE_CTL(port->idx)); + + rmw_set(CORE_LANE_CFG_REFCLKEN, pcie->base + CORE_LANE_CFG(port->idx)); + rmw_set(PORT_REFCLK_EN, port->base + PORT_REFCLK); + + return 0; +} + static int apple_pcie_setup_port(struct apple_pcie *pcie, struct device_node *np) { @@ -165,6 +207,10 @@ static int apple_pcie_setup_port(struct apple_pcie *pcie, rmw_set(PORT_APPCLK_EN, port->base + PORT_APPCLK); + ret = apple_pcie_setup_refclk(pcie, port); + if (ret < 0) + return ret; + rmw_set(PORT_PERST_OFF, port->base + PORT_PERST); gpiod_set_value(reset, 1); From b22dbbb24571c052364f476381dbac110bdca4d5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:11 +0100 Subject: [PATCH 237/512] PCI: kirin: Support PERST# GPIOs for HiKey970 external PEX 8606 bridge On HiKey970, there's a PEX 8606 PCI bridge on its PHY with 6 lanes. Only 4 lanes are connected: lane 0 - connected to Kirin 970 (upstream) lane 4 - M.2 slot lane 5 - mini PCIe slot lane 6 - on-board Ethernet controller Each lane has its own PERST# GPIO pin and needs a clock request. Add support to parse a DT schema containing the above data. HiKey 970 requires a little more waiting time for the PCI bridge - which is outside the SoC - to finish the PERST# reset, and then initialize the eye diagram. Increase the waiting time for the PERST# signals accordingly. 
[bhelgaas: squash refcount fix from Wan Jiabing : https://lore.kernel.org/r/20211103062518.25695-1-wanjiabing@vivo.com and drop "parent" refcount per https://lore.kernel.org/all/20211103143059.GA683503@bhelgaas/] Link: https://lore.kernel.org/r/bb391a0e0f0863b66e645048315fab1a4f63f277.1634812676.git.mchehab+huawei@kernel.org Link: https://lore.kernel.org/all/9a365cffe5af9ec5a1f79638968c3a2efa979b65.1634622716.git.mchehab+huawei@kernel.org/ Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song Cc: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pcie-kirin.c | 265 +++++++++++++++++++++--- 1 file changed, 233 insertions(+), 32 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 86c13661e02d..889e1eef4136 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -52,6 +52,18 @@ #define PCIE_DEBOUNCE_PARAM 0xF0F400 #define PCIE_OE_BYPASS (0x3 << 28) +/* + * Max number of connected PCI slots at an external PCI bridge + * + * This is used on HiKey 970, which has a PEX 8606 bridge with 4 connected + * lanes (lane 0 upstream, and the other three lanes, one connected to an + * in-board Ethernet adapter and the other two connected to M.2 and mini + * PCI slots. + * + * Each slot has a different clock source and uses a separate PERST# pin. + */ +#define MAX_PCI_SLOTS 3 + enum pcie_kirin_phy_type { PCIE_KIRIN_INTERNAL_PHY, PCIE_KIRIN_EXTERNAL_PHY @@ -64,6 +76,19 @@ struct kirin_pcie { struct regmap *apb; struct phy *phy; void *phy_priv; /* only for PCIE_KIRIN_INTERNAL_PHY */ + + /* DWC PERST# */ + int gpio_id_dwc_perst; + + /* Per-slot PERST# */ + int num_slots; + int gpio_id_reset[MAX_PCI_SLOTS]; + const char *reset_names[MAX_PCI_SLOTS]; + + /* Per-slot clkreq */ + int n_gpio_clkreq; + int gpio_id_clkreq[MAX_PCI_SLOTS]; + const char *clkreq_names[MAX_PCI_SLOTS]; }; /* @@ -87,7 +112,7 @@ struct kirin_pcie { #define CRGCTRL_PCIE_ASSERT_BIT 0x8c000000 /* Time for delay */ -#define REF_2_PERST_MIN 20000 +#define REF_2_PERST_MIN 21000 #define REF_2_PERST_MAX 25000 #define PERST_2_ACCESS_MIN 10000 #define PERST_2_ACCESS_MAX 12000 @@ -108,7 +133,6 @@ struct hi3660_pcie_phy { struct clk *phy_ref_clk; struct clk *aclk; struct clk *aux_clk; - int gpio_id_reset; }; /* Registers in PCIePHY */ @@ -171,16 +195,6 @@ static int hi3660_pcie_phy_get_resource(struct hi3660_pcie_phy *phy) if (IS_ERR(phy->sysctrl)) return PTR_ERR(phy->sysctrl); - /* gpios */ - phy->gpio_id_reset = of_get_named_gpio(dev->of_node, - "reset-gpios", 0); - if (phy->gpio_id_reset == -EPROBE_DEFER) { - return -EPROBE_DEFER; - } else if (!gpio_is_valid(phy->gpio_id_reset)) { - dev_err(phy->dev, "unable to get a valid gpio pin\n"); - return -ENODEV; - } - return 0; } @@ -297,15 +311,7 @@ static int hi3660_pcie_phy_power_on(struct kirin_pcie *pcie) if (ret) goto disable_clks; - /* perst assert Endpoint */ - if (!gpio_request(phy->gpio_id_reset, "pcie_perst")) { - usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX); - ret = gpio_direction_output(phy->gpio_id_reset, 1); - if (ret) - goto disable_clks; - usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); - return 0; - } + return 0; disable_clks: hi3660_pcie_phy_clk_ctrl(phy, false); @@ -347,11 +353,100 @@ static const struct regmap_config pcie_kirin_regmap_conf = { .reg_stride = 4, }; +static int kirin_pcie_get_gpio_enable(struct kirin_pcie *pcie, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct 
device_node *np = dev->of_node; + char name[32]; + int ret, i; + + /* This is an optional property */ + ret = of_gpio_named_count(np, "hisilicon,clken-gpios"); + if (ret < 0) + return 0; + + if (ret > MAX_PCI_SLOTS) { + dev_err(dev, "Too many GPIO clock requests!\n"); + return -EINVAL; + } + + pcie->n_gpio_clkreq = ret; + + for (i = 0; i < pcie->n_gpio_clkreq; i++) { + pcie->gpio_id_clkreq[i] = of_get_named_gpio(dev->of_node, + "hisilicon,clken-gpios", i); + if (pcie->gpio_id_clkreq[i] < 0) + return pcie->gpio_id_clkreq[i]; + + sprintf(name, "pcie_clkreq_%d", i); + pcie->clkreq_names[i] = devm_kstrdup_const(dev, name, + GFP_KERNEL); + if (!pcie->clkreq_names[i]) + return -ENOMEM; + } + + return 0; +} + +static int kirin_pcie_parse_port(struct kirin_pcie *pcie, + struct platform_device *pdev, + struct device_node *node) +{ + struct device *dev = &pdev->dev; + struct device_node *parent, *child; + int ret, slot, i; + char name[32]; + + for_each_available_child_of_node(node, parent) { + for_each_available_child_of_node(parent, child) { + i = pcie->num_slots; + + pcie->gpio_id_reset[i] = of_get_named_gpio(child, + "reset-gpios", 0); + if (pcie->gpio_id_reset[i] < 0) + continue; + + pcie->num_slots++; + if (pcie->num_slots > MAX_PCI_SLOTS) { + dev_err(dev, "Too many PCI slots!\n"); + ret = -EINVAL; + goto put_node; + } + + ret = of_pci_get_devfn(child); + if (ret < 0) { + dev_err(dev, "failed to parse devfn: %d\n", ret); + goto put_node; + } + + slot = PCI_SLOT(ret); + + sprintf(name, "pcie_perst_%d", slot); + pcie->reset_names[i] = devm_kstrdup_const(dev, name, + GFP_KERNEL); + if (!pcie->reset_names[i]) { + ret = -ENOMEM; + goto put_node; + } + } + } + + return 0; + +put_node: + of_node_put(child); + of_node_put(parent); + return ret; +} + static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct device_node *child, *node = dev->of_node; void __iomem *apb_base; + int ret; apb_base = devm_platform_ioremap_resource_byname(pdev, "apb"); if (IS_ERR(apb_base)) @@ -362,7 +457,32 @@ static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, if (IS_ERR(kirin_pcie->apb)) return PTR_ERR(kirin_pcie->apb); + /* pcie internal PERST# gpio */ + kirin_pcie->gpio_id_dwc_perst = of_get_named_gpio(dev->of_node, + "reset-gpios", 0); + if (kirin_pcie->gpio_id_dwc_perst == -EPROBE_DEFER) { + return -EPROBE_DEFER; + } else if (!gpio_is_valid(kirin_pcie->gpio_id_dwc_perst)) { + dev_err(dev, "unable to get a valid gpio pin\n"); + return -ENODEV; + } + + ret = kirin_pcie_get_gpio_enable(kirin_pcie, pdev); + if (ret) + return ret; + + /* Parse OF children */ + for_each_available_child_of_node(node, child) { + ret = kirin_pcie_parse_port(kirin_pcie, pdev, child); + if (ret) + goto put_node; + } + return 0; + +put_node: + of_node_put(child); + return ret; } static void kirin_pcie_sideband_dbi_w_mode(struct kirin_pcie *kirin_pcie, @@ -419,9 +539,33 @@ static int kirin_pcie_wr_own_conf(struct pci_bus *bus, unsigned int devfn, return PCIBIOS_SUCCESSFUL; } +static int kirin_pcie_add_bus(struct pci_bus *bus) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(bus->sysdata); + struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci); + int i, ret; + + if (!kirin_pcie->num_slots) + return 0; + + /* Send PERST# to each slot */ + for (i = 0; i < kirin_pcie->num_slots; i++) { + ret = gpio_direction_output(kirin_pcie->gpio_id_reset[i], 1); + if (ret) { + dev_err(pci->dev, "PERST# %s error: %d\n", + kirin_pcie->reset_names[i], ret); + } + } + 
usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); + + return 0; +} + + static struct pci_ops kirin_pci_ops = { .read = kirin_pcie_rd_own_conf, .write = kirin_pcie_wr_own_conf, + .add_bus = kirin_pcie_add_bus, }; static u32 kirin_pcie_read_dbi(struct dw_pcie *pci, void __iomem *base, @@ -477,6 +621,44 @@ static int kirin_pcie_host_init(struct pcie_port *pp) return 0; } +static int kirin_pcie_gpio_request(struct kirin_pcie *kirin_pcie, + struct device *dev) +{ + int ret, i; + + for (i = 0; i < kirin_pcie->num_slots; i++) { + if (!gpio_is_valid(kirin_pcie->gpio_id_reset[i])) { + dev_err(dev, "unable to get a valid %s gpio\n", + kirin_pcie->reset_names[i]); + return -ENODEV; + } + + ret = devm_gpio_request(dev, kirin_pcie->gpio_id_reset[i], + kirin_pcie->reset_names[i]); + if (ret) + return ret; + } + + for (i = 0; i < kirin_pcie->n_gpio_clkreq; i++) { + if (!gpio_is_valid(kirin_pcie->gpio_id_clkreq[i])) { + dev_err(dev, "unable to get a valid %s gpio\n", + kirin_pcie->clkreq_names[i]); + return -ENODEV; + } + + ret = devm_gpio_request(dev, kirin_pcie->gpio_id_clkreq[i], + kirin_pcie->clkreq_names[i]); + if (ret) + return ret; + + ret = gpio_direction_output(kirin_pcie->gpio_id_clkreq[i], 0); + if (ret) + return ret; + } + + return 0; +} + static const struct dw_pcie_ops kirin_dw_pcie_ops = { .read_dbi = kirin_pcie_read_dbi, .write_dbi = kirin_pcie_write_dbi, @@ -499,24 +681,43 @@ static int kirin_pcie_power_on(struct platform_device *pdev, if (ret) return ret; - return hi3660_pcie_phy_power_on(kirin_pcie); + ret = hi3660_pcie_phy_power_on(kirin_pcie); + if (ret) + return ret; + } else { + kirin_pcie->phy = devm_of_phy_get(dev, dev->of_node, NULL); + if (IS_ERR(kirin_pcie->phy)) + return PTR_ERR(kirin_pcie->phy); + + ret = kirin_pcie_gpio_request(kirin_pcie, dev); + if (ret) + return ret; + + ret = phy_init(kirin_pcie->phy); + if (ret) + goto err; + + ret = phy_power_on(kirin_pcie->phy); + if (ret) + goto err; } - kirin_pcie->phy = devm_of_phy_get(dev, dev->of_node, NULL); - if (IS_ERR(kirin_pcie->phy)) - return PTR_ERR(kirin_pcie->phy); + /* perst assert Endpoint */ + usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX); - ret = phy_init(kirin_pcie->phy); - if (ret) - goto err; + if (!gpio_request(kirin_pcie->gpio_id_dwc_perst, "pcie_perst_bridge")) { + ret = gpio_direction_output(kirin_pcie->gpio_id_dwc_perst, 1); + if (ret) + goto err; + } - ret = phy_power_on(kirin_pcie->phy); - if (ret) - goto err; + usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); return 0; err: - phy_exit(kirin_pcie->phy); + if (kirin_pcie->type != PCIE_KIRIN_INTERNAL_PHY) + phy_exit(kirin_pcie->phy); + return ret; } From e636c1690941a396aa7643ae2c4397cebaa2851e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:13 +0100 Subject: [PATCH 238/512] PCI: kirin: Add Kirin 970 compatible Now that everything is in place, add a compatible for Kirin 970. 
Link: https://lore.kernel.org/r/ac8c730c0300b90d96bdaaf387d458d8949241a9.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 889e1eef4136..0e3c70f54e94 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -739,6 +739,10 @@ static const struct of_device_id kirin_pcie_match[] = { .compatible = "hisilicon,kirin960-pcie", .data = (void *)PCIE_KIRIN_INTERNAL_PHY }, + { + .compatible = "hisilicon,kirin970-pcie", + .data = (void *)PCIE_KIRIN_EXTERNAL_PHY + }, {}, }; From a4099c59a4b8f8521def15e091b3167dfae9ea16 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:14 +0100 Subject: [PATCH 239/512] PCI: kirin: Add MODULE_* macros This driver misses the MODULE_* macros. Add them. Link: https://lore.kernel.org/r/f7a951d0c2009f5765214fc2e83e24cf41585023.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 0e3c70f54e94..66e3aefcb817 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -805,3 +805,8 @@ static struct platform_driver kirin_pcie_driver = { }, }; builtin_platform_driver(kirin_pcie_driver); + +MODULE_DEVICE_TABLE(of, kirin_pcie_match); +MODULE_DESCRIPTION("PCIe host controller driver for Kirin Phone SoCs"); +MODULE_AUTHOR("Xiaowei Song "); +MODULE_LICENSE("GPL v2"); From aed9d9e44926f63344c069b79890738c542bc513 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:15 +0100 Subject: [PATCH 240/512] PCI: kirin: Allow building it as a module There's nothing preventing this driver from being loaded as a module. Change its config from bool to tristate. Link: https://lore.kernel.org/r/b5e7cfe9df09b492750bd6db0f0c911eaae8c2d4.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 76c0a63a3f64..a07c08d714c9 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -266,7 +266,7 @@ config PCIE_KEEMBAY_EP config PCIE_KIRIN depends on OF && (ARM64 || COMPILE_TEST) - bool "HiSilicon Kirin series SoCs PCIe controllers" + tristate "HiSilicon Kirin series SoCs PCIe controllers" depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST help From 76afbdc76b801f459452fa71d3ea00b7f8afd0fc Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:16 +0100 Subject: [PATCH 241/512] PCI: kirin: Add power_off support for Kirin 960 PHY In order to prepare for module unload, add a power_off method for HiKey 960. 
Link: https://lore.kernel.org/r/b095818b0d7fadae4cae200f481caf7a66e61fb4.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 66e3aefcb817..6c9fb3f219ba 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -342,6 +342,18 @@ static int hi3660_pcie_phy_init(struct platform_device *pdev, return hi3660_pcie_phy_get_resource(phy); } +static int hi3660_pcie_phy_power_off(struct kirin_pcie *pcie) +{ + struct hi3660_pcie_phy *phy = pcie->phy_priv; + + /* Drop power supply for Host */ + regmap_write(phy->sysctrl, SCTRL_PCIE_CMOS_OFFSET, 0x00); + + hi3660_pcie_phy_clk_ctrl(phy, false); + + return 0; +} + /* * The non-PHY part starts here */ @@ -561,7 +573,6 @@ static int kirin_pcie_add_bus(struct pci_bus *bus) return 0; } - static struct pci_ops kirin_pci_ops = { .read = kirin_pcie_rd_own_conf, .write = kirin_pcie_wr_own_conf, @@ -715,8 +726,12 @@ static int kirin_pcie_power_on(struct platform_device *pdev, return 0; err: - if (kirin_pcie->type != PCIE_KIRIN_INTERNAL_PHY) + if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) { + hi3660_pcie_phy_power_off(kirin_pcie); + } else { + phy_power_off(kirin_pcie->phy); phy_exit(kirin_pcie->phy); + } return ret; } @@ -726,7 +741,7 @@ static int __exit kirin_pcie_remove(struct platform_device *pdev) struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev); if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) - return 0; + return hi3660_pcie_phy_power_off(kirin_pcie); phy_power_off(kirin_pcie->phy); phy_exit(kirin_pcie->phy); From 79cf014bf3b0a72d1d4b4c0e24c325c386fa5479 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:17 +0100 Subject: [PATCH 242/512] PCI: kirin: Move the power-off code to a common routine Instead of having two copies of the same logic, place the power-off logic in a separate function. No functional changes. 
Link: https://lore.kernel.org/r/64f6e8da3e5fff38b6c8fcb208ace46efe6555bb.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 26 ++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 6c9fb3f219ba..5ebdf1b4b8d5 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -681,6 +681,19 @@ static const struct dw_pcie_host_ops kirin_pcie_host_ops = { .host_init = kirin_pcie_host_init, }; +static int kirin_pcie_power_off(struct kirin_pcie *kirin_pcie) +{ + int i; + + if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) + return hi3660_pcie_phy_power_off(kirin_pcie); + + phy_power_off(kirin_pcie->phy); + phy_exit(kirin_pcie->phy); + + return 0; +} + static int kirin_pcie_power_on(struct platform_device *pdev, struct kirin_pcie *kirin_pcie) { @@ -726,12 +739,7 @@ static int kirin_pcie_power_on(struct platform_device *pdev, return 0; err: - if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) { - hi3660_pcie_phy_power_off(kirin_pcie); - } else { - phy_power_off(kirin_pcie->phy); - phy_exit(kirin_pcie->phy); - } + kirin_pcie_power_off(kirin_pcie); return ret; } @@ -740,11 +748,7 @@ static int __exit kirin_pcie_remove(struct platform_device *pdev) { struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev); - if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) - return hi3660_pcie_phy_power_off(kirin_pcie); - - phy_power_off(kirin_pcie->phy); - phy_exit(kirin_pcie->phy); + kirin_pcie_power_off(kirin_pcie); return 0; } From 5b1e8c00afc32df8b4cf39a5c6cc7378a924abb7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:18 +0100 Subject: [PATCH 243/512] PCI: kirin: Disable clkreq during poweroff sequence The logic at kirin_pcie_gpio_request() enables some clkreq GPIO lines. Disable them during power-off. Link: https://lore.kernel.org/r/f403e590843de1a581cade2d534d34715706f54e.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 5ebdf1b4b8d5..1878c91a33a3 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -688,6 +688,9 @@ static int kirin_pcie_power_off(struct kirin_pcie *kirin_pcie) if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) return hi3660_pcie_phy_power_off(kirin_pcie); + for (i = 0; i < kirin_pcie->n_gpio_clkreq; i++) + gpio_direction_output(kirin_pcie->gpio_id_clkreq[i], 1); + phy_power_off(kirin_pcie->phy); phy_exit(kirin_pcie->phy); From dc47d2f4c0549982a81d25a8ec6a9dbfd2211122 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:19 +0100 Subject: [PATCH 244/512] PCI: kirin: De-init the dwc driver The logic under .remove ops is missing a call to dw_pcie_host_deinit(). Add it, in order to allow the DWC core to be properly cleaned up. 
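For reference, combined with the power-off consolidation earlier in this series, the resulting remove path is simply the following (sketch assembled from the hunks; the host must be de-initialized before power is dropped):

    static int __exit kirin_pcie_remove(struct platform_device *pdev)
    {
            struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev);

            /* Undo dw_pcie_host_init(): stop/remove the root bus, free MSI state */
            dw_pcie_host_deinit(&kirin_pcie->pci->pp);

            /* Only then drop PHY power, clocks and clkreq GPIOs */
            kirin_pcie_power_off(kirin_pcie);

            return 0;
    }
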
Link: https://lore.kernel.org/r/838621e1c84ebaac153ccd9c36ea5e1254c61ead.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 1878c91a33a3..24c596d8f27f 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -751,6 +751,8 @@ static int __exit kirin_pcie_remove(struct platform_device *pdev) { struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev); + dw_pcie_host_deinit(&kirin_pcie->pci->pp); + kirin_pcie_power_off(kirin_pcie); return 0; From e4c72797fd1609c630ead5bd86d5df16bb0ed5e9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 21 Oct 2021 11:45:20 +0100 Subject: [PATCH 245/512] PCI: kirin: Allow removing the driver Now that everything is in place at the poweroff sequence, this driver can use module_platform_driver(), which allows it to be removed. Link: https://lore.kernel.org/r/53b40494252444a9b830827922c4e3a301b8f863.1634812676.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Xiaowei Song --- drivers/pci/controller/dwc/pcie-kirin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 24c596d8f27f..095afbccf9c1 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -828,7 +828,7 @@ static struct platform_driver kirin_pcie_driver = { .suppress_bind_attrs = true, }, }; -builtin_platform_driver(kirin_pcie_driver); +module_platform_driver(kirin_pcie_driver); MODULE_DEVICE_TABLE(of, kirin_pcie_match); MODULE_DESCRIPTION("PCIe host controller driver for Kirin Phone SoCs"); From d8fcbe52d7d382106ab1dfa89c4b6a4952524125 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:39 +0100 Subject: [PATCH 246/512] PCI: apple: Add INTx and per-port interrupt support Add support for the per-port interrupt controller that deals with both INTx signalling and management interrupts. This allows the Link-up/Link-down interrupts to be wired, allowing the bring-up to be synchronised (and provide debug information). The framework can further be used to handle the rest of the per port events if and when necessary. Likewise, INTx signalling is implemented so that end-points can actually be used. 
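The synchronised bring-up amounts to a completion that the Link-up handler fires. Condensed into one illustrative helper (the driver does this inline in apple_pcie_setup_port(); the helper name below is only for exposition):

    static void apple_pcie_wait_for_link(struct apple_pcie *pcie,
                                         struct apple_pcie_port *port,
                                         struct device_node *np)
    {
            /* apple_pcie_port_irq() calls complete_all() on PORT_INT_LINK_UP */
            init_completion(&pcie->event);

            /* Start the LTSSM only once the Link-up interrupt is wired up */
            writel_relaxed(PORT_LTSSMCTL_START, port->base + PORT_LTSSMCTL);

            if (!wait_for_completion_timeout(&pcie->event, HZ / 10))
                    dev_warn(pcie->dev, "%pOF link didn't come up\n", np);
    }
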
Link: https://lore.kernel.org/r/20210929163847.2807812-7-maz@kernel.org Link: https://lore.kernel.org/r/20211004150552.3844830-1-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-apple.c | 210 ++++++++++++++++++++++++++++ kernel/irq/irqdomain.c | 1 + 2 files changed, 211 insertions(+) diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index 66d4bc2f72e1..8cd2f0903c1e 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -118,12 +119,14 @@ struct apple_pcie { struct device *dev; void __iomem *base; + struct completion event; }; struct apple_pcie_port { struct apple_pcie *pcie; struct device_node *np; void __iomem *base; + struct irq_domain *domain; int idx; }; @@ -137,6 +140,201 @@ static void rmw_clear(u32 clr, void __iomem *addr) writel_relaxed(readl_relaxed(addr) & ~clr, addr); } +static void apple_port_irq_mask(struct irq_data *data) +{ + struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); + + writel_relaxed(BIT(data->hwirq), port->base + PORT_INTMSKSET); +} + +static void apple_port_irq_unmask(struct irq_data *data) +{ + struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); + + writel_relaxed(BIT(data->hwirq), port->base + PORT_INTMSKCLR); +} + +static bool hwirq_is_intx(unsigned int hwirq) +{ + return BIT(hwirq) & PORT_INT_INTx_MASK; +} + +static void apple_port_irq_ack(struct irq_data *data) +{ + struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); + + if (!hwirq_is_intx(data->hwirq)) + writel_relaxed(BIT(data->hwirq), port->base + PORT_INTSTAT); +} + +static int apple_port_irq_set_type(struct irq_data *data, unsigned int type) +{ + /* + * It doesn't seem that there is any way to configure the + * trigger, so assume INTx have to be level (as per the spec), + * and the rest is edge (which looks likely). 
+ */ + if (hwirq_is_intx(data->hwirq) ^ !!(type & IRQ_TYPE_LEVEL_MASK)) + return -EINVAL; + + irqd_set_trigger_type(data, type); + return 0; +} + +static struct irq_chip apple_port_irqchip = { + .name = "PCIe", + .irq_ack = apple_port_irq_ack, + .irq_mask = apple_port_irq_mask, + .irq_unmask = apple_port_irq_unmask, + .irq_set_type = apple_port_irq_set_type, +}; + +static int apple_port_irq_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *args) +{ + struct apple_pcie_port *port = domain->host_data; + struct irq_fwspec *fwspec = args; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_flow_handler_t flow = handle_edge_irq; + unsigned int type = IRQ_TYPE_EDGE_RISING; + + if (hwirq_is_intx(fwspec->param[0] + i)) { + flow = handle_level_irq; + type = IRQ_TYPE_LEVEL_HIGH; + } + + irq_domain_set_info(domain, virq + i, fwspec->param[0] + i, + &apple_port_irqchip, port, flow, + NULL, NULL); + + irq_set_irq_type(virq + i, type); + } + + return 0; +} + +static void apple_port_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + struct irq_data *d = irq_domain_get_irq_data(domain, virq + i); + + irq_set_handler(virq + i, NULL); + irq_domain_reset_irq_data(d); + } +} + +static const struct irq_domain_ops apple_port_irq_domain_ops = { + .translate = irq_domain_translate_onecell, + .alloc = apple_port_irq_domain_alloc, + .free = apple_port_irq_domain_free, +}; + +static void apple_port_irq_handler(struct irq_desc *desc) +{ + struct apple_pcie_port *port = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long stat; + int i; + + chained_irq_enter(chip, desc); + + stat = readl_relaxed(port->base + PORT_INTSTAT); + + for_each_set_bit(i, &stat, 32) + generic_handle_domain_irq(port->domain, i); + + chained_irq_exit(chip, desc); +} + +static int apple_pcie_port_setup_irq(struct apple_pcie_port *port) +{ + struct fwnode_handle *fwnode = &port->np->fwnode; + unsigned int irq; + + /* FIXME: consider moving each interrupt under each port */ + irq = irq_of_parse_and_map(to_of_node(dev_fwnode(port->pcie->dev)), + port->idx); + if (!irq) + return -ENXIO; + + port->domain = irq_domain_create_linear(fwnode, 32, + &apple_port_irq_domain_ops, + port); + if (!port->domain) + return -ENOMEM; + + /* Disable all interrupts */ + writel_relaxed(~0, port->base + PORT_INTMSKSET); + writel_relaxed(~0, port->base + PORT_INTSTAT); + + irq_set_chained_handler_and_data(irq, apple_port_irq_handler, port); + + return 0; +} + +static irqreturn_t apple_pcie_port_irq(int irq, void *data) +{ + struct apple_pcie_port *port = data; + unsigned int hwirq = irq_domain_get_irq_data(port->domain, irq)->hwirq; + + switch (hwirq) { + case PORT_INT_LINK_UP: + dev_info_ratelimited(port->pcie->dev, "Link up on %pOF\n", + port->np); + complete_all(&port->pcie->event); + break; + case PORT_INT_LINK_DOWN: + dev_info_ratelimited(port->pcie->dev, "Link down on %pOF\n", + port->np); + break; + default: + return IRQ_NONE; + } + + return IRQ_HANDLED; +} + +static int apple_pcie_port_register_irqs(struct apple_pcie_port *port) +{ + static struct { + unsigned int hwirq; + const char *name; + } port_irqs[] = { + { PORT_INT_LINK_UP, "Link up", }, + { PORT_INT_LINK_DOWN, "Link down", }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(port_irqs); i++) { + struct irq_fwspec fwspec = { + .fwnode = &port->np->fwnode, + .param_count = 1, + .param = { + [0] = port_irqs[i].hwirq, + }, + }; + unsigned int 
irq; + int ret; + + irq = irq_domain_alloc_irqs(port->domain, 1, NUMA_NO_NODE, + &fwspec); + if (WARN_ON(!irq)) + continue; + + ret = request_irq(irq, apple_pcie_port_irq, 0, + port_irqs[i].name, port); + WARN_ON(ret); + } + + return 0; +} + static int apple_pcie_setup_refclk(struct apple_pcie *pcie, struct apple_pcie_port *port) { @@ -221,8 +419,20 @@ static int apple_pcie_setup_port(struct apple_pcie *pcie, return ret; } + ret = apple_pcie_port_setup_irq(port); + if (ret) + return ret; + + init_completion(&pcie->event); + + ret = apple_pcie_port_register_irqs(port); + WARN_ON(ret); + writel_relaxed(PORT_LTSSMCTL_START, port->base + PORT_LTSSMCTL); + if (!wait_for_completion_timeout(&pcie->event, HZ / 10)) + dev_warn(pcie->dev, "%pOF link didn't come up\n", np); + return 0; } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a698c1f6cc6..40e85a46f913 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1502,6 +1502,7 @@ out_free_desc: irq_free_descs(virq, nr_irqs); return ret; } +EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) From 476c41ed4597512f9da334be8ddf2fb2262d3057 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:40 +0100 Subject: [PATCH 247/512] PCI: apple: Implement MSI support Probe for the 'msi-ranges' property, and implement the MSI support in the form of the usual two-level hierarchy. Note that contrary to the wired interrupts, MSIs are shared among all the ports. Link: https://lore.kernel.org/r/20210929163847.2807812-8-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-apple.c | 169 +++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index 8cd2f0903c1e..8f413509e37d 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -116,10 +116,22 @@ #define PORT_TUNSTAT_PERST_ACK_PEND BIT(1) #define PORT_PREFMEM_ENABLE 0x00994 +/* + * The doorbell address is set to 0xfffff000, which by convention + * matches what MacOS does, and it is possible to use any other + * address (in the bottom 4GB, as the base register is only 32bit). 
+ */ +#define DOORBELL_ADDR 0xfffff000 + struct apple_pcie { + struct mutex lock; struct device *dev; void __iomem *base; + struct irq_domain *domain; + unsigned long *bitmap; struct completion event; + struct irq_fwspec fwspec; + u32 nvecs; }; struct apple_pcie_port { @@ -140,6 +152,101 @@ static void rmw_clear(u32 clr, void __iomem *addr) writel_relaxed(readl_relaxed(addr) & ~clr, addr); } +static void apple_msi_top_irq_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void apple_msi_top_irq_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip apple_msi_top_chip = { + .name = "PCIe MSI", + .irq_mask = apple_msi_top_irq_mask, + .irq_unmask = apple_msi_top_irq_unmask, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_set_type = irq_chip_set_type_parent, +}; + +static void apple_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + msg->address_hi = upper_32_bits(DOORBELL_ADDR); + msg->address_lo = lower_32_bits(DOORBELL_ADDR); + msg->data = data->hwirq; +} + +static struct irq_chip apple_msi_bottom_chip = { + .name = "MSI", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_set_type = irq_chip_set_type_parent, + .irq_compose_msi_msg = apple_msi_compose_msg, +}; + +static int apple_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct apple_pcie *pcie = domain->host_data; + struct irq_fwspec fwspec = pcie->fwspec; + unsigned int i; + int ret, hwirq; + + mutex_lock(&pcie->lock); + + hwirq = bitmap_find_free_region(pcie->bitmap, pcie->nvecs, + order_base_2(nr_irqs)); + + mutex_unlock(&pcie->lock); + + if (hwirq < 0) + return -ENOSPC; + + fwspec.param[1] += hwirq; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, &fwspec); + if (ret) + return ret; + + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &apple_msi_bottom_chip, + domain->host_data); + } + + return 0; +} + +static void apple_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct apple_pcie *pcie = domain->host_data; + + mutex_lock(&pcie->lock); + + bitmap_release_region(pcie->bitmap, d->hwirq, order_base_2(nr_irqs)); + + mutex_unlock(&pcie->lock); +} + +static const struct irq_domain_ops apple_msi_domain_ops = { + .alloc = apple_msi_domain_alloc, + .free = apple_msi_domain_free, +}; + +static struct msi_domain_info apple_msi_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .chip = &apple_msi_top_chip, +}; + static void apple_port_irq_mask(struct irq_data *data) { struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); @@ -275,6 +382,15 @@ static int apple_pcie_port_setup_irq(struct apple_pcie_port *port) irq_set_chained_handler_and_data(irq, apple_port_irq_handler, port); + /* Configure MSI base address */ + BUILD_BUG_ON(upper_32_bits(DOORBELL_ADDR)); + writel_relaxed(lower_32_bits(DOORBELL_ADDR), port->base + PORT_MSIADDR); + + /* Enable MSIs, shared between all ports */ + writel_relaxed(0, port->base + PORT_MSIBASE); + writel_relaxed((ilog2(port->pcie->nvecs) << PORT_MSICFG_L2MSINUM_SHIFT) | + PORT_MSICFG_EN, port->base + PORT_MSICFG); + return 0; } @@ -436,6 
+552,55 @@ static int apple_pcie_setup_port(struct apple_pcie *pcie, return 0; } +static int apple_msi_init(struct apple_pcie *pcie) +{ + struct fwnode_handle *fwnode = dev_fwnode(pcie->dev); + struct of_phandle_args args = {}; + struct irq_domain *parent; + int ret; + + ret = of_parse_phandle_with_args(to_of_node(fwnode), "msi-ranges", + "#interrupt-cells", 0, &args); + if (ret) + return ret; + + ret = of_property_read_u32_index(to_of_node(fwnode), "msi-ranges", + args.args_count + 1, &pcie->nvecs); + if (ret) + return ret; + + of_phandle_args_to_fwspec(args.np, args.args, args.args_count, + &pcie->fwspec); + + pcie->bitmap = devm_bitmap_zalloc(pcie->dev, pcie->nvecs, GFP_KERNEL); + if (!pcie->bitmap) + return -ENOMEM; + + parent = irq_find_matching_fwspec(&pcie->fwspec, DOMAIN_BUS_WIRED); + if (!parent) { + dev_err(pcie->dev, "failed to find parent domain\n"); + return -ENXIO; + } + + parent = irq_domain_create_hierarchy(parent, 0, pcie->nvecs, fwnode, + &apple_msi_domain_ops, pcie); + if (!parent) { + dev_err(pcie->dev, "failed to create IRQ domain\n"); + return -ENOMEM; + } + irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS); + + pcie->domain = pci_msi_create_irq_domain(fwnode, &apple_msi_info, + parent); + if (!pcie->domain) { + dev_err(pcie->dev, "failed to create MSI domain\n"); + irq_domain_remove(parent); + return -ENOMEM; + } + + return 0; +} + static int apple_pcie_init(struct pci_config_window *cfg) { struct device *dev = cfg->parent; @@ -450,6 +615,8 @@ static int apple_pcie_init(struct pci_config_window *cfg) pcie->dev = dev; + mutex_init(&pcie->lock); + pcie->base = devm_platform_ioremap_resource(platform, 1); if (IS_ERR(pcie->base)) return PTR_ERR(pcie->base); @@ -463,7 +630,7 @@ static int apple_pcie_init(struct pci_config_window *cfg) } } - return 0; + return apple_msi_init(pcie); } static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = { From 946d619fa25f55483206befb035fed6a99fbe7b5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:41 +0100 Subject: [PATCH 248/512] iommu/dart: Exclude MSI doorbell from PCIe device IOVA range The MSI doorbell on Apple HW can be any address in the low 4GB range. However, the MSI write is matched by the PCIe block before hitting the iommu. It must thus be excluded from the IOVA range that is assigned to any PCIe device. 
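The exclusion is expressed as a software-reserved MSI region covering the doorbell page, so the IOVA allocator never hands that address out for DMA. A minimal sketch of the idea (the function name is illustrative; the real hook is apple_dart_get_resv_regions() in the hunk below):

    static void dart_reserve_msi_doorbell(struct device *dev,
                                          struct list_head *head)
    {
            int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
            struct iommu_resv_region *region;

            /* Carve the doorbell page out of the usable IOVA space */
            region = iommu_alloc_resv_region(DOORBELL_ADDR, PAGE_SIZE, prot,
                                             IOMMU_RESV_MSI);
            if (!region)
                    return;

            list_add_tail(&region->list, head);
    }
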
Link: https://lore.kernel.org/r/20210929163847.2807812-9-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Sven Peter --- drivers/iommu/apple-dart.c | 27 +++++++++++++++++++++++++++ drivers/pci/controller/Kconfig | 5 +++++ drivers/pci/controller/pcie-apple.c | 4 +++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 559db9259e65..f1f1f024c604 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -721,6 +721,31 @@ static int apple_dart_def_domain_type(struct device *dev) return 0; } +#ifndef CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR +/* Keep things compiling when CONFIG_PCI_APPLE isn't selected */ +#define CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR 0 +#endif +#define DOORBELL_ADDR (CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR & PAGE_MASK) + +static void apple_dart_get_resv_regions(struct device *dev, + struct list_head *head) +{ + if (IS_ENABLED(CONFIG_PCIE_APPLE) && dev_is_pci(dev)) { + struct iommu_resv_region *region; + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; + + region = iommu_alloc_resv_region(DOORBELL_ADDR, + PAGE_SIZE, prot, + IOMMU_RESV_MSI); + if (!region) + return; + + list_add_tail(®ion->list, head); + } + + iommu_dma_get_resv_regions(dev, head); +} + static const struct iommu_ops apple_dart_iommu_ops = { .domain_alloc = apple_dart_domain_alloc, .domain_free = apple_dart_domain_free, @@ -737,6 +762,8 @@ static const struct iommu_ops apple_dart_iommu_ops = { .device_group = apple_dart_device_group, .of_xlate = apple_dart_of_xlate, .def_domain_type = apple_dart_def_domain_type, + .get_resv_regions = apple_dart_get_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .pgsize_bitmap = -1UL, /* Restricted during dart probe */ }; diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 953d9acc031f..5661d4a84832 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -312,6 +312,11 @@ config PCIE_HISI_ERR Say Y here if you want error handling support for the PCIe controller's errors on HiSilicon HIP SoCs +config PCIE_APPLE_MSI_DOORBELL_ADDR + hex + default 0xfffff000 + depends on PCIE_APPLE + config PCIE_APPLE tristate "Apple PCIe controller" depends on ARCH_APPLE || COMPILE_TEST diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index 8f413509e37d..3646638f3df0 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -120,8 +120,10 @@ * The doorbell address is set to 0xfffff000, which by convention * matches what MacOS does, and it is possible to use any other * address (in the bottom 4GB, as the base register is only 32bit). + * However, it has to be excluded from the IOVA range, and the DART + * driver has to know about it. */ -#define DOORBELL_ADDR 0xfffff000 +#define DOORBELL_ADDR CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR struct apple_pcie { struct mutex lock; From 468c8d52c33271d21aac070ebef9283f302094cc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:42 +0100 Subject: [PATCH 249/512] PCI: apple: Configure RID to SID mapper on device addition The Apple PCIe controller doesn't directly feed the endpoint's Requester ID to the IOMMU (DART), but instead maps RIDs onto Stream IDs (SIDs). The DART and the PCIe controller must thus agree on the SIDs that are used for translation (by using the 'iommu-map' property). 
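In code, that agreement boils down to running the Requester ID through the generic 'iommu-map' lookup and programming the result into one of the port's RID2SID slots. An illustrative sketch (the helper name is made up; the real logic lives in apple_pcie_add_device() below):

    static int rid2sid_map_one(struct apple_pcie_port *port,
                               struct pci_dev *pdev, int idx)
    {
            u32 sid, rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
            int err;

            /* Translate RID to SID exactly as the DART's 'iommu-map' does */
            err = of_map_id(port->pcie->dev->of_node, rid, "iommu-map",
                            "iommu-map-mask", NULL, &sid);
            if (err)
                    return err;

            /* Tell the root port to emit that SID for this Requester ID */
            apple_pcie_rid2sid_write(port, idx,
                                     PORT_RID2SID_VALID |
                                     (sid << PORT_RID2SID_SID_SHIFT) | rid);
            return 0;
    }
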
For this purpose, parse the 'iommu-map' property each time a device gets added, and use the resulting translation to configure the PCIe RID-to-SID mapper. Similarly, remove the translation if/when the device gets removed. This is all driven from a bus notifier which gets registered at probe time. Hopefully this is the only PCI controller driver in the whole system. [bhelgaas: squash indentation from Zhaoyu Liu : https://lore.kernel.org/r/20211031135544.GA1616@pc] Link: https://lore.kernel.org/r/20210929163847.2807812-10-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Sven Peter --- drivers/pci/controller/pcie-apple.c | 165 +++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c index 3646638f3df0..1bf4d75b61be 100644 --- a/drivers/pci/controller/pcie-apple.c +++ b/drivers/pci/controller/pcie-apple.c @@ -23,8 +23,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -116,6 +118,8 @@ #define PORT_TUNSTAT_PERST_ACK_PEND BIT(1) #define PORT_PREFMEM_ENABLE 0x00994 +#define MAX_RID2SID 64 + /* * The doorbell address is set to 0xfffff000, which by convention * matches what MacOS does, and it is possible to use any other @@ -131,6 +135,7 @@ struct apple_pcie { void __iomem *base; struct irq_domain *domain; unsigned long *bitmap; + struct list_head ports; struct completion event; struct irq_fwspec fwspec; u32 nvecs; @@ -141,6 +146,9 @@ struct apple_pcie_port { struct device_node *np; void __iomem *base; struct irq_domain *domain; + struct list_head entry; + DECLARE_BITMAP(sid_map, MAX_RID2SID); + int sid_map_sz; int idx; }; @@ -490,6 +498,14 @@ static int apple_pcie_setup_refclk(struct apple_pcie *pcie, return 0; } +static u32 apple_pcie_rid2sid_write(struct apple_pcie_port *port, + int idx, u32 val) +{ + writel_relaxed(val, port->base + PORT_RID2SID(idx)); + /* Read back to ensure completion of the write */ + return readl_relaxed(port->base + PORT_RID2SID(idx)); +} + static int apple_pcie_setup_port(struct apple_pcie *pcie, struct device_node *np) { @@ -497,7 +513,7 @@ static int apple_pcie_setup_port(struct apple_pcie *pcie, struct apple_pcie_port *port; struct gpio_desc *reset; u32 stat, idx; - int ret; + int ret, i; reset = gpiod_get_from_of_node(np, "reset-gpios", 0, GPIOD_OUT_LOW, "#PERST"); @@ -541,6 +557,18 @@ static int apple_pcie_setup_port(struct apple_pcie *pcie, if (ret) return ret; + /* Reset all RID/SID mappings, and check for RAZ/WI registers */ + for (i = 0; i < MAX_RID2SID; i++) { + if (apple_pcie_rid2sid_write(port, i, 0xbad1d) != 0xbad1d) + break; + apple_pcie_rid2sid_write(port, i, 0); + } + + dev_dbg(pcie->dev, "%pOF: %d RID/SID mapping entries\n", np, i); + + port->sid_map_sz = i; + + list_add_tail(&port->entry, &pcie->ports); init_completion(&pcie->event); ret = apple_pcie_port_register_irqs(port); @@ -603,6 +631,121 @@ static int apple_msi_init(struct apple_pcie *pcie) return 0; } +static struct apple_pcie_port *apple_pcie_get_port(struct pci_dev *pdev) +{ + struct pci_config_window *cfg = pdev->sysdata; + struct apple_pcie *pcie = cfg->priv; + struct pci_dev *port_pdev; + struct apple_pcie_port *port; + + /* Find the root port this device is on */ + port_pdev = pcie_find_root_port(pdev); + + /* If finding the port itself, nothing to do */ + if (WARN_ON(!port_pdev) || pdev == port_pdev) + return NULL; + + list_for_each_entry(port, 
&pcie->ports, entry) { + if (port->idx == PCI_SLOT(port_pdev->devfn)) + return port; + } + + return NULL; +} + +static int apple_pcie_add_device(struct apple_pcie_port *port, + struct pci_dev *pdev) +{ + u32 sid, rid = PCI_DEVID(pdev->bus->number, pdev->devfn); + int idx, err; + + dev_dbg(&pdev->dev, "added to bus %s, index %d\n", + pci_name(pdev->bus->self), port->idx); + + err = of_map_id(port->pcie->dev->of_node, rid, "iommu-map", + "iommu-map-mask", NULL, &sid); + if (err) + return err; + + mutex_lock(&port->pcie->lock); + + idx = bitmap_find_free_region(port->sid_map, port->sid_map_sz, 0); + if (idx >= 0) { + apple_pcie_rid2sid_write(port, idx, + PORT_RID2SID_VALID | + (sid << PORT_RID2SID_SID_SHIFT) | rid); + + dev_dbg(&pdev->dev, "mapping RID%x to SID%x (index %d)\n", + rid, sid, idx); + } + + mutex_unlock(&port->pcie->lock); + + return idx >= 0 ? 0 : -ENOSPC; +} + +static void apple_pcie_release_device(struct apple_pcie_port *port, + struct pci_dev *pdev) +{ + u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn); + int idx; + + mutex_lock(&port->pcie->lock); + + for_each_set_bit(idx, port->sid_map, port->sid_map_sz) { + u32 val; + + val = readl_relaxed(port->base + PORT_RID2SID(idx)); + if ((val & 0xffff) == rid) { + apple_pcie_rid2sid_write(port, idx, 0); + bitmap_release_region(port->sid_map, idx, 0); + dev_dbg(&pdev->dev, "Released %x (%d)\n", val, idx); + break; + } + } + + mutex_unlock(&port->pcie->lock); +} + +static int apple_pcie_bus_notifier(struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct apple_pcie_port *port; + int err; + + /* + * This is a bit ugly. We assume that if we get notified for + * any PCI device, we must be in charge of it, and that there + * is no other PCI controller in the whole system. It probably + * holds for now, but who knows for how long? 
+ */ + port = apple_pcie_get_port(pdev); + if (!port) + return NOTIFY_DONE; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + err = apple_pcie_add_device(port, pdev); + if (err) + return notifier_from_errno(err); + break; + case BUS_NOTIFY_DEL_DEVICE: + apple_pcie_release_device(port, pdev); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block apple_pcie_nb = { + .notifier_call = apple_pcie_bus_notifier, +}; + static int apple_pcie_init(struct pci_config_window *cfg) { struct device *dev = cfg->parent; @@ -623,6 +766,9 @@ static int apple_pcie_init(struct pci_config_window *cfg) if (IS_ERR(pcie->base)) return PTR_ERR(pcie->base); + cfg->priv = pcie; + INIT_LIST_HEAD(&pcie->ports); + for_each_child_of_node(dev->of_node, of_port) { ret = apple_pcie_setup_port(pcie, of_port); if (ret) { @@ -635,6 +781,21 @@ static int apple_pcie_init(struct pci_config_window *cfg) return apple_msi_init(pcie); } +static int apple_pcie_probe(struct platform_device *pdev) +{ + int ret; + + ret = bus_register_notifier(&pci_bus_type, &apple_pcie_nb); + if (ret) + return ret; + + ret = pci_host_common_probe(pdev); + if (ret) + bus_unregister_notifier(&pci_bus_type, &apple_pcie_nb); + + return ret; +} + static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = { .init = apple_pcie_init, .pci_ops = { @@ -651,7 +812,7 @@ static const struct of_device_id apple_pcie_of_match[] = { MODULE_DEVICE_TABLE(of, apple_pcie_of_match); static struct platform_driver apple_pcie_driver = { - .probe = pci_host_common_probe, + .probe = apple_pcie_probe, .driver = { .name = "pcie-apple", .of_match_table = apple_pcie_of_match, From acd61ffb2f1678fa5eb4170a3a4c530145fe2e64 Mon Sep 17 00:00:00 2001 From: Nathan Rossi Date: Fri, 10 Sep 2021 02:58:23 +0000 Subject: [PATCH 250/512] PCI: Add ACS quirk for Pericom PI7C9X2G switches The Pericom PI7C9X2G404/PI7C9X2G304/PI7C9X2G303 PCIe switches have an erratum for ACS P2P Request Redirect behaviour when used in the cut-through forwarding mode. The recommended work around for this issue is to use the switch in store and forward mode. The erratum results in packets being queued and not being delivered upstream, which can be observed as very poor downstream device performance and/or dropped device-generated data/interrupts. Add a fixup so that when enabling or resuming the downstream port we check if it has enabled ACS P2P Request Redirect, and if so, change the device (via the upstream port) to use the store and forward operating mode. Link: https://bugzilla.kernel.org/show_bug.cgi?id=177471 Link: https://lore.kernel.org/r/20210910025823.196508-1-nathan@nathanrossi.com Tested-by: Alex Williamson Signed-off-by: Nathan Rossi Signed-off-by: Bjorn Helgaas --- drivers/pci/quirks.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6c957124f84d..126c256bbb8a 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5796,3 +5796,58 @@ static void apex_pci_fixup_class(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_CLASS_HEADER(0x1ac1, 0x089a, PCI_CLASS_NOT_DEFINED, 8, apex_pci_fixup_class); + +/* + * Pericom PI7C9X2G404/PI7C9X2G304/PI7C9X2G303 switch erratum E5 - + * ACS P2P Request Redirect is not functional + * + * When ACS P2P Request Redirect is enabled and bandwidth is not balanced + * between upstream and downstream ports, packets are queued in an internal + * buffer until CPLD packet. 
The workaround is to use the switch in store and + * forward mode. + */ +#define PI7C9X2Gxxx_MODE_REG 0x74 +#define PI7C9X2Gxxx_STORE_FORWARD_MODE BIT(0) +static void pci_fixup_pericom_acs_store_forward(struct pci_dev *pdev) +{ + struct pci_dev *upstream; + u16 val; + + /* Downstream ports only */ + if (pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM) + return; + + /* Check for ACS P2P Request Redirect use */ + if (!pdev->acs_cap) + return; + pci_read_config_word(pdev, pdev->acs_cap + PCI_ACS_CTRL, &val); + if (!(val & PCI_ACS_RR)) + return; + + upstream = pci_upstream_bridge(pdev); + if (!upstream) + return; + + pci_read_config_word(upstream, PI7C9X2Gxxx_MODE_REG, &val); + if (!(val & PI7C9X2Gxxx_STORE_FORWARD_MODE)) { + pci_info(upstream, "Setting PI7C9X2Gxxx store-forward mode to avoid ACS erratum\n"); + pci_write_config_word(upstream, PI7C9X2Gxxx_MODE_REG, val | + PI7C9X2Gxxx_STORE_FORWARD_MODE); + } +} +/* + * Apply fixup on enable and on resume, in order to apply the fix up whenever + * ACS configuration changes or switch mode is reset + */ +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2404, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2404, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2304, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2304, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2303, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2303, + pci_fixup_pericom_acs_store_forward); From 0d35e382e4e96a4fd97a1438bc1b11a91d2d85a6 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 5 Nov 2021 08:39:01 +0900 Subject: [PATCH 251/512] cifs: Create a new shared file holding smb2 pdu definitions This file will contain all the definitions we need for SMB2 packets and will follow the naming convention of MS-SMB2.PDF as closely as possible to make it easier to cross-reference beween the definitions and the standard. The content of this file will mostly consist of migration of existing definitions in the cifs/smb2.pdu.h and ksmbd/smb2pdu.h files with some additional tweaks as the two files have diverged. This patch introduces the new smbfs_common/smb2pdu.h file and migrates the SMB2 header as well as TREE_CONNECT and TREE_DISCONNECT to the shared file. 
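For orientation, the shared header that replaces smb2_sync_hdr has roughly the following shape (abridged sketch; the field accesses in the hunks below, such as the 64-bit SessionId and Id.SyncId.TreeId, rely on this layout, which follows MS-SMB2 section 2.2.1):

    struct smb2_hdr {
            __le32 ProtocolId;      /* 0xfe 'S' 'M' 'B' */
            __le16 StructureSize;   /* 64 */
            __le16 CreditCharge;
            __le32 Status;          /* NT status in server responses */
            __le16 Command;
            __le16 CreditRequest;   /* CreditResponse in server responses */
            __le32 Flags;
            __le32 NextCommand;     /* offset of next PDU when compounding */
            __le64 MessageId;
            union {
                    struct {
                            __le32 ProcessId;
                            __le32 TreeId;
                    } __packed SyncId;
                    __le64 AsyncId;
            } __packed Id;
            __le64 SessionId;
            __u8   Signature[16];
    } __packed;
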
Signed-off-by: Ronnie Sahlberg Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 1 - fs/cifs/cifsglob.h | 3 +- fs/cifs/connect.c | 4 +- fs/cifs/misc.c | 2 +- fs/cifs/smb2maperror.c | 16 +- fs/cifs/smb2misc.c | 43 +++-- fs/cifs/smb2ops.c | 65 +++---- fs/cifs/smb2pdu.c | 106 ++++++----- fs/cifs/smb2pdu.h | 373 ++++---------------------------------- fs/cifs/smb2proto.h | 2 +- fs/cifs/smb2transport.c | 36 ++-- fs/smbfs_common/smb2pdu.h | 318 ++++++++++++++++++++++++++++++++ 12 files changed, 493 insertions(+), 476 deletions(-) create mode 100644 fs/smbfs_common/smb2pdu.h diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9fa930dfd78d..dca42aa87d30 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -38,7 +38,6 @@ #include #include "cifs_spnego.h" #include "fscache.h" -#include "smb2pdu.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e916470468ea..abff31dcd005 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -20,6 +20,7 @@ #include #include #include +#include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ @@ -776,7 +777,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) static inline void revert_current_mid_from_hdr(struct TCP_Server_Info *server, - const struct smb2_sync_hdr *shdr) + const struct smb2_hdr *shdr) { unsigned int num = le16_to_cpu(shdr->CreditCharge); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e6e261dfd107..da3b1fff8c4a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -677,7 +677,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) static unsigned int smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + struct smb2_hdr *shdr = (struct smb2_hdr *)buffer; /* * SMB1 does not use credits. @@ -877,7 +877,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) static void smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + struct smb2_hdr *shdr = (struct smb2_hdr *)buffer; int scredits, in_flight; /* diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bb1185fff8cc..ba2c3e897b29 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -152,7 +152,7 @@ cifs_buf_get(void) * SMB2 header is bigger than CIFS one - no problems to clean some * more bytes for CIFS. 
*/ - size_t buf_size = sizeof(struct smb2_sync_hdr); + size_t buf_size = sizeof(struct smb2_hdr); /* * We could use negotiated size instead of max_msgsize - diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 181514b8770d..194799ddd382 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status) int map_smb2_to_linux_error(char *buf, bool log_err) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; unsigned int i; int rc = -EIO; __le32 smb2err = shdr->Status; if (smb2err == 0) { - trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); + trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId)); return 0; } @@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err) cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n", __le32_to_cpu(smb2err), rc); - trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), - le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc); + trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId), + le32_to_cpu(smb2err), rc); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 29b5554f6263..ce7d6cc652b3 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -8,7 +8,6 @@ * */ #include -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -19,7 +18,7 @@ #include "nterr.h" static int -check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid) +check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid) { __u64 wire_mid = le64_to_cpu(shdr->MessageId); @@ -81,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; -#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp)) +#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp)) -static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, +static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen) { __u16 neg_count; @@ -135,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, int smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; + struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; __u64 mid; __u32 clc_len; /* calculated length */ int command; - int pdu_size = sizeof(struct smb2_sync_pdu); - int hdr_size = sizeof(struct smb2_sync_hdr); + int pdu_size = sizeof(struct smb2_pdu); + int hdr_size = sizeof(struct smb2_hdr); /* * Add function to do table lookup of StructureSize by command @@ -155,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { - if (ses->Suid == thdr->SessionId) + if (ses->Suid == le64_to_cpu(thdr->SessionId)) break; } spin_unlock(&cifs_tcp_ses_lock); @@ -296,7 +295,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { * area and the offset to it 
(from the beginning of the smb are also returned. */ char * -smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) +smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) { *off = 0; *len = 0; @@ -401,8 +400,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf; - struct smb2_sync_hdr *shdr = &pdu->sync_hdr; + struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_hdr *shdr = &pdu->hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ /* Structure Size has already been checked to make sure it is 64 */ @@ -669,7 +668,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for oplock break\n"); - if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) + if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) return false; if (rsp->StructureSize != @@ -816,23 +815,23 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *sync_hdr = mid->resp_buf; + struct smb2_hdr *hdr = mid->resp_buf; struct smb2_create_rsp *rsp = mid->resp_buf; struct cifs_tcon *tcon; int rc; - if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE || - sync_hdr->Status != STATUS_SUCCESS) + if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE || + hdr->Status != STATUS_SUCCESS) return 0; - tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId, - sync_hdr->TreeId); + tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId), + le32_to_cpu(hdr->Id.SyncId.TreeId)); if (!tcon) return -ENOENT; rc = __smb2_handle_cancelled_cmd(tcon, - le16_to_cpu(sync_hdr->Command), - le64_to_cpu(sync_hdr->MessageId), + le16_to_cpu(hdr->Command), + le64_to_cpu(hdr->MessageId), rsp->PersistentFileId, rsp->VolatileFileId); if (rc) @@ -856,10 +855,10 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) { int i, rc; struct sdesc *d; - struct smb2_sync_hdr *hdr; + struct smb2_hdr *hdr; struct TCP_Server_Info *server = cifs_ses_server(ses); - hdr = (struct smb2_sync_hdr *)iov[0].iov_base; + hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ if (hdr->Command == SMB2_NEGOTIATE) goto ok; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bda606dc72b1..2ad223d2db20 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -325,7 +325,7 @@ static struct mid_q_entry * __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue) { struct mid_q_entry *mid; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(shdr->MessageId); if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { @@ -367,11 +367,11 @@ static void smb2_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, - shdr->ProcessId); + shdr->Id.SyncId.ProcessId); cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, server->ops->calc_smb_size(buf, server)); #endif @@ -888,7 +888,7 @@ int open_cached_dir(unsigned int xid, 
struct cifs_tcon *tcon, oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; #ifdef CONFIG_CIFS_DEBUG2 - oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); + oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ tcon->crfid.tcon = tcon; @@ -2391,7 +2391,7 @@ again: /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { + if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); goto qdf_free; } @@ -2410,7 +2410,7 @@ again: atomic_inc(&tcon->num_remote_opens); qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; - if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, fid->persistent_fid, tcon->tid, tcon->ses->Suid, 0, 0); srch_inf->endOfSearch = true; @@ -2462,7 +2462,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; int scredits, in_flight; if (shdr->Status != STATUS_PENDING) @@ -2489,13 +2489,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) static bool smb2_is_session_expired(char *buf) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED && shdr->Status != STATUS_USER_SESSION_DELETED) return false; - trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId, + trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); cifs_dbg(FYI, "Session expired or deleted\n"); @@ -2506,7 +2507,7 @@ smb2_is_session_expired(char *buf) static bool smb2_is_status_io_timeout(char *buf) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; if (shdr->Status == STATUS_IO_TIMEOUT) return true; @@ -2517,7 +2518,7 @@ smb2_is_status_io_timeout(char *buf) static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct list_head *tmp, *tmp1; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -2530,7 +2531,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) ses = list_entry(tmp, struct cifs_ses, smb_ses_list); list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); - if (tcon->tid == shdr->TreeId) { + if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { tcon->need_reconnect = true; spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", @@ -2558,9 +2559,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, void smb2_set_related(struct smb_rqst *rqst) { - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { cifs_dbg(FYI, "shdr NULL in smb2_set_related\n"); return; @@ -2573,13 +2574,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) { - struct 
smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; unsigned long len = smb_rqst_len(server, rqst); int i, num_padding; - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n"); return; @@ -3124,7 +3125,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype, rsp_iov); create_rsp = rsp_iov[0].iov_base; - if (create_rsp && create_rsp->sync_hdr.Status) + if (create_rsp && create_rsp->hdr.Status) err_iov = rsp_iov[0]; ioctl_rsp = rsp_iov[1].iov_base; @@ -4369,8 +4370,8 @@ static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, struct smb_rqst *old_rq, __le16 cipher_type) { - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)old_rq->rq_iov[0].iov_base; memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -4496,7 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); + rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); @@ -4788,7 +4789,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int cur_page_idx; unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct bio_vec *bvec = NULL; struct iov_iter iter; struct kvec iov; @@ -5117,7 +5118,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, { int ret, length; char *buf = server->smallbuf; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; unsigned int pdu_length = server->pdu_size; unsigned int buf_size; struct mid_q_entry *mid_entry; @@ -5147,7 +5148,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, next_is_large = server->large_buf; one_more: - shdr = (struct smb2_sync_hdr *)buf; + shdr = (struct smb2_hdr *)buf; if (shdr->NextCommand) { if (next_is_large) next_buffer = (char *)cifs_buf_get(); @@ -5213,7 +5214,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); if (pdu_length < sizeof(struct smb2_transform_hdr) + - sizeof(struct smb2_sync_hdr)) { + sizeof(struct smb2_hdr)) { cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); cifs_reconnect(server); @@ -5246,7 +5247,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) static int smb2_next_header(char *buf) { - struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf; if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) @@ -5788,7 +5789,7 @@ struct smb_version_values smb20_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size 
= sizeof(struct smb2_read_rsp) - 1, @@ -5809,7 +5810,7 @@ struct smb_version_values smb21_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5830,7 +5831,7 @@ struct smb_version_values smb3any_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5851,7 +5852,7 @@ struct smb_version_values smbdefault_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5872,7 +5873,7 @@ struct smb_version_values smb30_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5893,7 +5894,7 @@ struct smb_version_values smb302_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5914,7 +5915,7 @@ struct smb_version_values smb311_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7829c590eeac..dbbd804b94eb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -23,7 +23,6 @@ #include #include #include -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" @@ -84,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) } static void -smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, +smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, const struct cifs_tcon *tcon, struct TCP_Server_Info *server) { @@ -104,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, } else { shdr->CreditRequest = cpu_to_le16(2); } - shdr->ProcessId = cpu_to_le32((__u16)current->tgid); + shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) goto out; @@ -115,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, shdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ - shdr->TreeId = tcon->tid; + shdr->Id.SyncId.TreeId = 
cpu_to_le32(tcon->tid); /* Uid is not converted */ if (tcon->ses) - shdr->SessionId = tcon->ses->Suid; + shdr->SessionId = cpu_to_le64(tcon->ses->Suid); /* * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have @@ -331,7 +330,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, unsigned int *total_len) { - struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf; + struct smb2_pdu *spdu = (struct smb2_pdu *)buf; /* lookup word count ie StructureSize from table */ __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)]; @@ -341,10 +340,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, */ memset(buf, 0, 256); - smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server); + smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server); spdu->StructureSize2 = cpu_to_le16(parmsize); - *total_len = parmsize + sizeof(struct smb2_sync_hdr); + *total_len = parmsize + sizeof(struct smb2_hdr); } /* @@ -367,7 +366,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, } fill_small_buf(smb2_command, tcon, server, - (struct smb2_sync_hdr *)(*request_buf), + (struct smb2_hdr *)(*request_buf), total_len); if (tcon != NULL) { @@ -857,7 +856,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc) return rc; - req->sync_hdr.SessionId = 0; + req->hdr.SessionId = 0; memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); @@ -1018,7 +1017,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, - (struct smb2_sync_hdr *)rsp); + (struct smb2_hdr *)rsp); /* * See MS-SMB2 section 2.2.4: if no blob, client picks default which * for us will be @@ -1250,13 +1249,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) return rc; if (sess_data->ses->binding) { - req->sync_hdr.SessionId = sess_data->ses->Suid; - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid); + req->hdr.Flags |= SMB2_FLAGS_SIGNED; req->PreviousSessionId = 0; req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; } else { /* First session, not a reauthenticate */ - req->sync_hdr.SessionId = 0; + req->hdr.SessionId = 0; /* * if reconnect, we need to send previous sess id * otherwise it is 0 @@ -1266,7 +1265,7 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) } /* enough to enable echos and oplocks and one max size write */ - req->sync_hdr.CreditRequest = cpu_to_le16(130); + req->hdr.CreditRequest = cpu_to_le16(130); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -1425,7 +1424,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; /* keep session id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1501,7 +1500,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* If true, rc here is expected and not an error */ if (sess_data->buf0_type != CIFS_NO_BUFFER && - rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) rc = 0; if (rc) @@ -1523,7 +1522,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* keep existing ses id and flags if binding */ if (!ses->binding) { 
- ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1558,7 +1557,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) goto out; req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; - req->sync_hdr.SessionId = ses->Suid; + req->hdr.SessionId = cpu_to_le64(ses->Suid); rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, sess_data->nls_cp); @@ -1584,7 +1583,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) /* keep existing ses id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1715,12 +1714,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) return rc; /* since no tcon, smb2_init can not do this, so do here */ - req->sync_hdr.SessionId = ses->Suid; + req->hdr.SessionId = cpu_to_le64(ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) flags |= CIFS_TRANSFORM_REQ; else if (server->sign) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; flags |= CIFS_NO_RSP_BUF; @@ -1828,14 +1827,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && ((ses->user_name != NULL) || (ses->sectype == Kerberos))) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = 2; /* Need 64 for max size write so ask for more in case not there yet */ - req->sync_hdr.CreditRequest = cpu_to_le16(64); + req->hdr.CreditRequest = cpu_to_le16(64); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -1871,7 +1870,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; - tcon->tid = rsp->sync_hdr.TreeId; + tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && @@ -1892,9 +1891,8 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - } goto tcon_exit; } @@ -2608,7 +2606,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, utf16_path); @@ -2740,7 +2738,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, path); @@ -2952,7 +2950,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, oparms->fid->volatile_fid = rsp->VolatileFileId; oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 - oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId); + oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId); 
#endif /* CIFS_DEBUG2 */ if (buf) { @@ -3052,7 +3050,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, * response size smaller. */ req->MaxOutputResponse = cpu_to_le32(max_response_size); - req->sync_hdr.CreditCharge = + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size), SMB2_MAX_BUFFER_SIZE)); if (is_fsctl) @@ -3062,7 +3060,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; return 0; } @@ -3687,7 +3685,7 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED || mid->mid_state == MID_RESPONSE_MALFORMED) { - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; } @@ -3787,7 +3785,7 @@ SMB2_echo(struct TCP_Server_Info *server) if (rc) return rc; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); iov[0].iov_len = total_len; iov[0].iov_base = (char *)req; @@ -3891,7 +3889,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, { int rc = -EACCES; struct smb2_read_plain_req *req = NULL; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct TCP_Server_Info *server = io_parms->server; rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server, @@ -3902,8 +3900,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (server == NULL) return -ECONNABORTED; - shdr = &req->sync_hdr; - shdr->ProcessId = cpu_to_le32(io_parms->pid); + shdr = &req->hdr; + shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); req->PersistentFileId = io_parms->persistent_fid; req->VolatileFileId = io_parms->volatile_fid; @@ -3964,8 +3962,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. 
*/ - shdr->SessionId = 0xFFFFFFFFFFFFFFFF; - shdr->TreeId = 0xFFFFFFFF; + shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF); req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; } @@ -3985,8 +3983,8 @@ smb2_readv_callback(struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = rdata->server; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rdata->iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1, @@ -4072,7 +4070,7 @@ smb2_async_readv(struct cifs_readdata *rdata) { int rc, flags = 0; char *buf; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1 }; @@ -4105,7 +4103,7 @@ smb2_async_readv(struct cifs_readdata *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; - shdr = (struct smb2_sync_hdr *)buf; + shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, @@ -4238,7 +4236,7 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; wdata->result = smb2_check_receive(mid, server, 0); if (wdata->result != 0) @@ -4264,7 +4262,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; fallthrough; default: @@ -4311,7 +4309,7 @@ smb2_async_writev(struct cifs_writedata *wdata, { int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = wdata->server; struct kvec iov[1]; @@ -4329,8 +4327,8 @@ smb2_async_writev(struct cifs_writedata *wdata, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - shdr = (struct smb2_sync_hdr *)req; - shdr->ProcessId = cpu_to_le32(wdata->cfile->pid); + shdr = (struct smb2_hdr *)req; + shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); req->PersistentFileId = wdata->cfile->fid.persistent_fid; req->VolatileFileId = wdata->cfile->fid.volatile_fid; @@ -4481,7 +4479,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); req->PersistentFileId = io_parms->persistent_fid; req->VolatileFileId = io_parms->volatile_fid; @@ -4866,7 +4864,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -ENODATA && - rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + rsp->hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, 0); srch_inf->endOfSearch = true; @@ -4914,7 +4912,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) 
return rc; - req->sync_hdr.ProcessId = cpu_to_le32(pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid); req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; @@ -5074,7 +5072,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFid = volatile_fid; req->PersistentFid = persistent_fid; req->OplockLevel = oplock_level; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); flags |= CIFS_NO_RSP_BUF; @@ -5376,7 +5374,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.ProcessId = cpu_to_le32(pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid); req->LockCount = cpu_to_le16(num_lock); req->PersistentFileId = persist_fid; @@ -5452,7 +5450,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); req->StructureSize = cpu_to_le16(36); total_len += 12; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f32c99c9ba13..739e98d117ed 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -14,156 +14,12 @@ #include #include "cifsacl.h" -/* - * Note that, due to trying to use names similar to the protocol specifications, - * there are many mixed case field names in the structures below. Although - * this does not match typical Linux kernel style, it is necessary to be - * able to match against the protocol specfication. - * - * SMB2 commands - * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses - * (ie no useful data other than the SMB error code itself) and are marked such. - * Knowing this helps avoid response buffer allocations and copy in some cases. 
- */ - -/* List of commands in host endian */ -#define SMB2_NEGOTIATE_HE 0x0000 -#define SMB2_SESSION_SETUP_HE 0x0001 -#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ -#define SMB2_TREE_CONNECT_HE 0x0003 -#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ -#define SMB2_CREATE_HE 0x0005 -#define SMB2_CLOSE_HE 0x0006 -#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ -#define SMB2_READ_HE 0x0008 -#define SMB2_WRITE_HE 0x0009 -#define SMB2_LOCK_HE 0x000A -#define SMB2_IOCTL_HE 0x000B -#define SMB2_CANCEL_HE 0x000C -#define SMB2_ECHO_HE 0x000D -#define SMB2_QUERY_DIRECTORY_HE 0x000E -#define SMB2_CHANGE_NOTIFY_HE 0x000F -#define SMB2_QUERY_INFO_HE 0x0010 -#define SMB2_SET_INFO_HE 0x0011 -#define SMB2_OPLOCK_BREAK_HE 0x0012 - -/* The same list in little endian */ -#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) -#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) -#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) -#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) -#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) -#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) -#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) -#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) -#define SMB2_READ cpu_to_le16(SMB2_READ_HE) -#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) -#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) -#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) -#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) -#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) -#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) -#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) -#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) -#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) -#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) - -#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) - -#define NUMBER_OF_SMB2_COMMANDS 0x0013 - /* 52 transform hdr + 64 hdr + 88 create rsp */ #define SMB2_TRANSFORM_HEADER_SIZE 52 #define MAX_SMB2_HDR_SIZE 204 -#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) -#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) -#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) - -/* - * SMB2 Header Definition - * - * "MBZ" : Must be Zero - * "BB" : BugBug, Something to check/review/analyze later - * "PDU" : "Protocol Data Unit" (ie a network "frame") - * - */ - -#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) - -struct smb2_sync_hdr { - __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ - __le16 StructureSize; /* 64 */ - __le16 CreditCharge; /* MBZ */ - __le32 Status; /* Error from server */ - __le16 Command; - __le16 CreditRequest; /* CreditResponse */ - __le32 Flags; - __le32 NextCommand; - __le64 MessageId; - __le32 ProcessId; - __u32 TreeId; /* opaque - so do not make little endian */ - __u64 SessionId; /* opaque - so do not make little endian */ - __u8 Signature[16]; -} __packed; - /* The total header size for SMB2 read and write */ -#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr)) - -struct smb2_sync_pdu { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize2; /* size of wct area (varies, request specific) */ -} __packed; - -#define SMB3_AES_CCM_NONCE 11 -#define SMB3_AES_GCM_NONCE 12 - -/* Transform flags (for 3.0 dialect this flag indicates CCM */ -#define TRANSFORM_FLAG_ENCRYPTED 0x0001 -struct smb2_transform_hdr { - __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ - __u8 Signature[16]; - __u8 Nonce[16]; - __le32 OriginalMessageSize; - __u16 Reserved1; - __le16 Flags; /* EncryptionAlgorithm for 
3.0, enc enabled for 3.1.1 */ - __u64 SessionId; -} __packed; - -/* See MS-SMB2 2.2.42 */ -struct smb2_compression_transform_hdr_unchained { - __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ - __le32 OriginalCompressedSegmentSize; - __le16 CompressionAlgorithm; - __le16 Flags; - __le16 Length; /* if chained it is length, else offset */ -} __packed; - -/* See MS-SMB2 2.2.42.1 */ -#define SMB2_COMPRESSION_FLAG_NONE 0x0000 -#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 - -struct compression_payload_header { - __le16 CompressionAlgorithm; - __le16 Flags; - __le32 Length; /* length of compressed playload including field below if present */ - /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ -} __packed; - -/* See MS-SMB2 2.2.42.2 */ -struct smb2_compression_transform_hdr_chained { - __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ - __le32 OriginalCompressedSegmentSize; - /* struct compression_payload_header[] */ -} __packed; - -/* See MS-SMB2 2.2.42.2.2 */ -struct compression_pattern_payload_v1 { - __le16 Pattern; - __le16 Reserved1; - __le16 Reserved2; - __le32 Repetitions; -} __packed; +#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr)) /* See MS-SMB2 2.2.43 */ struct smb2_rdma_transform { @@ -189,17 +45,6 @@ struct smb2_rdma_crypto_transform { /* followed by padding */ } __packed; -/* - * SMB2 flag definitions - */ -#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) -#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) -#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) -#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) -#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ -#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) -#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ - /* * Definitions for SMB2 Protocol Data Units (network frames) * @@ -214,7 +59,7 @@ struct smb2_rdma_crypto_transform { #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; __le16 Reserved; /* MBZ */ __le32 ByteCount; /* even if zero, at least one byte follows */ @@ -273,7 +118,7 @@ struct share_redirect_error_context_rsp { #define SMB2_CLIENT_GUID_SIZE 16 struct smb2_negotiate_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ __le16 DialectCount; __le16 SecurityMode; @@ -472,7 +317,7 @@ struct smb2_posix_neg_context { } __packed; struct smb2_negotiate_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; @@ -495,7 +340,7 @@ struct smb2_negotiate_rsp { #define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 struct smb2_sess_setup_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 25 */ __u8 Flags; __u8 SecurityMode; @@ -512,7 +357,7 @@ struct smb2_sess_setup_req { #define SMB2_SESSION_FLAG_IS_NULL 0x0002 #define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 struct smb2_sess_setup_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 SessionFlags; __le16 SecurityBufferOffset; @@ -521,161 +366,13 @@ struct smb2_sess_setup_rsp { } __packed; struct smb2_logoff_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; struct smb2_logoff_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 
Reserved; -} __packed; - -/* Flags/Reserved for SMB3.1.1 */ -#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001) -#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002) -#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004) - -struct smb2_tree_connect_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */ - __le16 PathOffset; - __le16 PathLength; - __u8 Buffer[1]; /* variable length */ -} __packed; - -/* See MS-SMB2 section 2.2.9.2 */ -/* Context Types */ -#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000 -#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001) - -struct tree_connect_contexts { - __le16 ContextType; - __le16 DataLength; - __le32 Reserved; - __u8 Data[]; -} __packed; - -/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */ -struct smb3_blob_data { - __le16 BlobSize; - __u8 BlobData[]; -} __packed; - -/* Valid values for Attr */ -#define SE_GROUP_MANDATORY 0x00000001 -#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002 -#define SE_GROUP_ENABLED 0x00000004 -#define SE_GROUP_OWNER 0x00000008 -#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010 -#define SE_GROUP_INTEGRITY 0x00000020 -#define SE_GROUP_INTEGRITY_ENABLED 0x00000040 -#define SE_GROUP_RESOURCE 0x20000000 -#define SE_GROUP_LOGON_ID 0xC0000000 - -/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ - -struct sid_array_data { - __le16 SidAttrCount; - /* SidAttrList - array of sid_attr_data structs */ -} __packed; - -struct luid_attr_data { - -} __packed; - -/* - * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5 - * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA - */ - -struct privilege_array_data { - __le16 PrivilegeCount; - /* array of privilege_data structs */ -} __packed; - -struct remoted_identity_tcon_context { - __le16 TicketType; /* must be 0x0001 */ - __le16 TicketSize; /* total size of this struct */ - __le16 User; /* offset to SID_ATTR_DATA struct with user info */ - __le16 UserName; /* offset to null terminated Unicode username string */ - __le16 Domain; /* offset to null terminated Unicode domain name */ - __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */ - __le16 RestrictedGroups; /* similar to above */ - __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */ - __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */ - __le16 Owner; /* offset to BLOB_DATA struct */ - __le16 DefaultDacl; /* offset to BLOB_DATA struct */ - __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */ - __le16 UserClaims; /* offset to BLOB_DATA struct */ - __le16 DeviceClaims; /* offset to BLOB_DATA struct */ - __u8 TicketInfo[]; /* variable length buf - remoted identity data */ -} __packed; - -struct smb2_tree_connect_req_extension { - __le32 TreeConnectContextOffset; - __le16 TreeConnectContextCount; - __u8 Reserved[10]; - __u8 PathName[]; /* variable sized array */ - /* followed by array of TreeConnectContexts */ -} __packed; - -struct smb2_tree_connect_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 16 */ - __u8 ShareType; /* see below */ - __u8 Reserved; - __le32 ShareFlags; /* see below */ - __le32 Capabilities; /* see below */ - __le32 MaximalAccess; -} __packed; - -/* Possible ShareType values */ -#define SMB2_SHARE_TYPE_DISK 0x01 -#define SMB2_SHARE_TYPE_PIPE 0x02 -#define SMB2_SHARE_TYPE_PRINT 0x03 - -/* - * Possible 
ShareFlags - exactly one and only one of the first 4 caching flags - * must be set (any of the remaining, SHI1005, flags may be set individually - * or in combination. - */ -#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 -#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 -#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 -#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 -#define SHI1005_FLAGS_DFS 0x00000001 -#define SHI1005_FLAGS_DFS_ROOT 0x00000002 -#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 -#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 -#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 -#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 -#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 -#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 -#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 -#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ -#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0014FF33 - -/* Possible share capabilities */ -#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ -#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ -#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ -#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ -#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ -#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */ - -struct smb2_tree_disconnect_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_tree_disconnect_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -808,7 +505,7 @@ struct smb2_tree_disconnect_rsp { #define SMB2_CREATE_IOV_SIZE 8 struct smb2_create_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u8 SecurityFlags; __u8 RequestedOplockLevel; @@ -835,7 +532,7 @@ struct smb2_create_req { #define MAX_SMB2_CREATE_RESPONSE_SIZE 880 struct smb2_create_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 89 */ __u8 OplockLevel; __u8 Flag; /* 0x01 if reparse point */ @@ -1210,7 +907,7 @@ struct duplicate_extents_to_file { #define SMB2_IOCTL_IOV_SIZE 2 struct smb2_ioctl_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -1228,7 +925,7 @@ struct smb2_ioctl_req { } __packed; struct smb2_ioctl_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -1246,7 +943,7 @@ struct smb2_ioctl_rsp { /* Currently defined values for close flags */ #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) struct smb2_close_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __le16 Flags; __le32 Reserved; @@ -1260,7 +957,7 @@ struct smb2_close_req { #define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 struct smb2_close_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* 60 */ __le16 Flags; __le32 Reserved; @@ -1274,7 +971,7 @@ struct smb2_close_rsp { } __packed; struct smb2_flush_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __le16 Reserved1; __le32 Reserved2; @@ -1283,7 +980,7 @@ struct 
smb2_flush_req { } __packed; struct smb2_flush_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; __le16 Reserved; } __packed; @@ -1300,7 +997,7 @@ struct smb2_flush_rsp { /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 49 */ __u8 Padding; /* offset from start of SMB2 header to place read */ __u8 Flags; /* MBZ unless SMB3.02 or later */ @@ -1321,7 +1018,7 @@ struct smb2_read_plain_req { #define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 struct smb2_read_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 17 */ __u8 DataOffset; __u8 Reserved; @@ -1336,7 +1033,7 @@ struct smb2_read_rsp { #define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ struct smb2_write_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 49 */ __le16 DataOffset; /* offset from start of SMB2 header to write data */ __le32 Length; @@ -1352,7 +1049,7 @@ struct smb2_write_req { } __packed; struct smb2_write_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 17 */ __u8 DataOffset; __u8 Reserved; @@ -1380,7 +1077,7 @@ struct smb2_write_rsp { #define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 struct smb2_change_notify_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; __le16 Flags; __le32 OutputBufferLength; @@ -1391,7 +1088,7 @@ struct smb2_change_notify_req { } __packed; struct smb2_change_notify_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1411,7 +1108,7 @@ struct smb2_lock_element { } __packed; struct smb2_lock_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; /* @@ -1426,19 +1123,19 @@ struct smb2_lock_req { } __packed; struct smb2_lock_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; struct smb2_echo_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; struct smb2_echo_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -1468,7 +1165,7 @@ struct smb2_echo_rsp { */ struct smb2_query_directory_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 33 */ __u8 FileInformationClass; __u8 Flags; @@ -1482,7 +1179,7 @@ struct smb2_query_directory_req { } __packed; struct smb2_query_directory_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1515,7 +1212,7 @@ struct smb2_query_directory_rsp { #define SL_INDEX_SPECIFIED 0x00000004 struct smb2_query_info_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 41 */ __u8 InfoType; __u8 FileInfoClass; @@ -1531,7 +1228,7 @@ struct smb2_query_info_req { } __packed; struct smb2_query_info_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1548,7 +1245,7 @@ struct smb2_query_info_rsp { #define SMB2_SET_INFO_IOV_SIZE 3 struct smb2_set_info_req 
{ - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 33 */ __u8 InfoType; __u8 FileInfoClass; @@ -1562,12 +1259,12 @@ struct smb2_set_info_req { } __packed; struct smb2_set_info_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 2 */ } __packed; struct smb2_oplock_break { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; __u8 Reserved; @@ -1579,7 +1276,7 @@ struct smb2_oplock_break { #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) struct smb2_lease_break { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 44 */ __le16 Epoch; __le32 Flags; @@ -1592,7 +1289,7 @@ struct smb2_lease_break { } __packed; struct smb2_lease_ack { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 547945443fa7..096fada16ebd 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -25,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server); extern char *smb2_get_data_area_len(int *off, int *len, - struct smb2_sync_hdr *shdr); + struct smb2_hdr *shdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index f59b956f9d25..2bf047b390a9 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -19,7 +19,6 @@ #include #include #include -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -213,14 +212,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; + struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; struct shash_desc *shash; struct crypto_shash *hash; struct sdesc *sdesc = NULL; struct smb_rqst drqst; - ses = smb2_find_smb_ses(server, shdr->SessionId); + ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); if (!ses) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); return 0; @@ -534,14 +533,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; + struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct shash_desc *shash; struct crypto_shash *hash; struct sdesc *sdesc = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; - rc = smb2_get_sign_key(shdr->SessionId, server, key); + rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); if (rc) return 0; @@ -611,12 +610,12 @@ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct smb2_sess_setup_req *ssr; bool is_binding; bool is_signed; - shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; ssr = (struct smb2_sess_setup_req *)shdr; is_binding = shdr->Command == SMB2_SESSION_SETUP 
&& @@ -642,8 +641,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { unsigned int rc; char server_response_sig[SMB2_SIGNATURE_SIZE]; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; if ((shdr->Command == SMB2_NEGOTIATE) || (shdr->Command == SMB2_SESSION_SETUP) || @@ -689,7 +688,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) */ static inline void smb2_seq_num_into_buf(struct TCP_Server_Info *server, - struct smb2_sync_hdr *shdr) + struct smb2_hdr *shdr) { unsigned int i, num = le16_to_cpu(shdr->CreditCharge); @@ -700,7 +699,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server, } static struct mid_q_entry * -smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, +smb2_mid_entry_alloc(const struct smb2_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; @@ -732,14 +731,15 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; - trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), temp->mid); + trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), temp->mid); return temp; } static int smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, - struct smb2_sync_hdr *shdr, struct mid_q_entry **mid) + struct smb2_hdr *shdr, struct mid_q_entry **mid) { if (server->tcpStatus == CifsExiting) return -ENOENT; @@ -807,8 +807,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, shdr); @@ -833,8 +833,8 @@ struct mid_q_entry * smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; if (server->tcpStatus == CifsNeedNegotiate && diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h new file mode 100644 index 000000000000..f191ed64c1ee --- /dev/null +++ b/fs/smbfs_common/smb2pdu.h @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ +#ifndef _COMMON_SMB2PDU_H +#define _COMMON_SMB2PDU_H + +/* + * Note that, due to trying to use names similar to the protocol specifications, + * there are many mixed case field names in the structures below. Although + * this does not match typical Linux kernel style, it is necessary to be + * able to match against the protocol specfication. + * + * SMB2 commands + * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses + * (ie no useful data other than the SMB error code itself) and are marked such. + * Knowing this helps avoid response buffer allocations and copy in some cases. 
+ */ + +/* List of commands in host endian */ +#define SMB2_NEGOTIATE_HE 0x0000 +#define SMB2_SESSION_SETUP_HE 0x0001 +#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ +#define SMB2_TREE_CONNECT_HE 0x0003 +#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ +#define SMB2_CREATE_HE 0x0005 +#define SMB2_CLOSE_HE 0x0006 +#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ +#define SMB2_READ_HE 0x0008 +#define SMB2_WRITE_HE 0x0009 +#define SMB2_LOCK_HE 0x000A +#define SMB2_IOCTL_HE 0x000B +#define SMB2_CANCEL_HE 0x000C +#define SMB2_ECHO_HE 0x000D +#define SMB2_QUERY_DIRECTORY_HE 0x000E +#define SMB2_CHANGE_NOTIFY_HE 0x000F +#define SMB2_QUERY_INFO_HE 0x0010 +#define SMB2_SET_INFO_HE 0x0011 +#define SMB2_OPLOCK_BREAK_HE 0x0012 + +/* The same list in little endian */ +#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) +#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) +#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) +#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) +#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) +#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) +#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) +#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) +#define SMB2_READ cpu_to_le16(SMB2_READ_HE) +#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) +#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) +#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) +#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) +#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) +#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) +#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) +#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) +#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) +#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) + +#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) + +#define NUMBER_OF_SMB2_COMMANDS 0x0013 + +/* + * SMB2 Header Definition + * + * "MBZ" : Must be Zero + * "BB" : BugBug, Something to check/review/analyze later + * "PDU" : "Protocol Data Unit" (ie a network "frame") + * + */ + +#define __SMB2_HEADER_STRUCTURE_SIZE 64 +#define SMB2_HEADER_STRUCTURE_SIZE \ + cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE) + +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) +#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) + +/* + * SMB2 flag definitions + */ +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) +#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ + +/* See MS-SMB2 section 2.2.1 */ +struct smb2_hdr { + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le32 Status; /* Error from server */ + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + union { + struct { + __le32 ProcessId; + __le32 TreeId; + } __packed SyncId; + __le64 AsyncId; + } __packed Id; + __le64 SessionId; + __u8 Signature[16]; +} __packed; + +struct smb2_pdu { + struct smb2_hdr hdr; + __le16 StructureSize2; /* size of wct area (varies, request specific) */ +} __packed; + +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 + +/* 
Transform flags (for 3.0 dialect this flag indicates CCM */ +#define TRANSFORM_FLAG_ENCRYPTED 0x0001 +struct smb2_transform_hdr { + __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ + __u8 Signature[16]; + __u8 Nonce[16]; + __le32 OriginalMessageSize; + __u16 Reserved1; + __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */ + __le64 SessionId; +} __packed; + + +/* See MS-SMB2 2.2.42 */ +struct smb2_compression_transform_hdr_unchained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + __le16 CompressionAlgorithm; + __le16 Flags; + __le16 Length; /* if chained it is length, else offset */ +} __packed; + +/* See MS-SMB2 2.2.42.1 */ +#define SMB2_COMPRESSION_FLAG_NONE 0x0000 +#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 + +struct compression_payload_header { + __le16 CompressionAlgorithm; + __le16 Flags; + __le32 Length; /* length of compressed playload including field below if present */ + /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ +} __packed; + +/* See MS-SMB2 2.2.42.2 */ +struct smb2_compression_transform_hdr_chained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + /* struct compression_payload_header[] */ +} __packed; + +/* See MS-SMB2 2.2.42.2.2 */ +struct compression_pattern_payload_v1 { + __le16 Pattern; + __le16 Reserved1; + __le16 Reserved2; + __le32 Repetitions; +} __packed; + +/* See MS-SMB2 section 2.2.9.2 */ +/* Context Types */ +#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000 +#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001) + +struct tree_connect_contexts { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + __u8 Data[]; +} __packed; + +/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */ +struct smb3_blob_data { + __le16 BlobSize; + __u8 BlobData[]; +} __packed; + +/* Valid values for Attr */ +#define SE_GROUP_MANDATORY 0x00000001 +#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002 +#define SE_GROUP_ENABLED 0x00000004 +#define SE_GROUP_OWNER 0x00000008 +#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010 +#define SE_GROUP_INTEGRITY 0x00000020 +#define SE_GROUP_INTEGRITY_ENABLED 0x00000040 +#define SE_GROUP_RESOURCE 0x20000000 +#define SE_GROUP_LOGON_ID 0xC0000000 + +/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ + +struct sid_array_data { + __le16 SidAttrCount; + /* SidAttrList - array of sid_attr_data structs */ +} __packed; + +struct luid_attr_data { + +} __packed; + +/* + * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5 + * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA + */ + +struct privilege_array_data { + __le16 PrivilegeCount; + /* array of privilege_data structs */ +} __packed; + +struct remoted_identity_tcon_context { + __le16 TicketType; /* must be 0x0001 */ + __le16 TicketSize; /* total size of this struct */ + __le16 User; /* offset to SID_ATTR_DATA struct with user info */ + __le16 UserName; /* offset to null terminated Unicode username string */ + __le16 Domain; /* offset to null terminated Unicode domain name */ + __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */ + __le16 RestrictedGroups; /* similar to above */ + __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */ + __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */ + __le16 Owner; /* offset to BLOB_DATA struct */ + __le16 DefaultDacl; /* offset to BLOB_DATA struct */ + __le16 DeviceGroups; /* offset to 
SID_ARRAY_DATA struct */ + __le16 UserClaims; /* offset to BLOB_DATA struct */ + __le16 DeviceClaims; /* offset to BLOB_DATA struct */ + __u8 TicketInfo[]; /* variable length buf - remoted identity data */ +} __packed; + +struct smb2_tree_connect_req_extension { + __le32 TreeConnectContextOffset; + __le16 TreeConnectContextCount; + __u8 Reserved[10]; + __u8 PathName[]; /* variable sized array */ + /* followed by array of TreeConnectContexts */ +} __packed; + +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001) +#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002) +#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004) + +struct smb2_tree_connect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 Flags; /* Flags in SMB3.1.1 */ + __le16 PathOffset; + __le16 PathLength; + __u8 Buffer[1]; /* variable length */ +} __packed; + +/* Possible ShareType values */ +#define SMB2_SHARE_TYPE_DISK 0x01 +#define SMB2_SHARE_TYPE_PIPE 0x02 +#define SMB2_SHARE_TYPE_PRINT 0x03 + +/* + * Possible ShareFlags - exactly one and only one of the first 4 caching flags + * must be set (any of the remaining, SHI1005, flags may be set individually + * or in combination. + */ +#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 +#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 +#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 +#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 +#define SHI1005_FLAGS_DFS 0x00000001 +#define SHI1005_FLAGS_DFS_ROOT 0x00000002 +#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 +#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 +#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 +#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ +#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ +#define SHI1005_FLAGS_ALL 0x0014FF33 + +/* Possible share capabilities */ +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ +#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ +#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ +#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ +#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ +#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */ + +struct smb2_tree_connect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 16 */ + __u8 ShareType; /* see below */ + __u8 Reserved; + __le32 ShareFlags; /* see below */ + __le32 Capabilities; /* see below */ + __le32 MaximalAccess; +} __packed; + +struct smb2_tree_disconnect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_disconnect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + + +#endif /* _COMMON_SMB2PDU_H */ From fc0b3844694948a945595315a01063040bbe7855 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 8 Sep 2021 12:10:13 +1000 Subject: [PATCH 252/512] cifs: move NEGOTIATE_PROTOCOL definitions out into the common area Signed-off-by: Ronnie Sahlberg Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 4 +- fs/cifs/smb2pdu.h | 220 
------------------------------------ fs/smbfs_common/smb2pdu.h | 229 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 231 insertions(+), 222 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index dbbd804b94eb..0b51372c9719 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -413,8 +413,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; pneg_ctxt->DataLength = cpu_to_le16(38); pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); - pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE); - get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 739e98d117ed..2a95768e33b4 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -115,226 +115,6 @@ struct share_redirect_error_context_rsp { /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */ } __packed; -#define SMB2_CLIENT_GUID_SIZE 16 - -struct smb2_negotiate_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 DialectCount; - __le16 SecurityMode; - __le16 Reserved; /* MBZ */ - __le32 Capabilities; - __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ - __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ - __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ - __le16 Reserved2; - __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ -} __packed; - -/* Dialects */ -#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ -#define SMB20_PROT_ID 0x0202 -#define SMB21_PROT_ID 0x0210 -#define SMB30_PROT_ID 0x0300 -#define SMB302_PROT_ID 0x0302 -#define SMB311_PROT_ID 0x0311 -#define BAD_PROT_ID 0xFFFF - -/* SecurityMode flags */ -#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 -#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 -#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 - -/* Capabilities flags */ -#define SMB2_GLOBAL_CAP_DFS 0x00000001 -#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ -/* Internal types */ -#define SMB2_NT_FIND 0x00100000 -#define SMB2_LARGE_FILES 0x00200000 - - -/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */ -#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) -#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) -#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) -#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) -#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) -#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) -#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) -#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) - -struct smb2_neg_context { - __le16 ContextType; - __le16 DataLength; - __le32 Reserved; - /* Followed by array of data. 
NOTE: some servers require padding to 8 byte boundary */ -} __packed; - -#define SMB311_LINUX_CLIENT_SALT_SIZE 32 -/* Hash Algorithm Types */ -#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) -#define SMB2_PREAUTH_HASH_SIZE 64 - -/* - * SaltLength that the server send can be zero, so the only three required - * fields (all __le16) end up six bytes total, so the minimum context data len - * in the response is six bytes which accounts for - * - * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm. - */ -#define MIN_PREAUTH_CTXT_DATA_LEN 6 - -struct smb2_preauth_neg_context { - __le16 ContextType; /* 1 */ - __le16 DataLength; - __le32 Reserved; - __le16 HashAlgorithmCount; /* 1 */ - __le16 SaltLength; - __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ - __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE]; -} __packed; - -/* Encryption Algorithms Ciphers */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) -#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) -/* we currently do not request AES256_CCM since presumably GCM faster */ -#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) -#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) - -/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ -#define MIN_ENCRYPT_CTXT_DATA_LEN 4 -struct smb2_encryption_neg_context { - __le16 ContextType; /* 2 */ - __le16 DataLength; - __le32 Reserved; - /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ - __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ - __le16 Ciphers[3]; -} __packed; - -/* See MS-SMB2 2.2.3.1.3 */ -#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000) -#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) -#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) -#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) -/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ -#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ - -/* Compression Flags */ -#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) -#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001) - -struct smb2_compression_capabilities_context { - __le16 ContextType; /* 3 */ - __le16 DataLength; - __u32 Reserved; - __le16 CompressionAlgorithmCount; - __u16 Padding; - __u32 Flags; - __le16 CompressionAlgorithms[3]; - __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */ - /* Check if pad needed */ -} __packed; - -/* - * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4. 
- * Its struct simply contains NetName, an array of Unicode characters - */ -struct smb2_netname_neg_context { - __le16 ContextType; /* 5 */ - __le16 DataLength; - __le32 Reserved; - __le16 NetName[]; /* hostname of target converted to UCS-2 */ -} __packed; - -/* - * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 - * and 2.2.4.1.5 - */ - -/* Flags */ -#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 - -struct smb2_transport_capabilities_context { - __le16 ContextType; /* 6 */ - __le16 DataLength; - __u32 Reserved; - __le32 Flags; - __u32 Pad; -} __packed; - -/* - * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 - * and 2.2.4.1.6 - */ - -/* RDMA Transform IDs */ -#define SMB2_RDMA_TRANSFORM_NONE 0x0000 -#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 -#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 - -struct smb2_rdma_transform_capabilities_context { - __le16 ContextType; /* 7 */ - __le16 DataLength; - __u32 Reserved; - __le16 TransformCount; - __u16 Reserved1; - __u32 Reserved2; - __le16 RDMATransformIds[]; -} __packed; - -/* - * For signing capabilities context see MS-SMB2 2.2.3.1.7 - * and 2.2.4.1.7 - */ - -/* Signing algorithms */ -#define SIGNING_ALG_HMAC_SHA256 0 -#define SIGNING_ALG_AES_CMAC 1 -#define SIGNING_ALG_AES_GMAC 2 - -struct smb2_signing_capabilities { - __le16 ContextType; /* 8 */ - __le16 DataLength; - __u32 Reserved; - __le16 SigningAlgorithmCount; - __le16 SigningAlgorithms[]; - /* Followed by padding to 8 byte boundary (required by some servers) */ -} __packed; - -#define POSIX_CTXT_DATA_LEN 16 -struct smb2_posix_neg_context { - __le16 ContextType; /* 0x100 */ - __le16 DataLength; - __le32 Reserved; - __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ -} __packed; - -struct smb2_negotiate_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 65 */ - __le16 SecurityMode; - __le16 DialectRevision; - __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ - __u8 ServerGUID[16]; - __le32 Capabilities; - __le32 MaxTransactSize; - __le32 MaxReadSize; - __le32 MaxWriteSize; - __le64 SystemTime; /* MBZ */ - __le64 ServerStartTime; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - /* Flags */ #define SMB2_SESSION_REQ_FLAG_BINDING 0x01 #define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index f191ed64c1ee..a1f661a1b89d 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -315,4 +315,233 @@ struct smb2_tree_disconnect_rsp { } __packed; +/* + * SMB2_NEGOTIATE_PROTOCOL See MS-SMB2 section 2.2.3 + */ +/* SecurityMode flags */ +#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 +#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001) +#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 +#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002) +#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 + +/* Capabilities flags */ +#define SMB2_GLOBAL_CAP_DFS 0x00000001 +#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ 
+/* Internal types */ +#define SMB2_NT_FIND 0x00100000 +#define SMB2_LARGE_FILES 0x00200000 + +#define SMB2_CLIENT_GUID_SIZE 16 +#define SMB2_CREATE_GUID_SIZE 16 + +/* Dialects */ +#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ +#define SMB20_PROT_ID 0x0202 +#define SMB21_PROT_ID 0x0210 +#define SMB2X_PROT_ID 0x02FF +#define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 +#define BAD_PROT_ID 0xFFFF + +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) +#define SMB2_PREAUTH_HASH_SIZE 64 + +/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */ +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) +#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) +#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) +#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) +#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) + +struct smb2_neg_context { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */ +} __packed; + +/* + * SaltLength that the server send can be zero, so the only three required + * fields (all __le16) end up six bytes total, so the minimum context data len + * in the response is six bytes which accounts for + * + * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm. + */ +#define MIN_PREAUTH_CTXT_DATA_LEN 6 + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) +#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) + +/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ +#define MIN_ENCRYPT_CTXT_DATA_LEN 4 +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ + __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ + __le16 Ciphers[]; +} __packed; + +/* See MS-SMB2 2.2.3.1.3 */ +#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000) +#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) +#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) +#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) +/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ +#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ + +/* Compression Flags */ +#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) +#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001) + +struct smb2_compression_capabilities_context { + __le16 ContextType; /* 3 */ + __le16 DataLength; + __le32 Reserved; + __le16 CompressionAlgorithmCount; + __le16 Padding; + __le32 Flags; + __le16 CompressionAlgorithms[3]; + __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */ + /* Check if pad needed */ +} __packed; + +/* + * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4. 
+ * Its struct simply contains NetName, an array of Unicode characters + */ +struct smb2_netname_neg_context { + __le16 ContextType; /* 5 */ + __le16 DataLength; + __le32 Reserved; + __le16 NetName[]; /* hostname of target converted to UCS-2 */ +} __packed; + +/* + * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 + * and 2.2.4.1.5 + */ + +/* Flags */ +#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 + +struct smb2_transport_capabilities_context { + __le16 ContextType; /* 6 */ + __le16 DataLength; + __u32 Reserved; + __le32 Flags; + __u32 Pad; +} __packed; + +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.6 + */ + +/* RDMA Transform IDs */ +#define SMB2_RDMA_TRANSFORM_NONE 0x0000 +#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 + +struct smb2_rdma_transform_capabilities_context { + __le16 ContextType; /* 7 */ + __le16 DataLength; + __u32 Reserved; + __le16 TransformCount; + __u16 Reserved1; + __u32 Reserved2; + __le16 RDMATransformIds[]; +} __packed; + +/* + * For signing capabilities context see MS-SMB2 2.2.3.1.7 + * and 2.2.4.1.7 + */ + +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 0 +#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0) +#define SIGNING_ALG_AES_CMAC 1 +#define SIGNING_ALG_AES_CMAC_LE cpu_to_le16(1) +#define SIGNING_ALG_AES_GMAC 2 +#define SIGNING_ALG_AES_GMAC_LE cpu_to_le16(2) + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __le32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; + /* Followed by padding to 8 byte boundary (required by some servers) */ +} __packed; + +#define POSIX_CTXT_DATA_LEN 16 +struct smb2_posix_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ +} __packed; + +struct smb2_negotiate_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 DialectCount; + __le16 SecurityMode; + __le16 Reserved; /* MBZ */ + __le32 Capabilities; + __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. 
MBZ earlier */ + __le16 Reserved2; + __le16 Dialects[]; +} __packed; + +struct smb2_negotiate_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 65 */ + __le16 SecurityMode; + __le16 DialectRevision; + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ + __u8 ServerGUID[16]; + __le32 Capabilities; + __le32 MaxTransactSize; + __le32 MaxReadSize; + __le32 MaxWriteSize; + __le64 SystemTime; /* MBZ */ + __le64 ServerStartTime; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + + #endif /* _COMMON_SMB2PDU_H */ From d8d9de532de9fa3f3ee0c1c96c42da9507fbade6 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 8 Sep 2021 12:10:14 +1000 Subject: [PATCH 253/512] cifs: Move more definitions into the shared area Move SMB2_SessionSetup, SMB2_Close, SMB2_Read, SMB2_Write and SMB2_ChangeNotify commands into smbfs_common/smb2pdu.h Signed-off-by: Ronnie Sahlberg Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 64 +++++----- fs/cifs/smb2pdu.h | 197 ------------------------------- fs/smbfs_common/smb2pdu.h | 241 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 276 insertions(+), 226 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0b51372c9719..4fe49b007651 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1260,7 +1260,7 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) * if reconnect, we need to send previous sess id * otherwise it is 0 */ - req->PreviousSessionId = sess_data->previous_session; + req->PreviousSessionId = cpu_to_le64(sess_data->previous_session); req->Flags = 0; /* MBZ */ } @@ -3234,8 +3234,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); if (query_attrs) req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; else @@ -3598,8 +3598,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); /* See note 354 of MS-SMB2, 64K max */ req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); @@ -3821,8 +3821,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -3888,7 +3888,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, unsigned int remaining_bytes, int request_type) { int rc = -EACCES; - struct smb2_read_plain_req *req = NULL; + struct smb2_read_req *req = NULL; struct smb2_hdr *shdr; struct TCP_Server_Info *server = io_parms->server; @@ -3903,8 +3903,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, shdr = &req->hdr; shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = io_parms->persistent_fid; - req->VolatileFileId = io_parms->volatile_fid; + req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); + req->VolatileFileId = 
cpu_to_le64(io_parms->volatile_fid); req->ReadChannelInfoOffset = 0; /* reserved */ req->ReadChannelInfoLength = 0; /* reserved */ req->Channel = 0; /* reserved */ @@ -3938,7 +3938,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; req->ReadChannelInfoOffset = - cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer)); + cpu_to_le16(offsetof(struct smb2_read_req, Buffer)); req->ReadChannelInfoLength = cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; @@ -3964,8 +3964,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, */ shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF); - req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; + req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); } } if (remaining_bytes > io_parms->length) @@ -4142,7 +4142,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, { struct smb_rqst rqst; int resp_buftype, rc; - struct smb2_read_plain_req *req = NULL; + struct smb2_read_req *req = NULL; struct smb2_read_rsp *rsp = NULL; struct kvec iov[1]; struct kvec rsp_iov; @@ -4176,19 +4176,22 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); - trace_smb3_read_err(xid, req->PersistentFileId, + trace_smb3_read_err(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); } else - trace_smb3_read_done(xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, 0); + trace_smb3_read_done(xid, + le64_to_cpu(req->PersistentFileId), + io_parms->tcon->tid, ses->Suid, + io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); cifs_small_buf_release(req); return rc == -ENODATA ? 
0 : rc; } else - trace_smb3_read_done(xid, req->PersistentFileId, + trace_smb3_read_done(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); @@ -4330,8 +4333,8 @@ smb2_async_writev(struct cifs_writedata *wdata, shdr = (struct smb2_hdr *)req; shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); - req->PersistentFileId = wdata->cfile->fid.persistent_fid; - req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid); + req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid); req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4428,7 +4431,8 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata, flags, &wdata->credits); if (rc) { - trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, + trace_smb3_write_err(0 /* no xid */, + le64_to_cpu(req->PersistentFileId), tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, rc); kref_put(&wdata->refcount, release); @@ -4481,8 +4485,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = io_parms->persistent_fid; - req->VolatileFileId = io_parms->volatile_fid; + req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); + req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4510,7 +4514,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { - trace_smb3_write_err(xid, req->PersistentFileId, + trace_smb3_write_err(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, io_parms->length, rc); @@ -4518,10 +4523,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); - trace_smb3_write_done(xid, req->PersistentFileId, - io_parms->tcon->tid, - io_parms->tcon->ses->Suid, - io_parms->offset, *nbytes); + trace_smb3_write_done(xid, + le64_to_cpu(req->PersistentFileId), + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, *nbytes); } cifs_small_buf_release(req); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 2a95768e33b4..56f3cc568028 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -115,48 +115,6 @@ struct share_redirect_error_context_rsp { /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */ } __packed; -/* Flags */ -#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 -#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 - -struct smb2_sess_setup_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 25 */ - __u8 Flags; - __u8 SecurityMode; - __le32 Capabilities; - __le32 Channel; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __u64 PreviousSessionId; - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -/* Currently defined SessionFlags */ -#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 -#define SMB2_SESSION_FLAG_IS_NULL 0x0002 -#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 -struct smb2_sess_setup_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 SessionFlags; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -struct smb2_logoff_req { - struct 
smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_logoff_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - /* File Attrubutes */ #define FILE_ATTRIBUTE_READONLY 0x00000001 #define FILE_ATTRIBUTE_HIDDEN 0x00000002 @@ -720,161 +678,6 @@ struct smb2_ioctl_rsp { /* char * buffer[] */ } __packed; -/* Currently defined values for close flags */ -#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) -struct smb2_close_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __le16 Flags; - __le32 Reserved; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ -} __packed; - -/* - * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) - */ -#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 - -struct smb2_close_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* 60 */ - __le16 Flags; - __le32 Reserved; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; - __le32 Attributes; -} __packed; - -struct smb2_flush_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __le16 Reserved1; - __le32 Reserved2; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ -} __packed; - -struct smb2_flush_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; - __le16 Reserved; -} __packed; - -/* For read request Flags field below, following flag is defined for SMB3.02 */ -#define SMB2_READFLAG_READ_UNBUFFERED 0x01 -#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */ - -/* Channel field for read and write: exactly one of following flags can be set*/ -#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) -#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ -#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ -#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */ - -/* SMB2 read request without RFC1001 length at the beginning */ -struct smb2_read_plain_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 49 */ - __u8 Padding; /* offset from start of SMB2 header to place read */ - __u8 Flags; /* MBZ unless SMB3.02 or later */ - __le32 Length; - __le64 Offset; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 MinimumCount; - __le32 Channel; /* MBZ except for SMB3 or later */ - __le32 RemainingBytes; - __le16 ReadChannelInfoOffset; - __le16 ReadChannelInfoLength; - __u8 Buffer[1]; -} __packed; - -/* Read flags */ -#define SMB2_READFLAG_RESPONSE_NONE 0x00000000 -#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 - -struct smb2_read_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 17 */ - __u8 DataOffset; - __u8 Reserved; - __le32 DataLength; - __le32 DataRemaining; - __u32 Flags; - __u8 Buffer[1]; -} __packed; - -/* For write request Flags field below the following flags are defined: */ -#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ -#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ - -struct smb2_write_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 49 */ - __le16 DataOffset; /* offset from start of SMB2 header to write data */ - __le32 Length; - __le64 Offset; - __u64 PersistentFileId; /* opaque 
endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 Channel; /* MBZ unless SMB3.02 or later */ - __le32 RemainingBytes; - __le16 WriteChannelInfoOffset; - __le16 WriteChannelInfoLength; - __le32 Flags; - __u8 Buffer[1]; -} __packed; - -struct smb2_write_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 17 */ - __u8 DataOffset; - __u8 Reserved; - __le32 DataLength; - __le32 DataRemaining; - __u32 Reserved2; - __u8 Buffer[1]; -} __packed; - -/* notify flags */ -#define SMB2_WATCH_TREE 0x0001 - -/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */ -#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 -#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 -#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 -#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 -#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 -#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 -#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 -#define FILE_NOTIFY_CHANGE_EA 0x00000080 -#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 -#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 -#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 -#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 - -struct smb2_change_notify_req { - struct smb2_hdr hdr; - __le16 StructureSize; - __le16 Flags; - __le32 OutputBufferLength; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 CompletionFilter; - __u32 Reserved; -} __packed; - -struct smb2_change_notify_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; /* array of file notify structs */ -} __packed; - #define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 #define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 #define SMB2_LOCKFLAG_UNLOCK 0x0004 diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index a1f661a1b89d..0d9c3ebdb773 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -544,4 +544,245 @@ struct smb2_negotiate_rsp { } __packed; +/* + * SMB2_SESSION_SETUP See MS-SMB2 section 2.2.5 + */ +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + +struct smb2_sess_setup_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 25 */ + __u8 Flags; + __u8 SecurityMode; + __le32 Capabilities; + __le32 Channel; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le64 PreviousSessionId; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +/* Currently defined SessionFlags */ +#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 +#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001) +#define SMB2_SESSION_FLAG_IS_NULL 0x0002 +#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002) +#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 +#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004) + +struct smb2_sess_setup_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 SessionFlags; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + + +/* + * SMB2_LOGOFF See MS-SMB2 section 2.2.7 + */ +struct smb2_logoff_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_logoff_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + + +/* + * SMB2_CLOSE See MS-SMB2 section 2.2.15 + */ +/* Currently defined values for close flags */ 
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) +struct smb2_close_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Flags; + __le32 Reserved; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ +} __packed; + +/* + * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) + */ +#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 + +struct smb2_close_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* 60 */ + __le16 Flags; + __le32 Reserved; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; + __le32 Attributes; +} __packed; + + +/* + * SMB2_READ See MS-SMB2 section 2.2.19 + */ +/* For read request Flags field below, following flag is defined for SMB3.02 */ +#define SMB2_READFLAG_READ_UNBUFFERED 0x01 +#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */ + +/* Channel field for read and write: exactly one of following flags can be set*/ +#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) +#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) +#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) + +/* SMB2 read request without RFC1001 length at the beginning */ +struct smb2_read_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __u8 Padding; /* offset from start of SMB2 header to place read */ + __u8 Flags; /* MBZ unless SMB3.02 or later */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 MinimumCount; + __le32 Channel; /* MBZ except for SMB3 or later */ + __le32 RemainingBytes; + __le16 ReadChannelInfoOffset; + __le16 ReadChannelInfoLength; + __u8 Buffer[1]; +} __packed; + +/* Read flags */ +#define SMB2_READFLAG_RESPONSE_NONE cpu_to_le32(0x00000000) +#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM cpu_to_le32(0x00000001) + +struct smb2_read_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __le32 Flags; + __u8 Buffer[1]; +} __packed; + + +/* + * SMB2_WRITE See MS-SMB2 section 2.2.21 + */ +/* For write request Flags field below the following flags are defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ +#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ + +struct smb2_write_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 DataOffset; /* offset from start of SMB2 header to write data */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ + __le32 Channel; /* MBZ unless SMB3.02 or later */ + __le32 RemainingBytes; + __le16 WriteChannelInfoOffset; + __le16 WriteChannelInfoLength; + __le32 Flags; + __u8 Buffer[1]; +} __packed; + +struct smb2_write_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + + +/* + * SMB2_FLUSH See MS-SMB2 section 2.2.17 + */ +struct smb2_flush_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Reserved1; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; +} __packed; + +struct smb2_flush_rsp { + struct smb2_hdr hdr; + __le16 
StructureSize; + __le16 Reserved; +} __packed; + + +/* + * SMB2_NOTIFY See MS-SMB2 section 2.2.35 + */ +/* notify flags */ +#define SMB2_WATCH_TREE 0x0001 + +/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +/* SMB2 Notify Action Flags */ +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 +#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009 + +struct smb2_change_notify_req { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Flags; + __le32 OutputBufferLength; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ + __le32 CompletionFilter; + __u32 Reserved; +} __packed; + +struct smb2_change_notify_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; /* array of file notify structs */ +} __packed; + + + #endif /* _COMMON_SMB2PDU_H */ From c462870bf8547d9cefa2e89abd0f78599c9786fe Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 8 Sep 2021 12:10:15 +1000 Subject: [PATCH 254/512] cifs: Move SMB2_Create definitions to the shared area Move all SMB2_Create definitions (except contexts) into the shared area. 
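For illustration only (not part of this change), a minimal sketch of how a
consumer of the now-shared header might fill in struct smb2_create_req using
the shared little-endian helper values; "req" and "name_len" are hypothetical,
and the particular access/disposition choices are just an example, not the
client's actual open path:

	req->StructureSize        = cpu_to_le16(57);
	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE;
	req->ImpersonationLevel   = IL_IMPERSONATION;
	req->DesiredAccess        = FILE_READ_DATA_LE | FILE_READ_ATTRIBUTES_LE;
	req->ShareAccess          = FILE_SHARE_ALL_LE;
	req->CreateDisposition    = FILE_OPEN_LE;
	req->CreateOptions        = FILE_NON_DIRECTORY_FILE_LE;
	req->NameOffset           = cpu_to_le16(offsetof(struct smb2_create_req,
							  Buffer));
	req->NameLength           = cpu_to_le16(name_len);

Since every on-the-wire field in the shared struct is declared __le16/__le32/
__le64, host-endian values must pass through cpu_to_le*() as above; the file
IDs that were previously opaque __u64 follow the same rule now that they are
__le64.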
Signed-off-by: Ronnie Sahlberg Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- fs/cifs/smb2misc.c | 4 +- fs/cifs/smb2ops.c | 8 +- fs/cifs/smb2pdu.c | 13 ++- fs/cifs/smb2pdu.h | 165 ------------------------------- fs/smbfs_common/smb2pdu.h | 201 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 215 insertions(+), 176 deletions(-) diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index ce7d6cc652b3..cdcdef32759e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -832,8 +832,8 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve rc = __smb2_handle_cancelled_cmd(tcon, le16_to_cpu(hdr->Command), le64_to_cpu(hdr->MessageId), - rsp->PersistentFileId, - rsp->VolatileFileId); + le64_to_cpu(rsp->PersistentFileId), + le64_to_cpu(rsp->VolatileFileId)); if (rc) cifs_put_tcon(tcon); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2ad223d2db20..7acf71defea7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -885,8 +885,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, atomic_inc(&tcon->num_remote_opens); o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - oparms.fid->persistent_fid = o_rsp->PersistentFileId; - oparms.fid->volatile_fid = o_rsp->VolatileFileId; + oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId); + oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId); #ifdef CONFIG_CIFS_DEBUG2 oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ @@ -2395,8 +2395,8 @@ again: cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); goto qdf_free; } - fid->persistent_fid = op_rsp->PersistentFileId; - fid->volatile_fid = op_rsp->VolatileFileId; + fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId); + fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId); /* Anything else than ENODATA means a genuine error */ if (rc && rc != -ENODATA) { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 4fe49b007651..d2ecb2ea37c0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2670,11 +2670,13 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, } rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; - trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, + trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId), + tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); - SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); + SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId), + le64_to_cpu(rsp->VolatileFileId)); /* Eventually save off posix specific response info and timestaps */ @@ -2941,13 +2943,14 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } goto creat_exit; } else - trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, + trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId), + tcon->tid, ses->Suid, oparms->create_options, oparms->desired_access); atomic_inc(&tcon->num_remote_opens); - oparms->fid->persistent_fid = rsp->PersistentFileId; - oparms->fid->volatile_fid = rsp->VolatileFileId; + oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId); + oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId); oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 56f3cc568028..33cfd0a1adf1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -115,120 +115,6 @@ struct share_redirect_error_context_rsp 
{ /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */ } __packed; -/* File Attrubutes */ -#define FILE_ATTRIBUTE_READONLY 0x00000001 -#define FILE_ATTRIBUTE_HIDDEN 0x00000002 -#define FILE_ATTRIBUTE_SYSTEM 0x00000004 -#define FILE_ATTRIBUTE_DIRECTORY 0x00000010 -#define FILE_ATTRIBUTE_ARCHIVE 0x00000020 -#define FILE_ATTRIBUTE_NORMAL 0x00000080 -#define FILE_ATTRIBUTE_TEMPORARY 0x00000100 -#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200 -#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400 -#define FILE_ATTRIBUTE_COMPRESSED 0x00000800 -#define FILE_ATTRIBUTE_OFFLINE 0x00001000 -#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 -#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 -#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 -#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 - -/* Oplock levels */ -#define SMB2_OPLOCK_LEVEL_NONE 0x00 -#define SMB2_OPLOCK_LEVEL_II 0x01 -#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 -#define SMB2_OPLOCK_LEVEL_BATCH 0x09 -#define SMB2_OPLOCK_LEVEL_LEASE 0xFF -/* Non-spec internal type */ -#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 - -/* Desired Access Flags */ -#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) -#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) -#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) -#define FILE_READ_EA_LE cpu_to_le32(0x00000008) -#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) -#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) -#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) -#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) -#define FILE_DELETE_LE cpu_to_le32(0x00010000) -#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) -#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) -#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) -#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) -#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) -#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000) -#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) -#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) -#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) -#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) - -/* ShareAccess Flags */ -#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) -#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) -#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) -#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) - -/* CreateDisposition Flags */ -#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) -#define FILE_OPEN_LE cpu_to_le32(0x00000001) -#define FILE_CREATE_LE cpu_to_le32(0x00000002) -#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) -#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) -#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) - -/* CreateOptions Flags */ -#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) -/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ -#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) -#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) -#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008) -#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) -#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020) -#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) -#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) -#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) -#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) -#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) -#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) -#define FILE_OPEN_FOR_BACKUP_INTENT_LE 
cpu_to_le32(0x00004000) -#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) -#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000) -#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) -#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) -#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000) - -#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ - | FILE_READ_ATTRIBUTES_LE) -#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ - | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) -#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) - -/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */ -#define IL_ANONYMOUS cpu_to_le32(0x00000000) -#define IL_IDENTIFICATION cpu_to_le32(0x00000001) -#define IL_IMPERSONATION cpu_to_le32(0x00000002) -#define IL_DELEGATE cpu_to_le32(0x00000003) - -/* Create Context Values */ -#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ -#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ -#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" -#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" -#define SMB2_CREATE_ALLOCATION_SIZE "AISi" -#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" -#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" -#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" -#define SMB2_CREATE_REQUEST_LEASE "RqLs" -#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" -#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 -#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010 -#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 -#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C - -/* Flag (SMB3 open response) values */ -#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 - /* * Maximum number of iovs we need for an open/create request. 
* [0] : struct smb2_create_req @@ -242,26 +128,6 @@ struct share_redirect_error_context_rsp { */ #define SMB2_CREATE_IOV_SIZE 8 -struct smb2_create_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __u8 SecurityFlags; - __u8 RequestedOplockLevel; - __le32 ImpersonationLevel; - __le64 SmbCreateFlags; - __le64 Reserved; - __le32 DesiredAccess; - __le32 FileAttributes; - __le32 ShareAccess; - __le32 CreateDisposition; - __le32 CreateOptions; - __le16 NameOffset; - __le16 NameLength; - __le32 CreateContextsOffset; - __le32 CreateContextsLength; - __u8 Buffer[]; -} __packed; - /* * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + * 88 (fixed part of create response) + 520 (path) + 208 (contexts) + @@ -269,37 +135,6 @@ struct smb2_create_req { */ #define MAX_SMB2_CREATE_RESPONSE_SIZE 880 -struct smb2_create_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 89 */ - __u8 OplockLevel; - __u8 Flag; /* 0x01 if reparse point */ - __le32 CreateAction; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; - __le64 EndofFile; - __le32 FileAttributes; - __le32 Reserved2; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 CreateContextsOffset; - __le32 CreateContextsLength; - __u8 Buffer[1]; -} __packed; - -struct create_context { - __le32 Next; - __le16 NameOffset; - __le16 NameLength; - __le16 Reserved; - __le16 DataOffset; - __le32 DataLength; - __u8 Buffer[]; -} __packed; - #define SMB2_LEASE_READ_CACHING_HE 0x01 #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 0d9c3ebdb773..7ccadcbe684b 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -784,5 +784,206 @@ struct smb2_change_notify_rsp { } __packed; +/* + * SMB2_CREATE See MS-SMB2 section 2.2.13 + */ +/* Oplock levels */ +#define SMB2_OPLOCK_LEVEL_NONE 0x00 +#define SMB2_OPLOCK_LEVEL_II 0x01 +#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 +#define SMB2_OPLOCK_LEVEL_BATCH 0x09 +#define SMB2_OPLOCK_LEVEL_LEASE 0xFF +/* Non-spec internal type */ +#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 + +/* Impersonation Levels. 
See MS-WPO section 9.7 and MSDN-IMPERS */ +#define IL_ANONYMOUS cpu_to_le32(0x00000000) +#define IL_IDENTIFICATION cpu_to_le32(0x00000001) +#define IL_IMPERSONATION cpu_to_le32(0x00000002) +#define IL_DELEGATE cpu_to_le32(0x00000003) + +/* File Attrubutes */ +#define FILE_ATTRIBUTE_READONLY 0x00000001 +#define FILE_ATTRIBUTE_HIDDEN 0x00000002 +#define FILE_ATTRIBUTE_SYSTEM 0x00000004 +#define FILE_ATTRIBUTE_DIRECTORY 0x00000010 +#define FILE_ATTRIBUTE_ARCHIVE 0x00000020 +#define FILE_ATTRIBUTE_NORMAL 0x00000080 +#define FILE_ATTRIBUTE_TEMPORARY 0x00000100 +#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200 +#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400 +#define FILE_ATTRIBUTE_COMPRESSED 0x00000800 +#define FILE_ATTRIBUTE_OFFLINE 0x00001000 +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 +#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 +#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 +#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 +#define FILE_ATTRIBUTE__MASK 0x00007FB7 + +#define FILE_ATTRIBUTE_READONLY_LE cpu_to_le32(0x00000001) +#define FILE_ATTRIBUTE_HIDDEN_LE cpu_to_le32(0x00000002) +#define FILE_ATTRIBUTE_SYSTEM_LE cpu_to_le32(0x00000004) +#define FILE_ATTRIBUTE_DIRECTORY_LE cpu_to_le32(0x00000010) +#define FILE_ATTRIBUTE_ARCHIVE_LE cpu_to_le32(0x00000020) +#define FILE_ATTRIBUTE_NORMAL_LE cpu_to_le32(0x00000080) +#define FILE_ATTRIBUTE_TEMPORARY_LE cpu_to_le32(0x00000100) +#define FILE_ATTRIBUTE_SPARSE_FILE_LE cpu_to_le32(0x00000200) +#define FILE_ATTRIBUTE_REPARSE_POINT_LE cpu_to_le32(0x00000400) +#define FILE_ATTRIBUTE_COMPRESSED_LE cpu_to_le32(0x00000800) +#define FILE_ATTRIBUTE_OFFLINE_LE cpu_to_le32(0x00001000) +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE cpu_to_le32(0x00002000) +#define FILE_ATTRIBUTE_ENCRYPTED_LE cpu_to_le32(0x00004000) +#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE cpu_to_le32(0x00008000) +#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000) +#define FILE_ATTRIBUTE_MASK_LE cpu_to_le32(0x00007FB7) + +/* Desired Access Flags */ +#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) +#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001) +#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) +#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) +#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004) +#define FILE_READ_EA_LE cpu_to_le32(0x00000008) +#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) +#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) +#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040) +#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) +#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) +#define FILE_DELETE_LE cpu_to_le32(0x00010000) +#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) +#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) +#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) +#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) +#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) +#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000) +#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) +#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) +#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) +#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) +#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF) + + +#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \ + FILE_READ_EA_LE | \ + FILE_GENERIC_READ_LE) +#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \ + FILE_APPEND_DATA_LE | \ + FILE_WRITE_EA_LE | \ + FILE_WRITE_ATTRIBUTES_LE | \ + FILE_GENERIC_WRITE_LE) + +/* ShareAccess Flags */ 
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) +#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) +#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) +#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) + +/* CreateDisposition Flags */ +#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) +#define FILE_OPEN_LE cpu_to_le32(0x00000001) +#define FILE_CREATE_LE cpu_to_le32(0x00000002) +#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) +#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) +#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) +#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007) + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \ + | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +/* CreateOptions Flags */ +#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) +/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ +#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) +#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) +#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008) +#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) +#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) +#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) +#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) +#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) +#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) +#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) +#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF) + +#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ + | FILE_READ_ATTRIBUTES_LE) +#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ + | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) +#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) + +/* Create Context Values */ +#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ +#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" +#define SMB2_CREATE_ALLOCATION_SIZE "AISi" +#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" +#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" +#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" +#define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" + +/* Flag (SMB3 open response) values */ +#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 + +struct create_context { + __le32 Next; + __le16 NameOffset; + __le16 NameLength; + __le16 Reserved; + __le16 DataOffset; + __le32 DataLength; + __u8 Buffer[]; +} __packed; + +struct smb2_create_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u8 SecurityFlags; + __u8 RequestedOplockLevel; + __le32 ImpersonationLevel; + __le64 SmbCreateFlags; + __le64 Reserved; + __le32 DesiredAccess; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le16 NameOffset; + __le16 NameLength; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[]; +} __packed; + +struct smb2_create_rsp { + struct smb2_hdr 
hdr; + __le16 StructureSize; /* Must be 89 */ + __u8 OplockLevel; + __u8 Flags; /* 0x01 if reparse point */ + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndofFile; + __le32 FileAttributes; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[1]; +} __packed; + #endif /* _COMMON_SMB2PDU_H */ From d7171cd1acf70eb949ece8ccc95be27b3dfcf4da Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 4 Nov 2021 15:56:37 -0500 Subject: [PATCH 255/512] smb3: add dynamic trace points for socket connection In debugging user problems with ip address/DNS issues with smb3 mounts, we sometimes needed additional info on the hostname and ip address. Add two tracepoints, one to show socket connection success and one for failures to connect to the socket. Sample output: mount.cifs-14551 [005] ..... 7636.547906: smb3_connect_done: conn_id=0x1 server=localhost addr=127.0.0.1:445 mount.cifs-14558 [004] ..... 7642.405413: smb3_connect_done: conn_id=0x2 server=smfrench.file.core.windows.net addr=52.239.158.232:445 mount.cifs-14741 [005] ..... 7818.490716: smb3_connect_done: conn_id=0x3 server=::1 addr=[::1]:445/0%0 mount.cifs-14810 [000] ..... 7966.380337: smb3_connect_err: rc=-101 conn_id=0x4 server=::2 addr=[::2]:445/0%0 mount.cifs-14810 [000] ..... 7966.380356: smb3_connect_err: rc=-101 conn_id=0x4 server=::2 addr=[::2]:139/0%0 mount.cifs-14818 [003] ..... 7986.771992: smb3_connect_done: conn_id=0x5 server=127.0.0.9 addr=127.0.0.9:445 mount.cifs-14825 [008] ..... 8008.178109: smb3_connect_err: rc=-115 conn_id=0x6 server=124.23.0.9 addr=124.23.0.9:445 mount.cifs-14825 [008] ..... 8013.298085: smb3_connect_err: rc=-115 conn_id=0x6 server=124.23.0.9 addr=124.23.0.9:139 cifsd-14553 [006] ..... 8036.735615: smb3_reconnect: conn_id=0x1 server=localhost current_mid=32 cifsd-14743 [010] ..... 8036.735644: smb3_reconnect: conn_id=0x3 server=::1 current_mid=29 cifsd-14743 [010] ..... 8039.921740: smb3_connect_err: rc=-111 conn_id=0x3 server=::1 addr=[::1]:445/0%0 cifsd-14553 [008] ..... 8042.993894: smb3_connect_err: rc=-111 conn_id=0x1 server=localhost addr=127.0.0.1:445 cifsd-14743 [010] ..... 8042.993894: smb3_connect_err: rc=-111 conn_id=0x3 server=::1 addr=[::1]:445/0%0 cifsd-14553 [008] ..... 8046.065824: smb3_connect_err: rc=-111 conn_id=0x1 server=localhost addr=127.0.0.1:445 cifsd-14743 [010] ..... 8046.065824: smb3_connect_err: rc=-111 conn_id=0x3 server=::1 addr=[::1]:445/0%0 cifsd-14553 [008] ..... 8049.137796: smb3_connect_done: conn_id=0x1 server=localhost addr=127.0.0.1:445 cifsd-14743 [010] ..... 
8049.137796: smb3_connect_done: conn_id=0x3 server=::1 addr=[::1]:445/0%0 Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/connect.c | 3 +- fs/cifs/trace.h | 71 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index da3b1fff8c4a..0abbff4e4135 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2649,11 +2649,12 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = 0; if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); + trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); sock_release(socket); server->ssocket = NULL; return rc; } - + trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr); if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index dafcb6ab050d..6cecf302dcfd 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -11,6 +11,8 @@ #define _CIFS_TRACE_H #include +#include +#include /* * Please use this 3-part article as a reference for writing new tracepoints: @@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); +DECLARE_EVENT_CLASS(smb3_connect_class, + TP_PROTO(char *hostname, + __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr), + TP_ARGS(hostname, conn_id, dst_addr), + TP_STRUCT__entry( + __string(hostname, hostname) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname, hostname); + ), + TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_CONNECT_EVENT(name) \ +DEFINE_EVENT(smb3_connect_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr), \ + TP_ARGS(hostname, conn_id, addr)) + +DEFINE_SMB3_CONNECT_EVENT(connect_done); + +DECLARE_EVENT_CLASS(smb3_connect_err_class, + TP_PROTO(char *hostname, __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr, int rc), + TP_ARGS(hostname, conn_id, dst_addr, rc), + TP_STRUCT__entry( + __string(hostname, hostname) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + __field(int, rc) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + __entry->rc = rc; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname, hostname); + ), + TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->rc, + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_CONNECT_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr, \ + int rc), \ + TP_ARGS(hostname, conn_id, addr, rc)) + +DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, __u64 conn_id, From baef114759a11b1c80ad8178da6a1f576500859e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:34:36 -0700 Subject: [PATCH 256/512] scripts/spelling.txt: add more spellings to spelling.txt Some of the more common spelling mistakes and typos that I've 
found while fixing up spelling mistakes in the kernel in the past few months. Link: https://lkml.kernel.org/r/20210907072941.7033-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 17fdc620d548..fd8f07317b8e 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -178,6 +178,7 @@ assum||assume assumtpion||assumption asuming||assuming asycronous||asynchronous +asychronous||asynchronous asynchnous||asynchronous asynchromous||asynchronous asymetric||asymmetric @@ -241,6 +242,7 @@ beter||better betweeen||between bianries||binaries bitmast||bitmask +bitwiedh||bitwidth boardcast||broadcast borad||board boundry||boundary @@ -265,7 +267,10 @@ calucate||calculate calulate||calculate cancelation||cancellation cancle||cancel +cant||can't +cant'||can't canot||cannot +cann't||can't capabilites||capabilities capabilties||capabilities capabilty||capability @@ -501,6 +506,7 @@ disble||disable disgest||digest disired||desired dispalying||displaying +dissable||disable diplay||display directon||direction direcly||directly @@ -595,6 +601,7 @@ exceded||exceeded exceds||exceeds exceeed||exceed excellant||excellent +exchnage||exchange execeeded||exceeded execeeds||exceeds exeed||exceed @@ -938,6 +945,7 @@ migrateable||migratable milliseonds||milliseconds minium||minimum minimam||minimum +minimun||minimum miniumum||minimum minumum||minimum misalinged||misaligned @@ -956,6 +964,7 @@ mmnemonic||mnemonic mnay||many modfiy||modify modifer||modifier +modul||module modulues||modules momery||memory memomry||memory @@ -1154,6 +1163,7 @@ programable||programmable programers||programmers programm||program programms||programs +progres||progress progresss||progress prohibitted||prohibited prohibitting||prohibiting @@ -1328,6 +1338,7 @@ servive||service setts||sets settting||setting shapshot||snapshot +shoft||shift shotdown||shutdown shoud||should shouldnt||shouldn't @@ -1439,6 +1450,7 @@ syfs||sysfs symetric||symmetric synax||syntax synchonized||synchronized +synchronization||synchronization synchronuously||synchronously syncronize||synchronize syncronized||synchronized @@ -1521,6 +1533,7 @@ unexpexted||unexpected unfortunatelly||unfortunately unifiy||unify uniterrupted||uninterrupted +uninterruptable||uninterruptible unintialized||uninitialized unitialized||uninitialized unkmown||unknown @@ -1553,6 +1566,7 @@ unuseful||useless unvalid||invalid upate||update upsupported||unsupported +useable||usable usefule||useful usefull||useful usege||usage @@ -1574,6 +1588,7 @@ varient||variant vaule||value verbse||verbose veify||verify +verfication||verification veriosn||version verisons||versions verison||version @@ -1586,6 +1601,7 @@ visiters||visitors vitual||virtual vunerable||vulnerable wakeus||wakeups +was't||wasn't wathdog||watchdog wating||waiting wiat||wait From 655edc52678d6f1c3c0253f49adbec5b50d716e3 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 5 Nov 2021 13:34:39 -0700 Subject: [PATCH 257/512] scripts/spelling.txt: fix "mistake" version of "synchronization" If both "mistake" version and "correction" version are the same, a warning message is created by checkpatch which is impossible to fix. 
But it was noticed that Colan Ian King created a commit e6c0a0889b80 ("ALSA: aloop: Fix spelling mistake "synchronization" -> "synchronization"") which suggests that this spelling mistake was fixed by replacing the word "synchronization" with itself. But the actual diff shows that the mistake in the code was "sychronization". It is rather likely that the "mistake" in spelling.txt should have been the latter. Link: https://lkml.kernel.org/r/20210926065529.6880-1-sven@narfation.org Fixes: 2e74c9433ba8 ("scripts/spelling.txt: add more spellings to spelling.txt") Signed-off-by: Sven Eckelmann Reviewed-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index fd8f07317b8e..acf6ea711299 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1450,7 +1450,7 @@ syfs||sysfs symetric||symmetric synax||syntax synchonized||synchronized -synchronization||synchronization +sychronization||synchronization synchronuously||synchronously syncronize||synchronize syncronized||synchronized From 75e2f715dffcf0cadedf49f2f3692bdd33fdd889 Mon Sep 17 00:00:00 2001 From: weidonghui Date: Fri, 5 Nov 2021 13:34:42 -0700 Subject: [PATCH 258/512] scripts/decodecode: fix faulting instruction no print when opps.file is DOS format If opps.file is in DOS format, faulting instruction cannot be printed: / # ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- / # ./scripts/decodecode < oops.file [ 0.734345] Code: d0002881 912f9c21 94067e68 d2800001 (b900003f) aarch64-linux-gnu-strip: '/tmp/tmp.5Y9eybnnSi.o': No such file aarch64-linux-gnu-objdump: '/tmp/tmp.5Y9eybnnSi.o': No such file All code ======== 0: d0002881 adrp x1, 0x512000 4: 912f9c21 add x1, x1, #0xbe7 8: 94067e68 bl 0x19f9a8 c: d2800001 mov x1, #0x0 // #0 10: b900003f str wzr, [x1] Code starting with the faulting instruction =========================================== Background: The compilation environment is Ubuntu, and the test environment is Windows. Most logs are generated in the Windows environment. In this way, CR (carriage return) will inevitably appear, which will affect the use of decodecode in the Ubuntu environment. 
The repaired effect is as follows: / # ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- / # ./scripts/decodecode < oops.file [ 0.734345] Code: d0002881 912f9c21 94067e68 d2800001 (b900003f) All code ======== 0: d0002881 adrp x1, 0x512000 4: 912f9c21 add x1, x1, #0xbe7 8: 94067e68 bl 0x19f9a8 c: d2800001 mov x1, #0x0 // #0 10:* b900003f str wzr, [x1] <-- trapping instruction Code starting with the faulting instruction =========================================== 0: b900003f str wzr, [x1] Link: https://lkml.kernel.org/r/20211008064712.926-1-weidonghui@allwinnertech.com Signed-off-by: weidonghui Acked-by: Borislav Petkov Cc: Marc Zyngier Cc: Will Deacon Cc: Rabin Vincent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decodecode | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/decodecode b/scripts/decodecode index 31d884e35f2f..c711a196511c 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -126,7 +126,7 @@ if [ $marker -ne 0 ]; then fi echo Code starting with the faulting instruction > $T.aa echo =========================================== >> $T.aa -code=`echo $code | sed -e 's/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` +code=`echo $code | sed -e 's/\r//;s/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` echo -n " .$type 0x" > $T.s echo $code >> $T.s disas $T 0 From ae3fab5bcc725271a50843e5e284ee20d8b3532b Mon Sep 17 00:00:00 2001 From: Chenyuan Mi Date: Fri, 5 Nov 2021 13:34:45 -0700 Subject: [PATCH 259/512] ocfs2: fix handle refcount leak in two exception handling paths The reference counting issue happens in two exception handling paths of ocfs2_replay_truncate_records(). When executing these two exception handling paths, the function forgets to decrease the refcount of handle increased by ocfs2_start_trans(), causing a refcount leak. Fix this issue by using ocfs2_commit_trans() to decrease the refcount of handle in two handling paths. Link: https://lkml.kernel.org/r/20210908102055.10168-1-cymi20@fudan.edu.cn Signed-off-by: Chenyuan Mi Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Reviewed-by: Joseph Qi Cc: Wengang Wang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5d9ae17bd443..1550f18be451 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, data_alloc_bh, start_blk, num_clusters); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } From da5e7c87827e8caa6a1eeec6d95dcf74ab592a01 Mon Sep 17 00:00:00 2001 From: Valentin Vidic Date: Fri, 5 Nov 2021 13:34:49 -0700 Subject: [PATCH 260/512] ocfs2: cleanup journal init and shutdown Allocate and free struct ocfs2_journal in ocfs2_journal_init and ocfs2_journal_shutdown. Init and release of system inodes references the journal so reorder calls to make sure they work correctly. 
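
In rough outline, the resulting lifecycle becomes (simplified, error handling omitted; the real calls are in the diffs below):

    /* mount path */
    status = ocfs2_journal_init(osb, &dirty);   /* allocates osb->journal internally */

    /* unmount path */
    ocfs2_release_system_inodes(osb);           /* journal inode references dropped first */
    ocfs2_journal_shutdown(osb);                /* frees osb->journal and clears the pointer */
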
Link: https://lkml.kernel.org/r/20211009145006.3478-1-vvidic@valentin-vidic.from.hr Signed-off-by: Valentin Vidic Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 4 ++-- fs/ocfs2/journal.c | 26 +++++++++++++++++++++----- fs/ocfs2/journal.h | 3 +-- fs/ocfs2/super.c | 40 +++------------------------------------- 4 files changed, 27 insertions(+), 46 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bc8f32fab964..6c2411c2afcf 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; - journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (journal) { + if (osb->journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4f15750aac5d..b9c339335a53 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; struct inode *inode = NULL; /* the journal inode */ journal_t *j_journal = NULL; + struct ocfs2_journal *journal = NULL; struct ocfs2_dinode *di = NULL; struct buffer_head *bh = NULL; - struct ocfs2_super *osb; int inode_lock = 0; - BUG_ON(!journal); + /* initialize our journal structure */ + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto done; + } + osb->journal = journal; + journal->j_osb = osb; - osb = journal->j_osb; + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = 1UL; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, @@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) journal->j_state = OCFS2_JOURNAL_FREE; -// up_write(&journal->j_trans_barrier); done: iput(inode); + kfree(journal); + osb->journal = NULL; } static void ocfs2_clear_journal_error(struct super_block *sb, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d158acb8b38a..8dcb2f2cadbc 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. 
*/ void ocfs2_set_journal_params(struct ocfs2_super *osb); -int ocfs2_journal_init(struct ocfs2_journal *journal, - int *dirty); +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5c914ce9b3ac..1286b88b6fa1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - ocfs2_journal_shutdown(osb); - ocfs2_sync_blockdev(sb); ocfs2_purge_refcount_trees(osb); @@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_release_system_inodes(osb); + ocfs2_journal_shutdown(osb); + /* * If we're dismounting due to mount error, mount.ocfs2 will clean * up heartbeat. If we're a local mount, there is no heartbeat. @@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb, int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; - struct ocfs2_journal *journal; struct ocfs2_super *osb; u64 total_blocks; @@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); - /* FIXME - * This should be done in ocfs2_journal_init(), but unknown - * ordering issues will cause the filesystem to crash. - * If anyone wants to figure out what part of the code - * refers to osb->journal before ocfs2_journal_init() is run, - * be my guest. - */ - /* initialize our journal structure */ - - journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); - if (!journal) { - mlog(ML_ERROR, "unable to alloc journal\n"); - status = -ENOMEM; - goto bail; - } - osb->journal = journal; - journal->j_osb = osb; - - atomic_set(&journal->j_num_trans, 0); - init_rwsem(&journal->j_trans_barrier); - init_waitqueue_head(&journal->j_checkpointed); - spin_lock_init(&journal->j_lock); - journal->j_trans_id = (unsigned long) 1; - INIT_LIST_HEAD(&journal->j_la_cleanups); - INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); - journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * ourselves. */ /* Init our journal object. */ - status = ocfs2_journal_init(osb->journal, &dirty); + status = ocfs2_journal_init(osb, &dirty); if (status < 0) { mlog(ML_ERROR, "Could not initialize journal!\n"); goto finally; @@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); - /* FIXME - * This belongs in journal shutdown, but because we have to - * allocate osb->journal at the start of ocfs2_initialize_osb(), - * we free it here. - */ - kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); From 848be75d154daf3d2a69a12f97bbe10248e75bf5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:34:52 -0700 Subject: [PATCH 261/512] ocfs2/dlm: remove redundant assignment of variable ret The variable ret is being assigned a value that is never read, it is updated later on with a different value. The assignment is redundant and can be removed. 
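
Condensed, the pattern being cleaned up in dlm_send_begin_reco_message() looks like this before the change (arguments elided):

    retry:
    	ret = -EINVAL;	/* dead store: overwritten by the call below before any read */
    	mlog(0, "attempting to send begin reco msg to %d\n", nodenum);
    	ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, ...);
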
Addresses-Coverity: ("Unused value") Link: https://lkml.kernel.org/r/20211007233452.30815-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0e7aad1b11cc..5cd5f7511dac 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) continue; } retry: - ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, From 839b63860eb3835da165642923120d305925561d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Nov 2021 13:34:55 -0700 Subject: [PATCH 262/512] ocfs2: fix data corruption on truncate Patch series "ocfs2: Truncate data corruption fix". As further testing has shown, commit 5314454ea3f ("ocfs2: fix data corruption after conversion from inline format") didn't fix all the data corruption issues the customer started observing after 6dbf7bb55598 ("fs: Don't invalidate page buffers in block_write_full_page()") This time I have tracked them down to two bugs in ocfs2 truncation code. One bug (truncating page cache before clearing tail cluster and setting i_size) could cause data corruption even before 6dbf7bb55598, but before that commit it needed a race with page fault, after 6dbf7bb55598 it started to be pretty deterministic. Another bug (zeroing pages beyond old i_size) used to be harmless inefficiency before commit 6dbf7bb55598. But after commit 6dbf7bb55598 in combination with the first bug it resulted in deterministic data corruption. Although fixing only the first problem is needed to stop data corruption, I've fixed both issues to make the code more robust. This patch (of 2): ocfs2_truncate_file() did unmap invalidate page cache pages before zeroing partial tail cluster and setting i_size. Thus some pages could be left (and likely have left if the cluster zeroing happened) in the page cache beyond i_size after truncate finished letting user possibly see stale data once the file was extended again. Also the tail cluster zeroing was not guaranteed to finish before truncate finished causing possible stale data exposure. The problem started to be particularly easy to hit after commit 6dbf7bb55598 "fs: Don't invalidate page buffers in block_write_full_page()" stopped invalidation of pages beyond i_size from page writeback path. Fix these problems by unmapping and invalidating pages in the page cache after the i_size is reduced and tail cluster is zeroed out. 
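
In outline, the ordering change for the non-inline case is (condensed pseudocode; the zeroing/i_size steps stand in for the existing ocfs2 helpers, the page cache calls are the ones moved in the diff below):

    /* before: page cache shot down while the tail cluster still holds data
     * and i_size has not been reduced yet */
    unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
    truncate_inode_pages(inode->i_mapping, new_i_size);
    /* ... zero tail cluster, reduce i_size ... */

    /* after: settle i_size and the tail cluster first, then drop the pages */
    /* ... zero tail cluster, reduce i_size ... */
    unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
    truncate_inode_pages(inode->i_mapping, new_i_size);
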
Link: https://lkml.kernel.org/r/20211025150008.29002-1-jack@suse.cz Link: https://lkml.kernel.org/r/20211025151332.11301-1-jack@suse.cz Fixes: ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem") Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 54d7843c0211..fc5f780fa235 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode, * greater than page size, so we have to truncate them * anyway. */ - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + unmap_mapping_range(inode->i_mapping, + new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); status = ocfs2_truncate_inline(inode, di_bh, new_i_size, i_size_read(inode), 1); if (status) @@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); From c7c14a369de936957946b3843aae050d92451472 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Nov 2021 13:34:58 -0700 Subject: [PATCH 263/512] ocfs2: do not zero pages beyond i_size ocfs2_zero_range_for_truncate() can try to zero pages beyond current inode size despite the fact that underlying blocks should be already zeroed out and writeback will skip writing such pages anyway. Avoid the pointless work. Link: https://lkml.kernel.org/r/20211025151332.11301-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 1550f18be451..bb247bc349e4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6923,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, } /* - * Zero the area past i_size but still within an allocated - * cluster. This avoids exposing nonzero data on subsequent file - * extends. + * Zero partial cluster for a hole punch or truncate. This avoids exposing + * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because * otherwise block_write_full_page() will skip writeout of pages past - * i_size. The new_i_size parameter is passed for this reason. + * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) @@ -6947,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) return 0; + /* + * Avoid zeroing pages fully beyond current i_size. It is pointless as + * underlying blocks of those pages should be already zeroed out and + * page writeback will skip them anyway. 
+ */ + range_end = min_t(u64, range_end, i_size_read(inode)); + if (range_start >= range_end) + return 0; + pages = kcalloc(ocfs2_pages_per_cluster(sb), sizeof(struct page *), GFP_NOFS); if (pages == NULL) { @@ -6955,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, goto out; } - if (range_start == range_end) - goto out; - ret = ocfs2_extent_map_get_blocks(inode, range_start >> sb->s_blocksize_bits, &phys, NULL, &ext_flags); From d1cef29adc22938d778b4652af34e72facd8184b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Nov 2021 13:35:01 -0700 Subject: [PATCH 264/512] fs/posix_acl.c: avoid -Wempty-body warning The fallthrough comment for an ignored cmpxchg() return value produces a harmless warning with 'make W=1': fs/posix_acl.c: In function 'get_acl': fs/posix_acl.c:127:36: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 127 | /* fall through */ ; | ^ Simplify it as a step towards a clean W=1 build. As all architectures define cmpxchg() as a statement expression these days, it is no longer necessary to evaluate its return code, and the if() can just be droped. Link: https://lkml.kernel.org/r/20210927102410.1863853-1-arnd@kernel.org Link: https://lore.kernel.org/all/20210322132103.qiun2rjilnlgztxe@wittgenstein/ Signed-off-by: Arnd Bergmann Reviewed-by: Christian Brauner Cc: Alexander Viro Cc: James Morris Cc: Serge Hallyn Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/posix_acl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index f5c25f580dd9..9323a854a60a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) * to just call ->get_acl to fetch the ACL ourself. (This is going to * be an unlikely race.) */ - if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) - /* fall through */ ; + cmpxchg(p, ACL_NOT_CACHED, sentinel); /* * Normally, the ACL returned by ->get_acl will be cached. From d41b60359ffb24a39c93ea1f4bffaafd651118c3 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 5 Nov 2021 13:35:04 -0700 Subject: [PATCH 265/512] d_path: fix Kernel doc validator complaining Kernel doc validator complains: Function parameter or member 'p' not described in 'prepend_name' Excess function parameter 'buffer' description in 'prepend_name' Link: https://lkml.kernel.org/r/20211011005614.26189-1-justin.he@arm.com Fixes: ad08ae586586 ("d_path: introduce struct prepend_buffer") Signed-off-by: Jia He Reviewed-by: Andy Shevchenko Acked-by: Randy Dunlap Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/d_path.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/d_path.c b/fs/d_path.c index cd60c7535181..e4e0ebad1f15 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen) /** * prepend_name - prepend a pathname in front of current buffer pointer - * @buffer: buffer pointer - * @buflen: allocated length of the buffer - * @name: name string and length qstr structure + * @p: prepend buffer which contains buffer pointer and allocated length + * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). 
Use READ_ONCE() to * make sure that either the old or the new name pointer and length are @@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry - * @buffer: pointer to the end of the buffer - * @buflen: pointer to buffer length + * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. From 8587ca6f34152ea650bad4b2db68456601159024 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:35:07 -0700 Subject: [PATCH 266/512] mm: move kvmalloc-related functions to slab.h Not all files in the kernel should include mm.h. Migrating callers from kmalloc to kvmalloc is easier if the kvmalloc functions are in slab.h. [akpm@linux-foundation.org: move the new kvrealloc() also] [akpm@linux-foundation.org: drivers/hwmon/occ/p9_sbe.c needs slab.h] Link: https://lkml.kernel.org/r/20210622215757.3525604-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/occ/p9_sbe.c | 1 + drivers/of/kexec.c | 1 + include/linux/mm.h | 34 ---------------------------------- include/linux/slab.h | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 34 deletions(-) diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c index f6387cc0b754..6c540b24b32f 100644 --- a/drivers/hwmon/occ/p9_sbe.c +++ b/drivers/hwmon/occ/p9_sbe.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 761fd870d1db..053e241f593c 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #define RNG_SEED_SIZE 128 diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..32b2ecc47408 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,40 +799,6 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} - -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return kvmalloc(bytes, flags); -} - -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} - -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); -extern void kvfree(const void *addr); -extern void kvfree_sensitive(const void *addr, size_t len); - static inline int head_compound_mapcount(struct page *head) { return atomic_read(compound_mapcount_ptr(head)) + 1; diff --git a/include/linux/slab.h b/include/linux/slab.h index 083f3ce550bc..c0d46b6fa12a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -732,6 +732,40 @@ static 
inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +extern void *kvmalloc_node(size_t size, gfp_t flags, int node); +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + return kvmalloc_node(size, flags, NUMA_NO_NODE); +} +static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +{ + return kvmalloc_node(size, flags | __GFP_ZERO, node); +} +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + +static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + + return kvmalloc(bytes, flags); +} + +static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +{ + return kvmalloc_array(n, size, flags | __GFP_ZERO); +} + +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, + gfp_t flags); +extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); + unsigned int kmem_cache_size(struct kmem_cache *s); void __init kmem_cache_init_late(void); From ffc95a46d677adb5f49f01789db9d8c3ae0af5e2 Mon Sep 17 00:00:00 2001 From: Shi Lei Date: Fri, 5 Nov 2021 13:35:10 -0700 Subject: [PATCH 267/512] mm/slab.c: remove useless lines in enable_cpucache() These lines are useless, so remove them. Link: https://lkml.kernel.org/r/20210930034845.2539-1-shi_lei@massclouds.com Fixes: 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for all allocations") Signed-off-by: Shi Lei Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 874b3f8fe80d..64ec17a3bc2b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3900,8 +3900,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (err) goto end; - if (limit && shared && batchcount) - goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -3944,7 +3942,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) limit = 32; #endif batchcount = (limit + 1) / 2; -skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); end: if (err) From d0fe47c64152a63ceed4b9f29ac56371407fa7b4 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:35:14 -0700 Subject: [PATCH 268/512] slub: add back check for free nonslab objects After commit f227f0faf63b ("slub: fix unreclaimable slab stat for bulk free"), the check for free nonslab page is replaced by VM_BUG_ON_PAGE, which only check with CONFIG_DEBUG_VM enabled, but this config may impact performance, so it only for debug. Commit 0937502af7c9 ("slub: Add check for kfree() of non slab objects.") add the ability, which should be needed in any configs to catch the invalid free, they even could be potential issue, eg, memory corruption, use after free and double free, so replace VM_BUG_ON_PAGE to WARN_ON_ONCE, add object address printing to help use to debug the issue. 
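
As a contrived illustration (not taken from any real caller), this is the kind of invalid free the check now reports regardless of CONFIG_DEBUG_VM:

    /* never do this: p did not come from kmalloc() */
    void *p = (void *)__get_free_page(GFP_KERNEL);
    kfree(p);	/* non-compound, non-slab page: triggers WARN_ON_ONCE() and
    		 * prints the object pointer once for debugging */
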
Link: https://lkml.kernel.org/r/20210930070214.61499-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Matthew Wilcox Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rienjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index d8f77346376d..b6a1790812f7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3522,7 +3522,9 @@ static inline void free_nonslab_page(struct page *page, void *object) { unsigned int order = compound_order(page); - VM_BUG_ON_PAGE(!PageCompound(page), page); + if (WARN_ON_ONCE(!PageCompound(page))) + pr_warn_once("object pointer: 0x%p\n", object); + kfree_hook(object); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); __free_pages(page, order); From b47291ef02b0bee85ffb7efd6c336060ad1fe1a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Nov 2021 13:35:17 -0700 Subject: [PATCH 269/512] mm, slub: change percpu partial accounting from objects to pages With CONFIG_SLUB_CPU_PARTIAL enabled, SLUB keeps a percpu list of partial slabs that can be promoted to cpu slab when the previous one is depleted, without accessing the shared partial list. A slab can be added to this list by 1) refill of an empty list from get_partial_node() - once we really have to access the shared partial list, we acquire multiple slabs to amortize the cost of locking, and 2) first free to a previously full slab - instead of putting the slab on a shared partial list, we can more cheaply freeze it and put it on the per-cpu list. To control how large a percpu partial list can grow for a kmem cache, set_cpu_partial() calculates a target number of free objects on each cpu's percpu partial list, and this can be also set by the sysfs file cpu_partial. However, the tracking of actual number of objects is imprecise, in order to limit overhead from cpu X freeing an objects to a slab on percpu partial list of cpu Y. Basically, the percpu partial slabs form a single linked list, and when we add a new slab to the list with current head "oldpage", we set in the struct page of the slab we're adding: page->pages = oldpage->pages + 1; // this is precise page->pobjects = oldpage->pobjects + (page->objects - page->inuse); page->next = oldpage; Thus the real number of free objects in the slab (objects - inuse) is only determined at the moment of adding the slab to the percpu partial list, and further freeing doesn't update the pobjects counter nor propagate it to the current list head. As Jann reports [1], this can easily lead to large inaccuracies, where the target number of objects (up to 30 by default) can translate to the same number of (empty) slab pages on the list. In case 2) above, we put a slab with 1 free object on the list, thus only increase page->pobjects by 1, even if there are subsequent frees on the same slab. Jann has noticed this in practice and so did we [2] when investigating significant increase of kmemcg usage after switching from SLAB to SLUB. While this is no longer a problem in kmemcg context thanks to the accounting rewrite in 5.9, the memory waste is still not ideal and it's questionable whether it makes sense to perform free object count based control when object counts can easily become so much inaccurate. So this patch converts the accounting to be based on number of pages only (which is precise) and removes the page->pobjects field completely. This is also ultimately simpler. 
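
Concretely, the per-cpu partial list update now tracks only the page count (condensed from put_cpu_partial() in the diff below, drain-to-node case omitted):

    pages = oldpage ? oldpage->pages : 0;
    pages++;
    page->pages = pages;
    page->next  = oldpage;
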
To retain the existing set_cpu_partial() heuristic, first calculate the target number of objects as previously, but then convert it to target number of pages by assuming the pages will be half-filled on average. This assumption might obviously also be inaccurate in practice, but cannot degrade to actual number of pages being equal to the target number of objects. We could also skip the intermediate step with target number of objects and rewrite the heuristic in terms of pages. However we still have the sysfs file cpu_partial which uses number of objects and could break existing users if it suddenly becomes number of pages, so this patch doesn't do that. In practice, after this patch the heuristics limit the size of percpu partial list up to 2 pages. In case of a reported regression (which would mean some workload has benefited from the previous imprecise object based counting), we can tune the heuristics to get a better compromise within the new scheme, while still avoid the unexpectedly long percpu partial lists. [1] https://lore.kernel.org/linux-mm/CAG48ez2Qx5K1Cab-m8BdSibp6wLTip6ro4=-umR7BLsEgjEYzA@mail.gmail.com/ [2] https://lore.kernel.org/all/2f0f46e8-2535-410a-1859-e9cfa4e57c18@suse.cz/ ========== Evaluation ========== Mel was kind enough to run v1 through mmtests machinery for netperf (localhost) and hackbench and, for most significant results see below. So there are some apparent regressions, especially with hackbench, which I think ultimately boils down to having shorter percpu partial lists on average and some benchmarks benefiting from longer ones. Monitoring slab usage also indicated less memory usage by slab. Based on that, the following patch will bump the defaults to allow longer percpu partial lists than after this patch. However the goal is certainly not such that we would limit the percpu partial lists to 30 pages just because previously a specific alloc/free pattern could lead to the limit of 30 objects translate to a limit to 30 pages - that would make little sense. This is a correctness patch, and if a workload benefits from larger lists, the sysfs tuning knobs are still there to allow that. 
Netperf 2-socket Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz (20 cores, 40 threads per socket), 384GB RAM TCP-RR: hmean before 127045.79 after 121092.94 (-4.69%, worse) stddev before 2634.37 after 1254.08 UDP-RR: hmean before 166985.45 after 160668.94 ( -3.78%, worse) stddev before 4059.69 after 1943.63 2-socket Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz (20 cores, 40 threads per socket), 512GB RAM TCP-RR: hmean before 84173.25 after 76914.72 ( -8.62%, worse) UDP-RR: hmean before 93571.12 after 96428.69 ( 3.05%, better) stddev before 23118.54 after 16828.14 2-socket Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz (12 cores, 24 threads per socket), 64GB RAM TCP-RR: hmean before 49984.92 after 48922.27 ( -2.13%, worse) stddev before 6248.15 after 4740.51 UDP-RR: hmean before 61854.31 after 68761.81 ( 11.17%, better) stddev before 4093.54 after 5898.91 other machines - within 2% Hackbench (results before and after the patch, negative % means worse) 2-socket AMD EPYC 7713 (64 cores, 128 threads per core), 256GB RAM hackbench-process-sockets Amean 1 0.5380 0.5583 ( -3.78%) Amean 4 0.7510 0.8150 ( -8.52%) Amean 7 0.7930 0.9533 ( -20.22%) Amean 12 0.7853 1.1313 ( -44.06%) Amean 21 1.1520 1.4993 ( -30.15%) Amean 30 1.6223 1.9237 ( -18.57%) Amean 48 2.6767 2.9903 ( -11.72%) Amean 79 4.0257 5.1150 ( -27.06%) Amean 110 5.5193 7.4720 ( -35.38%) Amean 141 7.2207 9.9840 ( -38.27%) Amean 172 8.4770 12.1963 ( -43.88%) Amean 203 9.6473 14.3137 ( -48.37%) Amean 234 11.3960 18.7917 ( -64.90%) Amean 265 13.9627 22.4607 ( -60.86%) Amean 296 14.9163 26.0483 ( -74.63%) hackbench-thread-sockets Amean 1 0.5597 0.5877 ( -5.00%) Amean 4 0.7913 0.8960 ( -13.23%) Amean 7 0.8190 1.0017 ( -22.30%) Amean 12 0.9560 1.1727 ( -22.66%) Amean 21 1.7587 1.5660 ( 10.96%) Amean 30 2.4477 1.9807 ( 19.08%) Amean 48 3.4573 3.0630 ( 11.41%) Amean 79 4.7903 5.1733 ( -8.00%) Amean 110 6.1370 7.4220 ( -20.94%) Amean 141 7.5777 9.2617 ( -22.22%) Amean 172 9.2280 11.0907 ( -20.18%) Amean 203 10.2793 13.3470 ( -29.84%) Amean 234 11.2410 17.1070 ( -52.18%) Amean 265 12.5970 23.3323 ( -85.22%) Amean 296 17.1540 24.2857 ( -41.57%) 2-socket Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz (20 cores, 40 threads per socket), 384GB RAM hackbench-process-sockets Amean 1 0.5760 0.4793 ( 16.78%) Amean 4 0.9430 0.9707 ( -2.93%) Amean 7 1.5517 1.8843 ( -21.44%) Amean 12 2.4903 2.7267 ( -9.49%) Amean 21 3.9560 4.2877 ( -8.38%) Amean 30 5.4613 5.8343 ( -6.83%) Amean 48 8.5337 9.2937 ( -8.91%) Amean 79 14.0670 15.2630 ( -8.50%) Amean 110 19.2253 21.2467 ( -10.51%) Amean 141 23.7557 25.8550 ( -8.84%) Amean 172 28.4407 29.7603 ( -4.64%) Amean 203 33.3407 33.9927 ( -1.96%) Amean 234 38.3633 39.1150 ( -1.96%) Amean 265 43.4420 43.8470 ( -0.93%) Amean 296 48.3680 48.9300 ( -1.16%) hackbench-thread-sockets Amean 1 0.6080 0.6493 ( -6.80%) Amean 4 1.0000 1.0513 ( -5.13%) Amean 7 1.6607 2.0260 ( -22.00%) Amean 12 2.7637 2.9273 ( -5.92%) Amean 21 5.0613 4.5153 ( 10.79%) Amean 30 6.3340 6.1140 ( 3.47%) Amean 48 9.0567 9.5577 ( -5.53%) Amean 79 14.5657 15.7983 ( -8.46%) Amean 110 19.6213 21.6333 ( -10.25%) Amean 141 24.1563 26.2697 ( -8.75%) Amean 172 28.9687 30.2187 ( -4.32%) Amean 203 33.9763 34.6970 ( -2.12%) Amean 234 38.8647 39.3207 ( -1.17%) Amean 265 44.0813 44.1507 ( -0.16%) Amean 296 49.2040 49.4330 ( -0.47%) 2-socket Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz (20 cores, 40 threads per socket), 512GB RAM hackbench-process-sockets Amean 1 0.5027 0.5017 ( 0.20%) Amean 4 1.1053 1.2033 ( -8.87%) Amean 7 1.8760 2.1820 ( -16.31%) Amean 12 2.9053 3.1810 ( -9.49%) Amean 21 4.6777 
4.9920 ( -6.72%) Amean 30 6.5180 6.7827 ( -4.06%) Amean 48 10.0710 10.5227 ( -4.48%) Amean 79 16.4250 17.5053 ( -6.58%) Amean 110 22.6203 24.4617 ( -8.14%) Amean 141 28.0967 31.0363 ( -10.46%) Amean 172 34.4030 36.9233 ( -7.33%) Amean 203 40.5933 43.0850 ( -6.14%) Amean 234 46.6477 48.7220 ( -4.45%) Amean 265 53.0530 53.9597 ( -1.71%) Amean 296 59.2760 59.9213 ( -1.09%) hackbench-thread-sockets Amean 1 0.5363 0.5330 ( 0.62%) Amean 4 1.1647 1.2157 ( -4.38%) Amean 7 1.9237 2.2833 ( -18.70%) Amean 12 2.9943 3.3110 ( -10.58%) Amean 21 4.9987 5.1880 ( -3.79%) Amean 30 6.7583 7.0043 ( -3.64%) Amean 48 10.4547 10.8353 ( -3.64%) Amean 79 16.6707 17.6790 ( -6.05%) Amean 110 22.8207 24.4403 ( -7.10%) Amean 141 28.7090 31.0533 ( -8.17%) Amean 172 34.9387 36.8260 ( -5.40%) Amean 203 41.1567 43.0450 ( -4.59%) Amean 234 47.3790 48.5307 ( -2.43%) Amean 265 53.9543 54.6987 ( -1.38%) Amean 296 60.0820 60.2163 ( -0.22%) 1-socket Intel(R) Xeon(R) CPU E3-1240 v5 @ 3.50GHz (4 cores, 8 threads), 32 GB RAM hackbench-process-sockets Amean 1 1.4760 1.5773 ( -6.87%) Amean 3 3.9370 4.0910 ( -3.91%) Amean 5 6.6797 6.9357 ( -3.83%) Amean 7 9.3367 9.7150 ( -4.05%) Amean 12 15.7627 16.1400 ( -2.39%) Amean 18 23.5360 23.6890 ( -0.65%) Amean 24 31.0663 31.3137 ( -0.80%) Amean 30 38.7283 39.0037 ( -0.71%) Amean 32 41.3417 41.6097 ( -0.65%) hackbench-thread-sockets Amean 1 1.5250 1.6043 ( -5.20%) Amean 3 4.0897 4.2603 ( -4.17%) Amean 5 6.7760 7.0933 ( -4.68%) Amean 7 9.4817 9.9157 ( -4.58%) Amean 12 15.9610 16.3937 ( -2.71%) Amean 18 23.9543 24.3417 ( -1.62%) Amean 24 31.4400 31.7217 ( -0.90%) Amean 30 39.2457 39.5467 ( -0.77%) Amean 32 41.8267 42.1230 ( -0.71%) 2-socket Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz (12 cores, 24 threads per socket), 64GB RAM hackbench-process-sockets Amean 1 1.0347 1.0880 ( -5.15%) Amean 4 1.7267 1.8527 ( -7.30%) Amean 7 2.6707 2.8110 ( -5.25%) Amean 12 4.1617 4.3383 ( -4.25%) Amean 21 7.0070 7.2600 ( -3.61%) Amean 30 9.9187 10.2397 ( -3.24%) Amean 48 15.6710 16.3923 ( -4.60%) Amean 79 24.7743 26.1247 ( -5.45%) Amean 110 34.3000 35.9307 ( -4.75%) Amean 141 44.2043 44.8010 ( -1.35%) Amean 172 54.2430 54.7260 ( -0.89%) Amean 192 60.6557 60.9777 ( -0.53%) hackbench-thread-sockets Amean 1 1.0610 1.1353 ( -7.01%) Amean 4 1.7543 1.9140 ( -9.10%) Amean 7 2.7840 2.9573 ( -6.23%) Amean 12 4.3813 4.4937 ( -2.56%) Amean 21 7.3460 7.5350 ( -2.57%) Amean 30 10.2313 10.5190 ( -2.81%) Amean 48 15.9700 16.5940 ( -3.91%) Amean 79 25.3973 26.6637 ( -4.99%) Amean 110 35.1087 36.4797 ( -3.91%) Amean 141 45.8220 46.3053 ( -1.05%) Amean 172 55.4917 55.7320 ( -0.43%) Amean 192 62.7490 62.5410 ( 0.33%) Link: https://lkml.kernel.org/r/20211012134651.11258-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Jann Horn Cc: Roman Gushchin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 - include/linux/slub_def.h | 13 +----- mm/slub.c | 89 ++++++++++++++++++++++++++-------------- 3 files changed, 61 insertions(+), 43 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..68ffa064b7a8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -124,10 +124,8 @@ struct page { struct page *next; #ifdef CONFIG_64BIT int pages; /* Nr of pages left */ - int pobjects; /* Approximate count */ #else short int pages; - short int pobjects; #endif }; }; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 85499f0586b0..0fa751b946fa 100644 
--- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,6 +99,8 @@ struct kmem_cache { #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; + /* Number of per cpu partial pages to keep around */ + unsigned int cpu_partial_pages; #endif struct kmem_cache_order_objects oo; @@ -141,17 +143,6 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_cpu_partial(s) ((s)->cpu_partial) -#define slub_set_cpu_partial(s, n) \ -({ \ - slub_cpu_partial(s) = (n); \ -}) -#else -#define slub_cpu_partial(s) (0) -#define slub_set_cpu_partial(s, n) -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS void sysfs_slab_unlink(struct kmem_cache *); diff --git a/mm/slub.c b/mm/slub.c index b6a1790812f7..0df81ea24b91 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -414,6 +414,29 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ + unsigned int nr_pages; + + s->cpu_partial = nr_objects; + + /* + * We take the number of objects but actually limit the number of + * pages on the per cpu partial list, in order to limit excessive + * growth of the list. For simplicity we assume that the pages will + * be half-full. + */ + nr_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); + s->cpu_partial_pages = nr_pages; +} +#else +static inline void +slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ +} +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + /* * Per slab locking using the pagelock */ @@ -2052,7 +2075,7 @@ static inline void remove_partial(struct kmem_cache_node *n, */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, - int mode, int *objects) + int mode) { void *freelist; unsigned long counters; @@ -2068,7 +2091,6 @@ static inline void *acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; - *objects = new.objects - new.inuse; if (mode) { new.inuse = page->objects; new.freelist = NULL; @@ -2106,9 +2128,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, { struct page *page, *page2; void *object = NULL; - unsigned int available = 0; unsigned long flags; - int objects; + unsigned int partial_pages = 0; /* * Racy check. 
If we mistakenly see no partial slabs then we @@ -2126,11 +2147,10 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(page, gfpflags)) continue; - t = acquire_slab(s, n, page, object == NULL, &objects); + t = acquire_slab(s, n, page, object == NULL); if (!t) break; - available += objects; if (!object) { *ret_page = page; stat(s, ALLOC_FROM_PARTIAL); @@ -2138,10 +2158,15 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, } else { put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); + partial_pages++; } +#ifdef CONFIG_SLUB_CPU_PARTIAL if (!kmem_cache_has_cpu_partial(s) - || available > slub_cpu_partial(s) / 2) + || partial_pages > s->cpu_partial_pages / 2) break; +#else + break; +#endif } spin_unlock_irqrestore(&n->list_lock, flags); @@ -2546,14 +2571,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) struct page *page_to_unfreeze = NULL; unsigned long flags; int pages = 0; - int pobjects = 0; local_lock_irqsave(&s->cpu_slab->lock, flags); oldpage = this_cpu_read(s->cpu_slab->partial); if (oldpage) { - if (drain && oldpage->pobjects > slub_cpu_partial(s)) { + if (drain && oldpage->pages >= s->cpu_partial_pages) { /* * Partial array is full. Move the existing set to the * per node partial list. Postpone the actual unfreezing @@ -2562,16 +2586,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page_to_unfreeze = oldpage; oldpage = NULL; } else { - pobjects = oldpage->pobjects; pages = oldpage->pages; } } pages++; - pobjects += page->objects - page->inuse; page->pages = pages; - page->pobjects = pobjects; page->next = oldpage; this_cpu_write(s->cpu_slab->partial, page); @@ -3991,6 +4012,8 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static void set_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL + unsigned int nr_objects; + /* * cpu_partial determined the maximum number of objects kept in the * per cpu partial lists of a processor. @@ -4000,24 +4023,22 @@ static void set_cpu_partial(struct kmem_cache *s) * filled up again with minimal effort. The slab will never hit the * per node partial lists and therefore no locking will be required. * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. 
+ * For backwards compatibility reasons, this is determined as number + * of objects, even though we now limit maximum number of pages, see + * slub_set_cpu_partial() */ if (!kmem_cache_has_cpu_partial(s)) - slub_set_cpu_partial(s, 0); + nr_objects = 0; else if (s->size >= PAGE_SIZE) - slub_set_cpu_partial(s, 2); + nr_objects = 2; else if (s->size >= 1024) - slub_set_cpu_partial(s, 6); + nr_objects = 6; else if (s->size >= 256) - slub_set_cpu_partial(s, 13); + nr_objects = 13; else - slub_set_cpu_partial(s, 30); + nr_objects = 30; + + slub_set_cpu_partial(s, nr_objects); #endif } @@ -5392,7 +5413,12 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sysfs_emit(buf, "%u\n", slub_cpu_partial(s)); + unsigned int nr_partial = 0; +#ifdef CONFIG_SLUB_CPU_PARTIAL + nr_partial = s->cpu_partial; +#endif + + return sysfs_emit(buf, "%u\n", nr_partial); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -5463,12 +5489,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (page) { + if (page) pages += page->pages; - objects += page->pobjects; - } } + /* Approximate half-full pages , see slub_set_cpu_partial() */ + objects = (pages * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages); #ifdef CONFIG_SMP @@ -5476,9 +5502,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) struct page *page; page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (page) + if (page) { + pages = READ_ONCE(page->pages); + objects = (pages * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, page->pobjects, page->pages); + cpu, objects, pages); + } } #endif len += sysfs_emit_at(buf, len, "\n"); From 23e98ad1ce895211f4d23c951f9472e4369dc236 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Nov 2021 13:35:20 -0700 Subject: [PATCH 270/512] mm/slub: increase default cpu partial list sizes The defaults are determined based on object size and can go up to 30 for objects smaller than 256 bytes. Before the previous patch changed the accounting, this could have made cpu partial list contain up to 30 pages. After that patch, only up to 2 pages with default allocation order. Very short lists limit the usefulness of the whole concept of cpu partial lists, so this patch aims at a more reasonable default under the new accounting. The defaults are quadrupled, except for object size >= PAGE_SIZE where it's doubled. This makes the lists grow up to 10 pages in practice. A quick test of booting a kernel under virtme with 4GB RAM and 8 vcpus shows the following slab memory usage after boot: Before previous patch (using page->pobjects): Slab: 36732 kB SReclaimable: 14836 kB SUnreclaim: 21896 kB After previous patch (using page->pages): Slab: 34720 kB SReclaimable: 13716 kB SUnreclaim: 21004 kB After this patch (using page->pages, higher defaults): Slab: 35252 kB SReclaimable: 13944 kB SUnreclaim: 21308 kB In the same setup, I also ran 5 times: hackbench -l 16000 -g 16 Differences in time were in the noise, we can compare slub stats as given by slabinfo -r skbuff_head_cache (the other cache heavily used by hackbench, kmalloc-cg-512 looks similar). Negligible stats left out for brevity. 
Before previous patch (using page->pobjects): Objects: 1408, Memory Total: 401408 Used : 304128 Slab Perf Counter Alloc Free %Al %Fr -------------------------------------------------- Fastpath 469952498 5946606 91 1 Slowpath 42053573 506059465 8 98 Page Alloc 41093 41044 0 0 Add partial 18 21229327 0 4 Remove partial 20039522 36051 3 0 Cpu partial list 4686640 24767229 0 4 RemoteObj/SlabFrozen 16 124027841 0 24 Total 512006071 512006071 Flushes 18 Slab Deactivation Occurrences % ------------------------------------------------- Slab empty 4993 0% Deactivation bypass 24767229 99% Refilled from foreign frees 21972674 88% After previous patch (using page->pages): Objects: 480, Memory Total: 131072 Used : 103680 Slab Perf Counter Alloc Free %Al %Fr -------------------------------------------------- Fastpath 473016294 5405653 92 1 Slowpath 38989777 506600418 7 98 Page Alloc 32717 32701 0 0 Add partial 3 22749164 0 4 Remove partial 11371127 32474 2 0 Cpu partial list 11686226 23090059 2 4 RemoteObj/SlabFrozen 2 67541803 0 13 Total 512006071 512006071 Flushes 3 Slab Deactivation Occurrences % ------------------------------------------------- Slab empty 227 0% Deactivation bypass 23090059 99% Refilled from foreign frees 27585695 119% After this patch (using page->pages, higher defaults): Objects: 896, Memory Total: 229376 Used : 193536 Slab Perf Counter Alloc Free %Al %Fr -------------------------------------------------- Fastpath 473799295 4980278 92 0 Slowpath 38206776 507025793 7 99 Page Alloc 32295 32267 0 0 Add partial 11 23291143 0 4 Remove partial 5815764 31278 1 0 Cpu partial list 18119280 23967320 3 4 RemoteObj/SlabFrozen 10 76974794 0 15 Total 512006071 512006071 Flushes 11 Slab Deactivation Occurrences % ------------------------------------------------- Slab empty 989 0% Deactivation bypass 23967320 99% Refilled from foreign frees 32358473 135% As expected, memory usage dropped significantly with change of accounting, increasing the defaults increased it, but not as much. The number of page allocation/frees dropped significantly with the new accounting, but didn't increase with the higher defaults. Interestingly, the number of fasthpath allocations increased, as well as allocations from the cpu partial list, even though it's shorter. 
Link: https://lkml.kernel.org/r/20211012134651.11258-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Jann Horn Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0df81ea24b91..f282329489bf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4030,13 +4030,13 @@ static void set_cpu_partial(struct kmem_cache *s) if (!kmem_cache_has_cpu_partial(s)) nr_objects = 0; else if (s->size >= PAGE_SIZE) - nr_objects = 2; - else if (s->size >= 1024) nr_objects = 6; + else if (s->size >= 1024) + nr_objects = 24; else if (s->size >= 256) - nr_objects = 13; + nr_objects = 52; else - nr_objects = 30; + nr_objects = 120; slub_set_cpu_partial(s, nr_objects); #endif From 04b4b006139b58ffbd90df3befecc13711b1faf8 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Fri, 5 Nov 2021 13:35:24 -0700 Subject: [PATCH 271/512] mm, slub: use prefetchw instead of prefetch Commit 0ad9500e16fe ("slub: prefetch next freelist pointer in slab_alloc()") introduced prefetch_freepointer() because when other cpu(s) freed objects into a page that current cpu owns, the freelist link is hot on cpu(s) which freed objects and possibly very cold on current cpu. But if freelist link chain is hot on cpu(s) which freed objects, it's better to invalidate that chain because they're not going to access again within a short time. So use prefetchw instead of prefetch. On supported architectures like x86 and arm, it invalidates other copied instances of a cache line when prefetching it. Before: Time: 91.677 Performance counter stats for 'hackbench -g 100 -l 10000': 1462938.07 msec cpu-clock # 15.908 CPUs utilized 18072550 context-switches # 12.354 K/sec 1018814 cpu-migrations # 696.416 /sec 104558 page-faults # 71.471 /sec 1580035699271 cycles # 1.080 GHz (54.51%) 2003670016013 instructions # 1.27 insn per cycle (54.31%) 5702204863 branch-misses (54.28%) 643368500985 cache-references # 439.778 M/sec (54.26%) 18475582235 cache-misses # 2.872 % of all cache refs (54.28%) 642206796636 L1-dcache-loads # 438.984 M/sec (46.87%) 18215813147 L1-dcache-load-misses # 2.84% of all L1-dcache accesses (46.83%) 653842996501 dTLB-loads # 446.938 M/sec (46.63%) 3227179675 dTLB-load-misses # 0.49% of all dTLB cache accesses (46.85%) 537531951350 iTLB-loads # 367.433 M/sec (54.33%) 114750630 iTLB-load-misses # 0.02% of all iTLB cache accesses (54.37%) 630135543177 L1-icache-loads # 430.733 M/sec (46.80%) 22923237620 L1-icache-load-misses # 3.64% of all L1-icache accesses (46.76%) 91.964452802 seconds time elapsed 43.416742000 seconds user 1422.441123000 seconds sys After: Time: 90.220 Performance counter stats for 'hackbench -g 100 -l 10000': 1437418.48 msec cpu-clock # 15.880 CPUs utilized 17694068 context-switches # 12.310 K/sec 958257 cpu-migrations # 666.651 /sec 100604 page-faults # 69.989 /sec 1583259429428 cycles # 1.101 GHz (54.57%) 2004002484935 instructions # 1.27 insn per cycle (54.37%) 5594202389 branch-misses (54.36%) 643113574524 cache-references # 447.409 M/sec (54.39%) 18233791870 cache-misses # 2.835 % of all cache refs (54.37%) 640205852062 L1-dcache-loads # 445.386 M/sec (46.75%) 17968160377 L1-dcache-load-misses # 2.81% of all L1-dcache accesses (46.79%) 651747432274 dTLB-loads # 453.415 M/sec (46.59%) 3127124271 dTLB-load-misses # 0.48% of all dTLB cache accesses (46.75%) 535395273064 iTLB-loads # 372.470 M/sec 
(54.38%) 113500056 iTLB-load-misses # 0.02% of all iTLB cache accesses (54.35%) 628871845924 L1-icache-loads # 437.501 M/sec (46.80%) 22585641203 L1-icache-load-misses # 3.59% of all L1-icache accesses (46.79%) 90.514819303 seconds time elapsed 43.877656000 seconds user 1397.176001000 seconds sys Link: https://lkml.org/lkml/2021/10/8/598=20 Link: https://lkml.kernel.org/r/20211011144331.70084-1-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f282329489bf..e9a51dcf8bf9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -354,7 +354,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + prefetchw(object + s->offset); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) From 554b0f3ca6f4948fdbab5f199858d902061318d0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 5 Nov 2021 13:35:27 -0700 Subject: [PATCH 272/512] mm: disable NUMA_BALANCING_DEFAULT_ENABLED and TRANSPARENT_HUGEPAGE on PREEMPT_RT TRANSPARENT_HUGEPAGE: There are potential non-deterministic delays to an RT thread if a critical memory region is not THP-aligned and a non-RT buffer is located in the same hugepage-aligned region. It's also possible for an unrelated thread to migrate pages belonging to an RT task incurring unexpected page faults due to memory defragmentation even if khugepaged is disabled. Regular HUGEPAGEs are not affected by this can be used. NUMA_BALANCING: There is a non-deterministic delay to mark PTEs PROT_NONE to gather NUMA fault samples, increased page faults of regions even if mlocked and non-deterministic delays when migrating pages. [Mel Gorman worded 99% of the commit description]. Link: https://lore.kernel.org/all/20200304091159.GN3818@techsingularity.net/ Link: https://lore.kernel.org/all/20211026165100.ahz5bkx44lrrw5pt@linutronix.de/ Link: https://lkml.kernel.org/r/20211028143327.hfbxjze7palrpfgp@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Mel Gorman Reviewed-by: David Hildenbrand Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 2 +- mm/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 11f8a845f259..21b1f4870c80 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -901,7 +901,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. 
The mechanism is quite primitive and is based on migrating memory when diff --git a/mm/Kconfig b/mm/Kconfig index d16ba9249bc5..9f1e0098522c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -371,7 +371,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION select XARRAY_MULTI help From 96c84dde362a3e4ff67a12eaac2ea6e88e963c07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:35:30 -0700 Subject: [PATCH 273/512] mm: don't include <linux/dax.h> in <linux/mempolicy.h> Not required at all, and having this causes a huge kernel rebuild as soon as something in dax.h changes. Link: https://lkml.kernel.org/r/20210921082253.1859794-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Naoya Horiguchi Reviewed-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 - mm/memory-failure.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4091692bed8c..097bbbb37493 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -8,7 +8,6 @@ #include #include -#include <linux/dax.h> #include #include #include diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bdbbb32211a5..8376cfe0efce 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -39,6 +39,7 @@ #include #include #include +#include <linux/dax.h> #include #include #include From 7857ccdf94e94f1dd56496b90084fdcaa5e655a4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:33 -0700 Subject: [PATCH 274/512] lib/stackdepot: include gfp.h Patch series "stackdepot, kasan, workqueue: Avoid expanding stackdepot slabs when holding raw_spin_lock", v2. Shuah Khan reported [1]: | When CONFIG_PROVE_RAW_LOCK_NESTING=y and CONFIG_KASAN are enabled, | kasan_record_aux_stack() runs into "BUG: Invalid wait context" when | it tries to allocate memory attempting to acquire spinlock in page | allocation code while holding workqueue pool raw_spinlock. | | There are several instances of this problem when block layer tries | to __queue_work(). Call trace from one of these instances is below: | | kblockd_mod_delayed_work_on() | mod_delayed_work_on() | __queue_delayed_work() | __queue_work() (rcu_read_lock, raw_spin_lock pool->lock held) | insert_work() | kasan_record_aux_stack() | kasan_save_stack() | stack_depot_save() | alloc_pages() | __alloc_pages() | get_page_from_freelist() | rm_queue() | rm_queue_pcplist() | local_lock_irqsave(&pagesets.lock, flags); | [ BUG: Invalid wait context triggered ] PROVE_RAW_LOCK_NESTING is pointing out that (on RT kernels) the locking rules are being violated. More generally, memory is being allocated from a non-preemptive context (a raw_spin_lock'd critical section) where it is not allowed. To properly fix this, we must prevent stackdepot from replenishing its "stack slab" pool if memory allocations cannot be done in the current context: it's a bug to use either GFP_ATOMIC or GFP_NOWAIT in certain non-preemptive contexts, including raw_spin_locks (see gfp.h and commit ab00db216c9c7). The only downside is that saving a stack trace may fail if stackdepot runs out of space AND the same stack trace has not been recorded before. I expect this to be unlikely, and a simple experiment (boot the kernel) didn't result in any failure to record a stack trace from insert_work().
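To illustrate the failure mode just described, the hedged sketch below shows the caller-side pattern: a handle of 0 from stack_depot_save() simply means "no stack recorded" and is stored as-is. stack_depot_save() and stack_trace_save() are the interfaces visible in this series; the surrounding helper is hypothetical.

#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

/* Hypothetical helper: record the current stack, tolerating depot exhaustion. */
static depot_stack_handle_t record_current_stack(gfp_t flags)
{
	unsigned long entries[16];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

	/* Returns 0 when the depot is out of space and this trace is new. */
	return stack_depot_save(entries, nr_entries, flags);
}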
The series includes a few minor fixes to stackdepot that I noticed in preparing the series. It then introduces __stack_depot_save(), which exposes the option to force stackdepot to not allocate any memory. Finally, KASAN is changed to use the new stackdepot interface and provide kasan_record_aux_stack_noalloc(), which is then used by workqueue code. [1] https://lkml.kernel.org/r/20210902200134.25603-1-skhan@linuxfoundation.org This patch (of 6): <linux/stackdepot.h> refers to gfp_t, but doesn't include gfp.h. Fix it by including <linux/gfp.h>. Link: https://lkml.kernel.org/r/20210913112609.2651084-1-elver@google.com Link: https://lkml.kernel.org/r/20210913112609.2651084-2-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Tejun Heo Cc: Lai Jiangshan Cc: Walter Wu Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: "Gustavo A. R. Silva" Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 6bb4bc1a5f54..97b36dc53301 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,6 +11,8 @@ #ifndef _LINUX_STACKDEPOT_H #define _LINUX_STACKDEPOT_H +#include <linux/gfp.h> + typedef u32 depot_stack_handle_t; depot_stack_handle_t stack_depot_save(unsigned long *entries, From 7f2b8818ea1361e3482d1e3a3c9a824789177d3a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:36 -0700 Subject: [PATCH 275/512] lib/stackdepot: remove unused function argument alloc_flags in depot_alloc_stack() is no longer used; remove it. Link: https://lkml.kernel.org/r/20210913112609.2651084-3-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R.
Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 0a2e417f83cb..c80a9f734253 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -102,8 +102,8 @@ static bool init_stack_slab(void **prealloc) } /* Allocation of a new stack in raw storage */ -static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, - u32 hash, void **prealloc, gfp_t alloc_flags) +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; size_t required_size = struct_size(stack, entries, size); @@ -309,9 +309,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = - depot_alloc_stack(entries, nr_entries, - hash, &prealloc, alloc_flags); + struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + if (new) { new->next = *bucket; /* From 11ac25c62cd2f3bb8da9e1df2e71afdebe76f093 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:39 -0700 Subject: [PATCH 276/512] lib/stackdepot: introduce __stack_depot_save() Add __stack_depot_save(), which provides more fine-grained control over stackdepot's memory allocation behaviour, in case stackdepot runs out of "stack slabs". Normally stackdepot uses alloc_pages() in case it runs out of space; passing can_alloc==false to __stack_depot_save() prohibits this, at the cost of more likely failure to record a stack trace. Link: https://lkml.kernel.org/r/20210913112609.2651084-4-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 4 ++++ lib/stackdepot.c | 43 ++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 97b36dc53301..b2f7e7c6ba54 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -15,6 +15,10 @@ typedef u32 depot_stack_handle_t; +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index c80a9f734253..bda58597e375 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -248,17 +248,28 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, EXPORT_SYMBOL_GPL(stack_depot_fetch); /** - * stack_depot_save - Save a stack trace from an array + * __stack_depot_save - Save a stack trace from an array * * @entries: Pointer to storage array * @nr_entries: Size of the storage array * @alloc_flags: Allocation gfp flags + * @can_alloc: Allocate stack slabs (increased chance of failure if false) * - * Return: The handle of the stack struct stored in depot + * Saves a stack trace from @entries array of size @nr_entries. 
If @can_alloc is + * %true, is allowed to replenish the stack slab pool in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and will fail if no space is left to store the stack trace. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case from contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: The handle of the stack struct stored in depot, 0 on failure. */ -depot_stack_handle_t stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t alloc_flags) +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; depot_stack_handle_t retval = 0; @@ -291,7 +302,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). */ - if (unlikely(!smp_load_acquire(&next_slab_inited))) { + if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements. Keep the flags related to allocation in atomic @@ -339,6 +350,26 @@ exit: fast_exit: return retval; } +EXPORT_SYMBOL_GPL(__stack_depot_save); + +/** + * stack_depot_save - Save a stack trace from an array + * + * @entries: Pointer to storage array + * @nr_entries: Size of the storage array + * @alloc_flags: Allocation gfp flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: The handle of the stack struct stored in depot, 0 on failure. + */ +depot_stack_handle_t stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags) +{ + return __stack_depot_save(entries, nr_entries, alloc_flags, true); +} EXPORT_SYMBOL_GPL(stack_depot_save); static inline int in_irqentry_text(unsigned long ptr) From 7594b34774294ff23a32506b8b874d6425a1481a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:43 -0700 Subject: [PATCH 277/512] kasan: common: provide can_alloc in kasan_save_stack() Add another argument, can_alloc, to kasan_save_stack() which is passed as-is to __stack_depot_save(). No functional change intended. Link: https://lkml.kernel.org/r/20210913112609.2651084-5-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. 
Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 6 +++--- mm/kasan/generic.c | 2 +- mm/kasan/kasan.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2baf121fb8c5..3e0999892c36 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,20 +30,20 @@ #include "kasan.h" #include "../slab.h" -depot_stack_handle_t kasan_save_stack(gfp_t flags) +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); nr_entries = filter_irq_stacks(entries, nr_entries); - return stack_depot_save(entries, nr_entries, flags); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = kasan_save_stack(flags); + track->stack = kasan_save_stack(flags, true); } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index c3f5ba7a294a..2a8e59e6326d 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -345,7 +345,7 @@ void kasan_record_aux_stack(void *addr) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); + alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, true); } void kasan_set_free_info(struct kmem_cache *cache, diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8bf568a80eb8..fa6b48d08513 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -251,7 +251,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip); struct page *kasan_addr_to_page(const void *addr); -depot_stack_handle_t kasan_save_stack(gfp_t flags); +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, From 7cb3007ce2da27ec02a1a3211941e7fe6875b642 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:46 -0700 Subject: [PATCH 278/512] kasan: generic: introduce kasan_record_aux_stack_noalloc() Introduce a variant of kasan_record_aux_stack() that does not do any memory allocation through stackdepot. This will permit using it in contexts that cannot allocate any memory. Link: https://lkml.kernel.org/r/20210913112609.2651084-6-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. 
Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 2 ++ mm/kasan/generic.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index dd874a1ee862..736d7b458996 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -370,12 +370,14 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); +void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} +static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 2a8e59e6326d..84a038b07c6f 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,7 +328,7 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); -void kasan_record_aux_stack(void *addr) +static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct page *page = kasan_addr_to_page(addr); struct kmem_cache *cache; @@ -345,7 +345,17 @@ void kasan_record_aux_stack(void *addr) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, true); + alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc); +} + +void kasan_record_aux_stack(void *addr) +{ + return __kasan_record_aux_stack(addr, true); +} + +void kasan_record_aux_stack_noalloc(void *addr) +{ + return __kasan_record_aux_stack(addr, false); } void kasan_set_free_info(struct kmem_cache *cache, From f70da745be4d4c367568b8345f50db1ae04efcb2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:50 -0700 Subject: [PATCH 279/512] workqueue, kasan: avoid alloc_pages() when recording stack Shuah Khan reported: | When CONFIG_PROVE_RAW_LOCK_NESTING=y and CONFIG_KASAN are enabled, | kasan_record_aux_stack() runs into "BUG: Invalid wait context" when | it tries to allocate memory attempting to acquire spinlock in page | allocation code while holding workqueue pool raw_spinlock. | | There are several instances of this problem when block layer tries | to __queue_work(). Call trace from one of these instances is below: | | kblockd_mod_delayed_work_on() | mod_delayed_work_on() | __queue_delayed_work() | __queue_work() (rcu_read_lock, raw_spin_lock pool->lock held) | insert_work() | kasan_record_aux_stack() | kasan_save_stack() | stack_depot_save() | alloc_pages() | __alloc_pages() | get_page_from_freelist() | rm_queue() | rm_queue_pcplist() | local_lock_irqsave(&pagesets.lock, flags); | [ BUG: Invalid wait context triggered ] The default kasan_record_aux_stack() calls stack_depot_save() with GFP_NOWAIT, which in turn can then call alloc_pages(GFP_NOWAIT, ...). In general, however, it is not even possible to use either GFP_ATOMIC nor GFP_NOWAIT in certain non-preemptive contexts, including raw_spin_locks (see gfp.h and commmit ab00db216c9c7). Fix it by instructing stackdepot to not expand stack storage via alloc_pages() in case it runs out by using kasan_record_aux_stack_noalloc(). 
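The shape of the fix, an internal worker taking a can_alloc flag with two thin public wrappers, is easy to see in a self-contained sketch (plain userspace C with illustrative names, not the kernel functions):

#include <stdbool.h>
#include <stdio.h>

/* Internal worker: the flag decides whether falling back to allocation is allowed. */
static int record_event(const char *what, bool can_alloc)
{
	printf("recording %s (%s)\n", what,
	       can_alloc ? "may allocate" : "must not allocate");
	return 0;
}

/* Thin public entry points, mirroring the _noalloc naming convention. */
int record_event_default(const char *what) { return record_event(what, true); }
int record_event_noalloc(const char *what) { return record_event(what, false); }

int main(void)
{
	record_event_default("normal context");
	record_event_noalloc("raw_spin_lock held");
	return 0;
}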
While there is an increased risk of failing to insert the stack trace, this is typically unlikely, especially if the same insertion had already succeeded previously (stack depot hit). For frequent calls from the same location, it therefore becomes extremely unlikely that kasan_record_aux_stack_noalloc() fails. Link: https://lkml.kernel.org/r/20210902200134.25603-1-skhan@linuxfoundation.org Link: https://lkml.kernel.org/r/20210913112609.2651084-7-elver@google.com Signed-off-by: Marco Elver Reported-by: Shuah Khan Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Acked-by: Tejun Heo Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1b3eb1e9531f..12e0d9cb4ac7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1350,7 +1350,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct worker_pool *pool = pwq->pool; /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); From 820a1e6e87ccaa6c0c77ac7d79d05beec3f8cb88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:35:53 -0700 Subject: [PATCH 280/512] kasan: fix tag for large allocations when using CONFIG_SLAB If an object is allocated on a tail page of a multi-page slab, kasan will get the wrong tag because page->s_mem is NULL for tail pages. I'm not quite sure what the user-visible effect of this might be. Link: https://lkml.kernel.org/r/20211001024105.3217339-1-willy@infradead.org Fixes: 7f94ffbc4c6a ("kasan: add hooks implementation for tag-based mode") Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 3e0999892c36..8428da2aaf17 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache, /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB /* For SLAB assign tags based on the object index in the freelist. */ - return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); + return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object); #else /* * For SLUB assign a random tag during slab creation, otherwise reuse From 758cabae312d3aded781aacc6d0c946b299c52df Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 5 Nov 2021 13:35:56 -0700 Subject: [PATCH 281/512] kasan: test: add memcpy test that avoids out-of-bounds write With HW tag-based KASAN, error checks are performed implicitly by the load and store instructions in the memcpy implementation. A failed check results in tag checks being disabled and execution will keep going. 
As a result, under HW tag-based KASAN, prior to commit 1b0668be62cf ("kasan: test: disable kmalloc_memmove_invalid_size for HW_TAGS"), this memcpy would end up corrupting memory until it hits an inaccessible page and causes a kernel panic. This is a pre-existing issue that was revealed by commit 285133040e6c ("arm64: Import latest memcpy()/memmove() implementation") which changed the memcpy implementation from using signed comparisons (incorrectly, resulting in the memcpy being terminated early for negative sizes) to using unsigned comparisons. It is unclear how this could be handled by memcpy itself in a reasonable way. One possibility would be to add an exception handler that would force memcpy to return if a tag check fault is detected -- this would make the behavior roughly similar to generic and SW tag-based KASAN. However, this wouldn't solve the problem for asynchronous mode and also makes memcpy behavior inconsistent with manually copying data. This test was added as a part of a series that taught KASAN to detect negative sizes in memory operations, see commit 8cceeff48f23 ("kasan: detect negative size in memory operation function"). Therefore we should keep testing for negative sizes with generic and SW tag-based KASAN. But there is some value in testing small memcpy overflows, so let's add another test with memcpy that does not destabilize the kernel by performing out-of-bounds writes, and run it in all modes. Link: https://linux-review.googlesource.com/id/I048d1e6a9aff766c4a53f989fb0c83de68923882 Link: https://lkml.kernel.org/r/20210910211356.3603758-1-pcc@google.com Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Acked-by: Marco Elver Cc: Robin Murphy Cc: Will Deacon Cc: Catalin Marinas Cc: Mark Rutland Cc: Evgenii Stepanov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 8835e0784578..aa8e42250219 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -493,7 +493,7 @@ static void kmalloc_oob_in_memset(struct kunit *test) kfree(ptr); } -static void kmalloc_memmove_invalid_size(struct kunit *test) +static void kmalloc_memmove_negative_size(struct kunit *test) { char *ptr; size_t size = 64; @@ -515,6 +515,21 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) kfree(ptr); } +static void kmalloc_memmove_invalid_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + volatile size_t invalid_size = size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + memset((char *)ptr, 0, 64); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + static void kmalloc_uaf(struct kunit *test) { char *ptr; @@ -1129,6 +1144,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_memset_4), KUNIT_CASE(kmalloc_oob_memset_8), KUNIT_CASE(kmalloc_oob_memset_16), + KUNIT_CASE(kmalloc_memmove_negative_size), KUNIT_CASE(kmalloc_memmove_invalid_size), KUNIT_CASE(kmalloc_uaf), KUNIT_CASE(kmalloc_uaf_memset), From 10c848c8b4805ff026018bfb2b3dedd90e7b0cea Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:35:59 -0700 Subject: [PATCH 282/512] mm/smaps: fix shmem pte hole swap calculation Patch series "mm/smaps: Fixes and optimizations on shmem swap handling". 
This patch (of 3): The shmem swap calculation on the privately writable mappings are using wrong parameters as spotted by Vlastimil. Fix them. This was introduced in commit 48131e03ca4e ("mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings"), when shmem_swap_usage was reworked to shmem_partial_swap_usage. Test program: void main(void) { char *buffer, *p; int i, fd; fd = memfd_create("test", 0); assert(fd > 0); /* isize==2M*3, fill in pages, swap them out */ ftruncate(fd, SIZE_2M * 3); buffer = mmap(NULL, SIZE_2M * 3, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); assert(buffer); for (i = 0, p = buffer; i < SIZE_2M * 3 / 4096; i++) { *p = 1; p += 4096; } madvise(buffer, SIZE_2M * 3, MADV_PAGEOUT); munmap(buffer, SIZE_2M * 3); /* * Remap with private+writtable mappings on partial of the inode (<= 2M*3), * while the size must also be >= 2M*2 to make sure there's a none pmd so * smaps_pte_hole will be triggered. */ buffer = mmap(NULL, SIZE_2M * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); printf("pid=%d, buffer=%p\n", getpid(), buffer); /* Check /proc/$PID/smap_rollup, should see 4MB swap */ sleep(1000000); } Before the patch, smaps_rollup shows <4MB swap and the number will be random depending on the alignment of the buffer of mmap() allocated. After this patch, it'll show 4MB. Link: https://lkml.kernel.org/r/20210917164756.8586-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210917164756.8586-2-peterx@redhat.com Fixes: 48131e03ca4e ("mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings") Signed-off-by: Peter Xu Reported-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cf25be3e0321..2197f669e17b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -478,9 +478,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; - mss->swap += shmem_partial_swap_usage( - walk->vma->vm_file->f_mapping, addr, end); + mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, + linear_page_index(vma, addr), + linear_page_index(vma, end)); return 0; } From 02399c88024f4b7e6c43b3d4aa39c75af0069e17 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:36:02 -0700 Subject: [PATCH 283/512] mm/smaps: use vma->vm_pgoff directly when counting partial swap As it's trying to cover the whole vma anyways, use direct vm_pgoff value and vma_pages() rather than linear_page_index. 
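The equivalence being relied on can be checked with a few lines of arithmetic; the sketch below mirrors linear_page_index() for the non-hugetlb case, using made-up example values and an assumed PAGE_SHIFT of 12:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* Mirrors linear_page_index(): file page offset for a virtual address. */
static unsigned long page_index(unsigned long addr, unsigned long vm_start,
				unsigned long vm_pgoff)
{
	return ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	unsigned long vm_start = 0x7f0000000000UL, vm_end = 0x7f0000400000UL;
	unsigned long vm_pgoff = 16;	/* example file offset, in pages */
	unsigned long vma_pages = (vm_end - vm_start) >> PAGE_SHIFT;

	/* At vm_start the index is vm_pgoff; at vm_end it is vm_pgoff + vma_pages. */
	assert(page_index(vm_start, vm_start, vm_pgoff) == vm_pgoff);
	assert(page_index(vm_end, vm_start, vm_pgoff) == vm_pgoff + vma_pages);
	printf("vm_start -> %lu, vm_end -> %lu\n", vm_pgoff, vm_pgoff + vma_pages);
	return 0;
}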
Link: https://lkml.kernel.org/r/20210917164756.8586-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Vlastimil Babka Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b5860f4a2738..7cd976b588ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -856,9 +856,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ - return shmem_partial_swap_usage(mapping, - linear_page_index(vma, vma->vm_start), - linear_page_index(vma, vma->vm_end)); + return shmem_partial_swap_usage(mapping, vma->vm_pgoff, + vma->vm_pgoff + vma_pages(vma)); } /* From 230100321518a4e3a027b3b1b7e93b3e02324def Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:36:05 -0700 Subject: [PATCH 284/512] mm/smaps: simplify shmem handling of pte holes Firstly, check_shmem_swap variable is actually not necessary, because it's always set with pte_hole hook; checking each would work. Meanwhile, the check within smaps_pte_entry is not easy to follow. E.g., pte_none() check is not needed as "!pte_present && !is_swap_pte" is the same. Since at it, use the pte_hole() helper rather than dup the page cache lookup. Still keep the CONFIG_SHMEM part so the code can be optimized to nop for !SHMEM. There will be a very slight functional change in smaps_pte_entry(), that for !SHMEM we'll return early for pte_none (before checking page==NULL), but that's even nicer. Link: https://lkml.kernel.org/r/20210917164756.8586-4-peterx@redhat.com Signed-off-by: Peter Xu Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2197f669e17b..ad667dbc96f5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -397,7 +397,6 @@ struct mem_size_stats { u64 pss_shmem; u64 pss_locked; u64 swap_pss; - bool check_shmem_swap; }; static void smaps_page_accumulate(struct mem_size_stats *mss, @@ -490,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, #define smaps_pte_hole NULL #endif /* CONFIG_SHMEM */ +static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) +{ +#ifdef CONFIG_SHMEM + if (walk->ops->pte_hole) { + /* depth is not used */ + smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); + } +#endif +} + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -518,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); - } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap - && pte_none(*pte))) { - page = xa_load(&vma->vm_file->f_mapping->i_pages, - linear_page_index(vma, addr)); - if (xa_is_value(page)) - mss->swap += PAGE_SIZE; + } else { + smaps_pte_hole_lookup(addr, walk); return; } @@ -737,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, return; #ifdef CONFIG_SHMEM - /* In case of smaps_rollup, reset the value from previous vma */ - mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -756,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct 
*vma, !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { - mss->check_shmem_swap = true; ops = &smaps_shmem_walk_ops; } } From 8772716f96704c67b1e2a6ba175605b4fce2a252 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 5 Nov 2021 13:36:09 -0700 Subject: [PATCH 285/512] mm: debug_vm_pgtable: don't use __P000 directly The __Pxxx/__Sxxx macros are only for protection_map[] init. All usage of them in linux should come from protection_map array. Because a lot of architectures would re-initilize protection_map[] array, eg: x86-mem_encrypt, m68k-motorola, mips, arm, sparc. Using __P000 is not rigorous. Link: https://lkml.kernel.org/r/20210924060821.1138281-1-guoren@kernel.org Signed-off-by: Guo Ren Reviewed-by: Andrew Morton Reviewed-by: Anshuman Khandual Cc: Gavin Shan Cc: Christophe Leroy Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1403639302e4..228e3954b90c 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1104,13 +1104,14 @@ static int __init init_args(struct pgtable_debug_args *args) /* * Initialize the debugging data. * - * __P000 (or even __S000) will help create page table entries with - * PROT_NONE permission as required for pxx_protnone_tests(). + * protection_map[0] (or even protection_map[8]) will help create + * page table entries with PROT_NONE permission as required for + * pxx_protnone_tests(). */ memset(args, 0, sizeof(*args)); args->vaddr = get_random_vaddr(); args->page_prot = vm_get_page_prot(VMFLAGS); - args->page_prot_none = __P000; + args->page_prot_none = protection_map[0]; args->is_contiguous_page = false; args->pud_pfn = ULONG_MAX; args->pmd_pfn = ULONG_MAX; From d73dad4eb5ad8c31ac9cf358eb5a55825bafe706 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:12 -0700 Subject: [PATCH 286/512] kasan: test: bypass __alloc_size checks Intentional overflows, as performed by the KASAN tests, are detected at compile time[1] (instead of only at run-time) with the addition of __alloc_size. Fix this by forcing the compiler into not being able to trust the size used following the kmalloc()s. 
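The forcing trick is an empty asm statement that makes a value opaque to the optimizer; a minimal userspace analogue (assuming GCC/Clang extended asm, and not the kernel's exact OPTIMIZER_HIDE_VAR() definition) looks like this:

#include <stdio.h>
#include <string.h>

/* Empty asm: the compiler must assume the value may have changed, so it can
 * no longer fold it to a constant or reason about it at compile time. */
#define HIDE_VAR(var) __asm__("" : "=r"(var) : "0"(var))

int main(void)
{
	char buf[16];
	size_t size = sizeof(buf);

	HIDE_VAR(size);		/* compiler can no longer see size == 16 */
	memset(buf, 0, size);	/* so no compile-time bounds diagnostic fires */
	printf("%zu bytes cleared\n", size);
	return 0;
}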
[1] https://lore.kernel.org/lkml/20211005184717.65c6d8eb39350395e387b71f@linux-foundation.org Link: https://lkml.kernel.org/r/20211006181544.1670992-1-keescook@chromium.org Signed-off-by: Kees Cook Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 8 +++++++- lib/test_kasan_module.c | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index aa8e42250219..5475fe396ff7 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -440,6 +440,7 @@ static void kmalloc_oob_memset_2(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); kfree(ptr); } @@ -452,6 +453,7 @@ static void kmalloc_oob_memset_4(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); kfree(ptr); } @@ -464,6 +466,7 @@ static void kmalloc_oob_memset_8(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); kfree(ptr); } @@ -476,6 +479,7 @@ static void kmalloc_oob_memset_16(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); kfree(ptr); } @@ -488,6 +492,7 @@ static void kmalloc_oob_in_memset(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + KASAN_GRANULE_SIZE)); kfree(ptr); @@ -497,7 +502,7 @@ static void kmalloc_memmove_negative_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = -2; + size_t invalid_size = -2; /* * Hardware tag-based mode doesn't check memmove for negative size. @@ -510,6 +515,7 @@ static void kmalloc_memmove_negative_size(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index 7ebf433edef3..b112cbc835e9 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -35,6 +35,8 @@ static noinline void __init copy_user_test(void) return; } + OPTIMIZER_HIDE_VAR(size); + pr_info("out-of-bounds in copy_from_user()\n"); unused = copy_from_user(kmem, usermem, size + 1); From 75da0eba0a47c4df45b3e214013ecc70f4586443 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:15 -0700 Subject: [PATCH 287/512] rapidio: avoid bogus __alloc_size warning Patch series "Add __alloc_size()", v3. GCC and Clang both use the "alloc_size" attribute to assist with bounds checking around the use of allocation functions. Add the attribute, adjust the Makefile to silence needless warnings, and add the hints to the allocators where possible. These changes have been in use for a while now in GrapheneOS. This patch (of 8): After adding __alloc_size attributes to the allocators, GCC 9.3 (but not later) may incorrectly evaluate the arguments to check_copy_size(), getting seemingly confused by the size being returned from array_size(). 
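For reference, array_size() is a saturating multiply: on overflow it returns SIZE_MAX rather than wrapping. A minimal stand-in, sketched here as an assumption rather than the kernel's overflow.h implementation, behaves like this:

#include <stdint.h>
#include <stdio.h>

/* Saturating a * b: return SIZE_MAX on overflow instead of wrapping. */
static size_t array_size_sat(size_t a, size_t b)
{
	size_t prod;

	if (__builtin_mul_overflow(a, b, &prod))
		return SIZE_MAX;
	return prod;
}

int main(void)
{
	printf("%zu\n", array_size_sat(16, 4));            /* 64 */
	printf("%zx\n", array_size_sat(SIZE_MAX / 2, 4));  /* saturates to SIZE_MAX */
	return 0;
}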
Instead, perform the calculation once, which both makes the code more readable and avoids the bug in GCC. In file included from arch/x86/include/asm/preempt.h:7, from include/linux/preempt.h:78, from include/linux/spinlock.h:55, from include/linux/mm_types.h:9, from include/linux/buildid.h:5, from include/linux/module.h:14, from drivers/rapidio/devices/rio_mport_cdev.c:13: In function 'check_copy_size', inlined from 'copy_from_user' at include/linux/uaccess.h:191:6, inlined from 'rio_mport_transfer_ioctl' at drivers/rapidio/devices/rio_mport_cdev.c:983:6: include/linux/thread_info.h:213:4: error: call to '__bad_copy_to' declared with attribute error: copy destination size is too small 213 | __bad_copy_to(); | ^~~~~~~~~~~~~~~ But the allocation size and the copy size are identical: transfer = vmalloc(array_size(sizeof(*transfer), transaction.count)); if (!transfer) return -ENOMEM; if (unlikely(copy_from_user(transfer, (void __user *)(uintptr_t)transaction.block, array_size(sizeof(*transfer), transaction.count)))) { Link: https://lkml.kernel.org/r/20210930222704.2631604-1-keescook@chromium.org Link: https://lkml.kernel.org/r/20210930222704.2631604-2-keescook@chromium.org Link: https://lore.kernel.org/linux-mm/202109091134.FHnRmRxu-lkp@intel.com/ Signed-off-by: Kees Cook Reviewed-by: John Hubbard Reported-by: kernel test robot Cc: Matt Porter Cc: Alexandre Bounine Cc: Jing Xiangfeng Cc: Ira Weiny Cc: Souptick Joarder Cc: Gustavo A. R. Silva Cc: Andy Whitcroft Cc: Christoph Lameter Cc: Daniel Micay Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Randy Dunlap Cc: Tejun Heo Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 94331d999d27..7df466e22282 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -965,6 +965,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) struct rio_transfer_io *transfer; enum dma_data_direction dir; int i, ret = 0; + size_t size; if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction)))) return -EFAULT; @@ -976,13 +977,14 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) priv->md->properties.transfer_mode) == 0) return -ENODEV; - transfer = vmalloc(array_size(sizeof(*transfer), transaction.count)); + size = array_size(sizeof(*transfer), transaction.count); + transfer = vmalloc(size); if (!transfer) return -ENOMEM; if (unlikely(copy_from_user(transfer, (void __user *)(uintptr_t)transaction.block, - array_size(sizeof(*transfer), transaction.count)))) { + size))) { ret = -EFAULT; goto out_free; } @@ -994,8 +996,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) transaction.sync, dir, &transfer[i]); if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block, - transfer, - array_size(sizeof(*transfer), transaction.count)))) + transfer, size))) ret = -EFAULT; out_free: From 86cffecdeaa278444870c8745ab166a65865dbf0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:19 -0700 Subject: [PATCH 288/512] Compiler Attributes: add __alloc_size() for better bounds checking GCC and Clang can use the "alloc_size" attribute to better inform the results 
of __builtin_object_size() (for compile-time constant values). Clang can additionally use alloc_size to inform the results of __builtin_dynamic_object_size() (for run-time values). Because GCC sees the frequent use of struct_size() as an allocator size argument, and notices it can return SIZE_MAX (the overflow indication), it complains about these call sites overflowing (since SIZE_MAX is greater than the default -Walloc-size-larger-than=PTRDIFF_MAX). This isn't helpful since we already know a SIZE_MAX will be caught at run-time (this was an intentional design). To deal with this, we must disable this check as it is both a false positive and redundant. (Clang does not have this warning option.) Unfortunately, just checking the -Wno-alloc-size-larger-than is not sufficient to make the __alloc_size attribute behave correctly under older GCC versions. The attribute itself must be disabled in those situations too, as there appears to be no way to reliably silence the SIZE_MAX constant expression cases for GCC versions less than 9.1: In file included from ./include/linux/resource_ext.h:11, from ./include/linux/pci.h:40, from drivers/net/ethernet/intel/ixgbe/ixgbe.h:9, from drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c:4: In function 'kmalloc_node', inlined from 'ixgbe_alloc_q_vector' at ./include/linux/slab.h:743:9: ./include/linux/slab.h:618:9: error: argument 1 value '18446744073709551615' exceeds maximum object size 9223372036854775807 [-Werror=alloc-size-larger-than=] return __kmalloc_node(size, flags, node); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/slab.h: In function 'ixgbe_alloc_q_vector': ./include/linux/slab.h:455:7: note: in a call to allocation function '__kmalloc_node' declared here void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_slab_alignment __malloc; ^~~~~~~~~~~~~~ Specifically: '-Wno-alloc-size-larger-than' is not correctly handled by GCC < 9.1 https://godbolt.org/z/hqsfG7q84 (doesn't disable) https://godbolt.org/z/P9jdrPTYh (doesn't admit to not knowing about option) https://godbolt.org/z/465TPMWKb (only warns when other warnings appear) '-Walloc-size-larger-than=18446744073709551615' is not handled by GCC < 8.2 https://godbolt.org/z/73hh1EPxz (ignores numeric value) Since anything marked with __alloc_size would also qualify for marking with __malloc, just include __malloc along with it to avoid redundant markings. (Suggested by Linus Torvalds.) Finally, make sure checkpatch.pl doesn't get confused about finding the __alloc_size attribute on functions. (Thanks to Joe Perches.) Link: https://lkml.kernel.org/r/20210930222704.2631604-3-keescook@chromium.org Signed-off-by: Kees Cook Tested-by: Randy Dunlap Cc: Andy Whitcroft Cc: Christoph Lameter Cc: Daniel Micay Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 15 +++++++++++++++ include/linux/compiler-gcc.h | 8 ++++++++ include/linux/compiler_attributes.h | 10 ++++++++++ include/linux/compiler_types.h | 12 ++++++++++++ scripts/checkpatch.pl | 3 ++- 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ed6e7ec60eff..d09ac84a170b 100644 --- a/Makefile +++ b/Makefile @@ -1008,6 +1008,21 @@ ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += -Wno-maybe-uninitialized endif +ifdef CONFIG_CC_IS_GCC +# The allocators already balk at large sizes, so silence the compiler +# warnings for bounds checks involving those possible values. While +# -Wno-alloc-size-larger-than would normally be used here, earlier versions +# of gcc (<9.1) weirdly don't handle the option correctly when _other_ +# warnings are produced (?!). Using -Walloc-size-larger-than=SIZE_MAX +# doesn't work (as it is documented to), silently resolving to "0" prior to +# version 9.1 (and producing an error more recently). Numeric values larger +# than PTRDIFF_MAX also don't work prior to version 9.1, which are silently +# ignored, continuing to default to PTRDIFF_MAX. So, left with no other +# choice, we must perform a versioned check to disable this warning. +# https://lore.kernel.org/lkml/20210824115859.187f272f@canb.auug.org.au +KBUILD_CFLAGS += $(call cc-ifversion, -ge, 0901, -Wno-alloc-size-larger-than) +endif + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index bd2b881c6b63..b9d5f9c373a0 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -144,3 +144,11 @@ #else #define __diag_GCC_8(s) #endif + +/* + * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" + * attribute) do not work, and must be disabled. + */ +#if GCC_VERSION < 90100 +#undef __alloc_size__ +#endif diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index e6ec63403965..3de06a8fae73 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -33,6 +33,15 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __aligned_largest __attribute__((__aligned__)) +/* + * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally + * available and includes other attributes. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size + */ +#define __alloc_size__(x, ...) 
__attribute__((__alloc_size__(x, ## __VA_ARGS__))) + /* * Note: users of __always_inline currently do not write "inline" themselves, * which seems to be required by gcc to apply the attribute according @@ -153,6 +162,7 @@ /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#malloc */ #define __malloc __attribute__((__malloc__)) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b6ff83a714ca..4f2203c4a257 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -250,6 +250,18 @@ struct ftrace_likely_data { # define __cficanonical #endif +/* + * Any place that could be marked with the "alloc_size" attribute is also + * a place to be marked with the "malloc" attribute. Do this as part of the + * __alloc_size macro to avoid redundant attributes and to avoid missing a + * __malloc marking. + */ +#ifdef __alloc_size__ +# define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +#else +# define __alloc_size(x, ...) __malloc +#endif + #ifndef asm_volatile_goto #define asm_volatile_goto(x...) asm goto(x) #endif diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c27d2312cfc3..88cb294dc447 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -489,7 +489,8 @@ our $Attribute = qr{ ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| - __weak + __weak| + __alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) }x; our $Modifier; our $Inline = qr{inline|__always_inline|noinline|__inline|__inline__}; From 72d67229f522e3331d1eabd9f58d36ae080eb228 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:23 -0700 Subject: [PATCH 289/512] slab: clean up function prototypes Based on feedback from Joe Perches and Linus Torvalds, regularize the slab function prototypes before making attribute changes. Link: https://lkml.kernel.org/r/20210930222704.2631604-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Andy Whitcroft Cc: Daniel Micay Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: Joe Perches Cc: John Hubbard Cc: kernel test robot Cc: Lukas Bulwahn Cc: Matt Porter Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Randy Dunlap Cc: Souptick Joarder Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 68 ++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index c0d46b6fa12a..d05de03bdcdd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,8 +152,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); -void kmem_cache_destroy(struct kmem_cache *); -int kmem_cache_shrink(struct kmem_cache *); +void kmem_cache_destroy(struct kmem_cache *s); +int kmem_cache_shrink(struct kmem_cache *s); /* * Please use this macro to create slab caches. 
Simply specify the @@ -181,11 +181,11 @@ int kmem_cache_shrink(struct kmem_cache *); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *, size_t, gfp_t); -void kfree(const void *); -void kfree_sensitive(const void *); -size_t __ksize(const void *); -size_t ksize(const void *); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags); +void kfree(const void *objp); +void kfree_sensitive(const void *objp); +size_t __ksize(const void *objp); +size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -426,8 +426,8 @@ static __always_inline unsigned int __kmalloc_index(size_t size, #endif /* !CONFIG_SLOB */ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; -void kmem_cache_free(struct kmem_cache *, void *); +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; +void kmem_cache_free(struct kmem_cache *s, void *objp); /* * Bulk allocation and freeing operations. These are accelerated in an @@ -436,8 +436,8 @@ void kmem_cache_free(struct kmem_cache *, void *); * * Note that interrupts must be enabled when calling these functions. */ -void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); /* * Caller must not use kfree_bulk() on memory not originally allocated @@ -450,7 +450,8 @@ static __always_inline void kfree_bulk(size_t size, void **p) #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment + __malloc; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -464,25 +465,24 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; +extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_slab_alignment __malloc; #ifdef CONFIG_NUMA -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment __malloc; +extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_slab_alignment __malloc; #else -static __always_inline void * -kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, int node, + size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ -static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, + size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -490,10 +490,8 @@ static __always_inline void 
*kmem_cache_alloc_trace(struct kmem_cache *s, return ret; } -static __always_inline void * -kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -502,13 +500,14 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment + __malloc; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) + __assume_page_alignment __malloc; #else -static __always_inline void * -kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { return kmalloc_order(size, flags, order); } @@ -638,8 +637,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static __must_check inline void * -krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) +static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size, + gfp_t flags) { size_t bytes; @@ -668,7 +667,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) @@ -691,7 +690,8 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) #ifdef CONFIG_NUMA -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); +extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) From c37495d6254c237578db3121dcf79857e033f8ff Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:27 -0700 Subject: [PATCH 290/512] slab: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for regular kmalloc interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-5-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Reviewed-by: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Andy Whitcroft Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Tejun Heo Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 61 ++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index d05de03bdcdd..b5bf0537975b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -181,7 +181,7 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -425,7 +425,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *s, void *objp); @@ -449,11 +449,12 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment + __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #else -static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc(size, flags); } @@ -466,23 +467,23 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __malloc; + __assume_slab_alignment __alloc_size(3); #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment __malloc; + int node, size_t size) __assume_slab_alignment + __alloc_size(4); #else -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, - size_t size) +static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, int node, size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ -static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, - size_t size) +static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, + gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -501,19 +502,20 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g #endif /* CONFIG_TRACING */ extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __malloc; + __alloc_size(1); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __malloc; + __assume_page_alignment 
__alloc_size(1); #else -static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, + unsigned int order) { return kmalloc_order(size, flags, order); } #endif -static __always_inline void *kmalloc_large(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) { unsigned int order = get_order(size); return kmalloc_order_trace(size, flags, order); @@ -573,7 +575,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Try really hard to succeed the allocation but fail * eventually. */ -static __always_inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { #ifndef CONFIG_SLOB @@ -595,7 +597,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && @@ -619,7 +621,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -637,8 +639,10 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size, - gfp_t flags) +static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -654,7 +658,7 @@ static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t n * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) { return kmalloc_array(n, size, flags | __GFP_ZERO); } @@ -667,12 +671,13 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. 
*/ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) + __alloc_size(1); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) -static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, - int node) +static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) { size_t bytes; @@ -683,7 +688,7 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, return __kmalloc_node(bytes, flags, node); } -static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) { return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } @@ -691,7 +696,7 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller); + unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) @@ -716,7 +721,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) { return kmalloc(size, flags | __GFP_ZERO); } @@ -727,7 +732,7 @@ static inline void *kzalloc(size_t size, gfp_t flags) * @flags: the type of memory to allocate (see kmalloc). * @node: memory node from which to allocate */ -static inline void *kzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags | __GFP_ZERO, node); } From 56bcf40f91c7d7f53b77abe161a7d18ef9f981ba Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:31 -0700 Subject: [PATCH 291/512] mm/kvmalloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for regular kvmalloc interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-6-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Reviewed-by: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Andy Whitcroft Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Tejun Heo Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index b5bf0537975b..837cb16232ef 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -737,21 +737,21 @@ static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int n return kmalloc_node(size, flags | __GFP_ZERO, node); } -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) +extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); +static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) { return kvmalloc_node(size, flags, NUMA_NO_NODE); } -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) { return kvmalloc_node(size, flags | __GFP_ZERO, node); } -static inline void *kvzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) { return kvmalloc(size, flags | __GFP_ZERO); } -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -761,13 +761,13 @@ static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) return kvmalloc(bytes, flags); } -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) { return kvmalloc_array(n, size, flags | __GFP_ZERO); } -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + __alloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); From 894f24bb569afd4fe4a874c636f82d47f1c9beed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:34 -0700 Subject: [PATCH 292/512] mm/vmalloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate vmalloc allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-7-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Cc: Andy Whitcroft Cc: Christoph Lameter Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 671d402c3778..0ed56fc10c11 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -136,21 +136,21 @@ static inline void vmalloc_init(void) static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -extern void *vmalloc(unsigned long size); -extern void *vzalloc(unsigned long size); -extern void *vmalloc_user(unsigned long size); -extern void *vmalloc_node(unsigned long size, int node); -extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_32(unsigned long size); -extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); +extern void *vmalloc(unsigned long size) __alloc_size(1); +extern void *vzalloc(unsigned long size) __alloc_size(1); +extern void *vmalloc_user(unsigned long size) __alloc_size(1); +extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vmalloc_32(unsigned long size) __alloc_size(1); +extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, - const void *caller); + const void *caller) __alloc_size(1); void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, - int node, const void *caller); -void *vmalloc_no_huge(unsigned long size); + int node, const void *caller) __alloc_size(1); +void *vmalloc_no_huge(unsigned long size) __alloc_size(1); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); From abd58f38dfb4771ef06e25d71a84aa0932c52f1a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:38 -0700 Subject: [PATCH 293/512] mm/page_alloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate page allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-8-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Cc: Andy Whitcroft Cc: Christoph Lameter Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..fbd4abc33f24 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -608,9 +608,9 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -void *alloc_pages_exact(size_t size, gfp_t gfp_mask); +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) From 17197dd460469c6fca66e274f5cd51ee43bb6ddd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:42 -0700 Subject: [PATCH 294/512] percpu: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate percpu allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Note that due to the implementation of the percpu API, this is unlikely to ever actually provide compile-time checking beyond very simple non-SMP builds. But, since they are technically allocators, mark them as such. Link: https://lkml.kernel.org/r/20210930222704.2631604-9-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Andy Whitcroft Cc: David Rientjes Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..98a9371133f8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); @@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); extern void __init setup_per_cpu_areas(void); #endif -extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); -extern void __percpu *__alloc_percpu(size_t size, size_t align); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); +extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); From d1fea155ee3d90fb3adcb3d6c42751860be3b158 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Fri, 5 Nov 2021 13:36:46 -0700 Subject: [PATCH 295/512] mm/page_ext.c: fix a comment I have noticed that the previous macro is #ifndef CONFIG_SPARSEMEM. I think the comment of #else should be CONFIG_SPARSEMEM. Link: https://lkml.kernel.org/r/20211008140312.6492-1-zhangyinan2019@email.szu.edu.cn Signed-off-by: Yinan Zhang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_ext.c b/mm/page_ext.c index 2a52fd9ed464..6242afb24d84 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -201,7 +201,7 @@ fail: panic("Out of memory"); } -#else /* CONFIG_FLATMEM */ +#else /* CONFIG_SPARSEMEM */ struct page_ext *lookup_page_ext(const struct page *page) { From 8c8387ee3f55a38c3388882f1dd72dc526965211 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 5 Nov 2021 13:36:49 -0700 Subject: [PATCH 296/512] mm: stop filemap_read() from grabbing a superfluous page Under some circumstances, filemap_read() will allocate sufficient pages to read to the end of the file, call readahead/readpages on them and copy the data over - and then it will allocate another page at the EOF and call readpage on that and then ignore it. This is unnecessary and a waste of time and resources. filemap_read() *does* check for this, but only after it has already done the allocation and I/O. Fix this by checking before calling filemap_get_pages() also. 
Link: https://lkml.kernel.org/r/163472463105.3126792.7056099385135786492.stgit@warthog.procyon.org.uk Link: https://lore.kernel.org/r/160588481358.3465195.16552616179674485179.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/163456863216.2614702.6384850026368833133.stgit@warthog.procyon.org.uk/ Signed-off-by: David Howells Acked-by: Jeff Layton Reviewed-by: Matthew Wilcox (Oracle) Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index dae481293b5d..e50be519f6a4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2625,6 +2625,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; + if (unlikely(iocb->ki_pos >= i_size_read(inode))) + break; + error = filemap_get_pages(iocb, iter, &pvec); if (error < 0) break; From c6fd3ac0fc859da57404c3bad64696d48a6f425e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:52 -0700 Subject: [PATCH 297/512] mm: export bdi_unregister Patch series "simplify bdi unregistation". This series simplifies the BDI code to get rid of the magic auto-unregister feature that hid a recent block layer refcounting bug. This patch (of 5): To wind down the magic auto-unregister semantics we'll need to push this into modular code. Link: https://lkml.kernel.org/r/20211021124441.668816-1-hch@lst.de Link: https://lkml.kernel.org/r/20211021124441.668816-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 4a9d4e27d0d9..8a46a0a4b72f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -958,6 +958,7 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi->owner = NULL; } } +EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { From 9718c59c0a1604350c06ffd87d598f03dcf5e9ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:55 -0700 Subject: [PATCH 298/512] mtd: call bdi_unregister explicitly Call bdi_unregister explicitly instead of relying on the automatic unregistration. Link: https://lkml.kernel.org/r/20211021124441.668816-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/mtdcore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index c8fd7f758938..c904e23c8237 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2409,6 +2409,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); + bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); idr_destroy(&mtd_idr); } From 0b3ea0926afb8dde70cfab00316ae0a70b93a7cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:58 -0700 Subject: [PATCH 299/512] fs: explicitly unregister per-superblock BDIs Add a new SB_I_ flag to mark superblocks that have an ephemeral bdi associated with them, and unregister it when the superblock is shut down. 
Link: https://lkml.kernel.org/r/20211021124441.668816-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +++ include/linux/fs.h | 1 + 2 files changed, 4 insertions(+) diff --git a/fs/super.c b/fs/super.c index bcef3a6f4c4b..3bfc0f8fbd5b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb_lock); up_write(&sb->s_umount); if (sb->s_bdi != &noop_backing_dev_info) { + if (sb->s_iflags & SB_I_PERSB_BDI) + bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } @@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; + sb->s_iflags |= SB_I_PERSB_BDI; return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..226de651f52e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1443,6 +1443,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ /* Possible states of 'frozen' field */ enum { From 702f2d1e3b33617a8d9a9424f08a69b7c51642a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:37:01 -0700 Subject: [PATCH 300/512] mm: don't automatically unregister bdis All BDI users now unregister explicitly. Link: https://lkml.kernel.org/r/20211021124441.668816-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8a46a0a4b72f..768e9ae489f6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -965,8 +965,7 @@ static void release_bdi(struct kref *ref) struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); - if (test_bit(WB_registered, &bdi->wb.state)) - bdi_unregister(bdi); + WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); From efee17134ca464639a2f5b4d036ce40caf1b247a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:37:04 -0700 Subject: [PATCH 301/512] mm: simplify bdi refcounting Move grabbing and releasing the bdi refcount out of the common wb_init/wb_exit helpers into code that is only used for the non-default memcg driven bdi_writeback structures. 
[hch@lst.de: add comment] Link: https://lkml.kernel.org/r/20211027074207.GA12793@lst.de [akpm@linux-foundation.org: fix typo] Link: https://lkml.kernel.org/r/20211021124441.668816-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 3 +++ mm/backing-dev.c | 13 +++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 33207004cfde..993c5628a726 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -103,6 +103,9 @@ struct wb_completion { * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. + * + * Each bdi_writeback that is not embedded into the backing_dev_info must hold + * a reference to the parent backing_dev_info. See cgwb_create() for details. */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 768e9ae489f6..5ccb25089808 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -291,8 +291,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); - if (wb != &bdi->wb) - bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); @@ -316,7 +314,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - goto out_put_bdi; + return err; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); @@ -330,9 +328,6 @@ out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); -out_put_bdi: - if (wb != &bdi->wb) - bdi_put(bdi); return err; } @@ -373,8 +368,6 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); - if (wb != &wb->bdi->wb) - bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -397,6 +390,7 @@ static void cgwb_release_workfn(struct work_struct *work) struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); + struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); @@ -416,6 +410,7 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); + bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); } @@ -497,6 +492,7 @@ static int cgwb_create(struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); + bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and @@ -528,6 +524,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; err_fprop_exit: + bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); From 61d0017e5a32d3b13ba3c7becc83a5a9e53cdbe9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 13:37:07 -0700 Subject: [PATCH 302/512] mm: don't read i_size of inode unless we need it We always go through i_size_read(), and we rarely end up needing it. 
Push the read down to where we need to check it, which avoids it for most cases. It looks like we can even remove this check entirely, which might be worth pursuing. But at least this takes it out of the hot path. Link: https://lkml.kernel.org/r/6b67981f-57d4-c80e-bc07-6020aa601381@kernel.dk Signed-off-by: Jens Axboe Acked-by: Chris Mason Cc: Josef Bacik Cc: Dave Chinner Cc: Pavel Begunkov Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e50be519f6a4..ebd68890a844 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2740,9 +2740,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - loff_t size; - size = i_size_read(inode); if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, iocb->ki_pos, iocb->ki_pos + count - 1)) @@ -2774,8 +2772,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ - if (retval < 0 || !count || iocb->ki_pos >= size || - IS_DAX(inode)) + if (retval < 0 || !count || IS_DAX(inode)) + return retval; + if (iocb->ki_pos >= i_size_read(inode)) return retval; } From d417b49fff3e2f21043c834841e8623a6098741d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:37:10 -0700 Subject: [PATCH 303/512] mm/filemap.c: remove bogus VM_BUG_ON It is not safe to check page->index without holding the page lock. It can be changed if the page is moved between the swap cache and the page cache for a shmem file, for example. There is a VM_BUG_ON below which checks page->index is correct after taking the page lock. Link: https://lkml.kernel.org/r/20210818144932.940640-1-willy@infradead.org Fixes: 5c211ba29deb ("mm: add and use find_lock_entries") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index ebd68890a844..28c57c2536d0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2093,7 +2093,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, if (!xa_is_value(page)) { if (page->index < start) goto put; - VM_BUG_ON_PAGE(page->index != xas.xa_index, page); if (page->index + thp_nr_pages(page) - 1 > end) goto put; if (!trylock_page(page)) From f8ee8909ac818e555ef18b57b0ee4b9fd0478b82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 13:37:13 -0700 Subject: [PATCH 304/512] mm: move more expensive part of XA setup out of mapping check The fast path here does not need any writeback, yet we spend time setting up the xarray lookup data upfront. Move the part that actually needs to iterate the address space mapping into a separate helper, saving ~30% of the time here.
Link: https://lkml.kernel.org/r/49f67983-b802-8929-edab-d807f745c9ca@kernel.dk Signed-off-by: Jens Axboe Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 28c57c2536d0..938f52702a7a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -639,6 +639,30 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } +static bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; + struct page *page; + + if (end_byte < start_byte) + return false; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + break; + } + rcu_read_unlock(); + return page != NULL; + +} + /** * filemap_range_needs_writeback - check if range potentially needs writeback * @mapping: address space within which to check @@ -656,29 +680,12 @@ static bool mapping_needs_writeback(struct address_space *mapping) bool filemap_range_needs_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); - pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; - if (!mapping_needs_writeback(mapping)) return false; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) return false; - if (end_byte < start_byte) - return false; - - rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) - continue; - if (xa_is_value(page)) - continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) - break; - } - rcu_read_unlock(); - return page != NULL; + return filemap_range_has_writeback(mapping, start_byte, end_byte); } EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); From 20b7fee738d65e50ca00e325ae27ee3efaa819f6 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 5 Nov 2021 13:37:16 -0700 Subject: [PATCH 305/512] mm/gup: further simplify __gup_device_huge() Commit 6401c4eb57f9 ("mm: gup: fix potential pgmap refcnt leak in __gup_device_huge()") simplified the return paths, but didn't go quite far enough, as discussed in [1]. Remove the "ret" variable entirely, because there is enough information already available to provide the return value. [1] https://lore.kernel.org/r/CAHk-=wgQTRX=5SkCmS+zfmpqubGHGJvXX_HgnPG8JSpHKHBMeg@mail.gmail.com Link: https://lkml.kernel.org/r/20210904004224.86391-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Suggested-by: Linus Torvalds Reviewed-by: Jan Kara Cc: Miaohe Lin Cc: Claudio Imbrenda Cc: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 886d6148d3d0..48c7a5a1e85b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2228,7 +2228,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; - int ret = 1; do { struct page *page = pfn_to_page(pfn); @@ -2236,14 +2235,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - ret = 0; break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - ret = 0; break; } (*nr)++; @@ -2251,7 +2248,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); - return ret; + return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, From 363dc512b666a235769482cc73869f899785a1f6 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 5 Nov 2021 13:37:19 -0700 Subject: [PATCH 306/512] mm/swapfile: remove needless request_queue NULL pointer check The request_queue pointer returned from bdev_get_queue() shall never be NULL, so the null check is unnecessary; just remove it. Link: https://lkml.kernel.org/r/20210917082111.33923-1-vulab@iscas.ac.cn Signed-off-by: Xu Wang Acked-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 22d10f713848..42027d213fd2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3118,7 +3118,7 @@ static bool swap_discardable(struct swap_info_struct *si) { struct request_queue *q = bdev_get_queue(si->bdev); - if (!q || !blk_queue_discard(q)) + if (!blk_queue_discard(q)) return false; return true; From 642929a2ded029aac13b8cb91bc9b7c088ddb57f Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Fri, 5 Nov 2021 13:37:22 -0700 Subject: [PATCH 307/512] mm/swapfile: fix an integer overflow in swap_show() This one is just a minor nuisance for people going through /proc/swaps if any of their swap areas is bigger than, or equal to, 1073741824 pages (4TB). The seq_printf() format string casts the pages-to-KB conversion as uint, and that will overflow in the aforementioned case. Although it is almost unthinkable that someone would actually set up such a big single swap area, there is a ticket recently filed against RHEL: https://bugzilla.redhat.com/show_bug.cgi?id=2008812 Given that all other code sites that use format strings for the same swap pages-to-KB conversion cast it as ulong, this patch just follows suit.
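[Editor's note: the overflow described above is easy to see in isolation. The following is a minimal standalone sketch, assuming 4 KiB pages and a 64-bit unsigned long; the variable names are illustrative and not taken from the kernel source.]

#include <stdio.h>

int main(void)
{
	unsigned long pages = 1073741824UL;	/* 4 TB worth of 4 KiB pages */
	unsigned long kb = pages << 2;		/* pages-to-KB kept in 64 bits, as %lu now prints */
	unsigned int kb_truncated = pages << 2;	/* what the old %u cast effectively printed */

	/* Prints "4294967296 vs 0": the 32-bit value wraps to zero at exactly 4 TB. */
	printf("%lu vs %u\n", kb, kb_truncated);
	return 0;
}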
Link: https://lkml.kernel.org/r/20211006184011.2579054-1-aquini@redhat.com Signed-off-by: Rafael Aquini Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 42027d213fd2..353e5bd518e1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2763,7 +2763,7 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; - unsigned int bytes, inuse; + unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); @@ -2775,7 +2775,7 @@ static int swap_show(struct seq_file *swap, void *v) file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", + seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", From 988c69f1bc23fe632ea5e6eb3c26a9e103c3ed41 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:37:25 -0700 Subject: [PATCH 308/512] mm: optimise put_pages_list() Instead of calling put_page() one page at a time, pop pages off the list if their refcount was too high and pass the remainder to put_unref_page_list(). This should be a speed improvement, but I have no measurements to support that. Current callers do not care about performance, but I hope to add some which do. Link: https://lkml.kernel.org/r/20211007192138.561673-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anthony Yznaga Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index af3cad4e5378..9f334d503fd2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -134,18 +134,27 @@ EXPORT_SYMBOL(__put_page); * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * - * Release a list of pages which are strung together on page.lru. Currently - * used by read_cache_pages() and related error recovery code. + * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { - while (!list_empty(pages)) { - struct page *victim; + struct page *page, *next; - victim = lru_to_page(pages); - list_del(&victim->lru); - put_page(victim); + list_for_each_entry_safe(page, next, pages, lru) { + if (!put_page_testzero(page)) { + list_del(&page->lru); + continue; + } + if (PageHead(page)) { + list_del(&page->lru); + __put_compound_page(page); + continue; + } + /* Cannot be PageLRU because it's passed to us using the lru */ + __ClearPageWaiters(page); } + + free_unref_page_list(pages); } EXPORT_SYMBOL(put_pages_list); From 48384b0b76f3662dfa6153c1072c2b936fc14627 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:37:28 -0700 Subject: [PATCH 309/512] mm/memcg: drop swp_entry_t* in mc_handle_file_pte() It is unused after the rework of commit f5df8635c5a3 ("mm: use find_get_incore_page in memcontrol"). 
Link: https://lkml.kernel.org/r/20210916193014.80129-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6da5020a8656..15e6fb5d56a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5545,7 +5545,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, #endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + unsigned long addr, pte_t ptent) { if (!vma->vm_file) /* anonymous vma */ return NULL; @@ -5718,7 +5718,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent, &ent); + page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; From 11192d9c124d58d66449b163ed0d2cdff03761a1 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 5 Nov 2021 13:37:31 -0700 Subject: [PATCH 310/512] memcg: flush stats only if updated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment, the kernel flushes the memcg stats on every refault and also on every reclaim iteration. Although rstat maintains a per-cpu update tree, on each flush the kernel still has to go through every CPU's rstat update tree to check if there is anything to flush. This patch adds tracking on the stats update side so that the flush side can be smarter and skip the flush when there has been no update. The stats update codepath is very performance sensitive for many workloads and benchmarks. So, we cannot follow what commit aa48e47e3906 ("memcg: infrastructure to flush memcg stats") did, which was to trigger an async flush through queue_work() and caused a lot of performance regression reports. That got reverted by commit 1f828223b799 ("memcg: flush lruvec stats in the refault"). In this patch we keep the stats update codepath very minimal and let the stats reader side flush the stats only when the updates are over a specific threshold. For now the threshold is (nr_cpus * CHARGE_BATCH). To evaluate the impact of this patch, an 8 GiB tmpfs file was created on a system with swap-on-zram and pushed to swap through the memory.force_empty interface. On reading the whole file, the memcg stat flush in the refault code path is triggered. With this patch, we observed a 63% reduction in the read time of the 8 GiB file.
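[Editor's note: the update-side tracking described above boils down to a cheap per-CPU counter plus a shared threshold that readers consult before doing an expensive flush. The following is a minimal standalone sketch of that pattern, not the kernel code itself; BATCH, stat_updated() and should_flush() are illustrative stand-ins for MEMCG_CHARGE_BATCH, memcg_rstat_updated() and the reader-side check added in the diff below.]

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BATCH 64	/* stand-in for MEMCG_CHARGE_BATCH */

static _Thread_local unsigned int updates;	/* cheap per-thread counter, standing in for the per-CPU one */
static atomic_int pending_batches = 0;		/* stand-in for stats_flush_threshold */

/* Update side: count locally and touch the shared counter only once per BATCH updates. */
static void stat_updated(void)
{
	if (++updates % BATCH == 0)
		atomic_fetch_add(&pending_batches, 1);
}

/* Reader side: only pay for a flush when enough update batches have accumulated. */
static bool should_flush(int nr_cpus)
{
	return atomic_load(&pending_batches) > nr_cpus;
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		stat_updated();
	/* 1000 updates with BATCH 64 leaves 15 pending batches, so a 4-CPU reader would flush. */
	printf("flush needed on a 4-CPU system: %d\n", should_flush(4));
	return 0;
}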
Link: https://lkml.kernel.org/r/20211001190040.48086-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Reviewed-by: "Michal Koutný" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 78 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15e6fb5d56a7..ae12fdd4bc0d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,11 +103,6 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } -/* memcg and lruvec stats flushing */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_SPINLOCK(stats_flush_lock); - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -635,6 +630,56 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +{ + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) + atomic_inc(&stats_flush_threshold); +} + +static void __mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock(&stats_flush_lock); +} + +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -647,7 +692,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } /* idx can be of type enum memcg_stat_item or node_stat_item. 
*/ @@ -675,10 +720,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* Update memcg */ - __mod_memcg_state(memcg, idx, val); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg); } /** @@ -780,7 +827,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -5341,21 +5388,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (!spin_trylock(&stats_flush_lock)) - return; - - cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); - spin_unlock(&stats_flush_lock); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); -} - static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); From fd25a9e0e23b995fd0ba5e2f00a1099452cbc3cf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 5 Nov 2021 13:37:34 -0700 Subject: [PATCH 311/512] memcg: unify memcg stat flushing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memcg stats can be flushed in multiple contexts, and potentially in parallel too. For example, multiple parallel user space readers of memcg stats will contend with each other on the rstat locks. There is no need for that. We just need one flusher and everyone else can benefit. In addition, since aa48e47e3906 ("memcg: infrastructure to flush memcg stats") the kernel periodically flushes the memcg stats from the root, so the other flushers will potentially have much less work to do.
Link: https://lkml.kernel.org/r/20211001190040.48086-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: "Michal Koutný" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae12fdd4bc0d..fd18076171b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -660,12 +660,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg) static void __mem_cgroup_flush_stats(void) { - if (!spin_trylock(&stats_flush_lock)) + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); - spin_unlock(&stats_flush_lock); + spin_unlock_irqrestore(&stats_flush_lock, flag); } void mem_cgroup_flush_stats(void) @@ -1461,7 +1463,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -3565,8 +3567,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - /* mem_cgroup_threshold() calls here from irqsafe context */ - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3947,7 +3948,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -4019,7 +4020,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4522,7 +4523,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -6405,7 +6406,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; From 38d4ef44ee4a7dbdf220daa52ad341c140f3bb51 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 5 Nov 2021 13:37:37 -0700 Subject: [PATCH 312/512] mm/memcg: remove obsolete memcg_free_kmem() Since commit d648bcc7fe65 ("mm: kmem: make memcg_kmem_enabled() irreversible"), the only thing memcg_free_kmem() does is to call memcg_offline_kmem() when the memcg is still online which can happen when online_css() fails due to -ENOMEM. However, the name memcg_free_kmem() is confusing and it is more clear and straight forward to call memcg_offline_kmem() directly from mem_cgroup_css_free(). 
Link: https://lkml.kernel.org/r/20211005202450.11775-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Roman Gushchin Reviewed-by: Aaron Tomlin Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd18076171b5..e092cce3c96e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3704,13 +3704,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_free_cache_id(kmemcg_id); } - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - /* css_alloc() failed, offlining didn't happen */ - if (unlikely(memcg->kmem_state == KMEM_ONLINE)) - memcg_offline_kmem(memcg); -} #else static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3719,9 +3712,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ static int memcg_update_kmem_max(struct mem_cgroup *memcg, @@ -5356,7 +5346,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - memcg_free_kmem(memcg); + + /* Need to offline kmem if online_css() fails */ + memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } From 16f6bf266c94017c2c4699acab64316ddc0ee77c Mon Sep 17 00:00:00 2001 From: Len Baker Date: Fri, 5 Nov 2021 13:37:40 -0700 Subject: [PATCH 313/512] mm/list_lru.c: prefer struct_size over open coded arithmetic As noted in the "Deprecated Interfaces, Language Features, Attributes, and Conventions" documentation [1], size calculations (especially multiplication) should not be performed in memory allocator (or similar) function arguments due to the risk of them overflowing. This could lead to values wrapping around and a smaller allocation being made than the caller was expecting. Using those allocations could lead to linear overflows of heap memory and other misbehaviors. So, use the struct_size() helper to do the arithmetic instead of the argument "size + count * size" in the kvmalloc() functions. Also, take the opportunity to refactor the memcpy() call to use the flex_array_size() helper. This code was detected with the help of Coccinelle and audited and fixed manually. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments Link: https://lkml.kernel.org/r/20211017105929.9284-1-len.baker@gmx.com Signed-off-by: Len Baker Cc: Kees Cook Cc: "Gustavo A. R. 
Silva" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index cd58790d0fb3..a6031f1c5bd7 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -354,8 +354,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + - size * sizeof(void *), GFP_KERNEL); + memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL); if (!memcg_lrus) return -ENOMEM; @@ -389,7 +388,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, old = rcu_dereference_protected(nlru->memcg_lrus, lockdep_is_held(&list_lrus_mutex)); - new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); + new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL); if (!new) return -ENOMEM; @@ -398,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); + memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size)); /* * The locking below allows readers that hold nlru->lock avoid taking From 58056f77502f3567b760c9a8fc8d2e9081515b2d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 5 Nov 2021 13:37:44 -0700 Subject: [PATCH 314/512] memcg, kmem: further deprecate kmem.limit_in_bytes The deprecation process of kmem.limit_in_bytes started with the commit 0158115f702 ("memcg, kmem: deprecate kmem.limit_in_bytes") which also explains in detail the motivation behind the deprecation. To summarize, it is the unexpected behavior on hitting the kmem limit. This patch moves the deprecation process to the next stage by disallowing to set the kmem limit. In future we might just remove the kmem.limit_in_bytes file completely. [akpm@linux-foundation.org: s/ENOTSUPP/EOPNOTSUPP/] [arnd@arndb.de: mark cancel_charge() inline] Link: https://lkml.kernel.org/r/20211022070542.679839-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211019153408.2916808-1-shakeelb@google.com Signed-off-by: Shakeel Butt Signed-off-by: Arnd Bergmann Acked-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: Vasily Averin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/cgroup-v1/memory.rst | 11 +----- mm/memcontrol.c | 39 +++---------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 41191b5fb69d..faac50149a22 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -87,10 +87,8 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes set/show hard limit for kernel memory - This knob is deprecated and shouldn't be - used. It is planned that this be removed in - the foreseeable future. + memory.kmem.limit_in_bytes This knob is deprecated and writing to + it will return -ENOTSUPP. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits @@ -518,11 +516,6 @@ will be charged as a new owner of it. charged file caches. Some out-of-use page caches may keep charged until memory pressure happens. If you want to avoid that, force_empty will be useful. 
- Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - 5.2 stat file ------------- diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e092cce3c96e..e4fcb6e55f75 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2771,8 +2771,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) -static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2781,7 +2780,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); } -#endif static void commit_charge(struct page *page, struct mem_cgroup *memcg) { @@ -3000,7 +2998,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, unsigned int nr_pages) { - struct page_counter *counter; struct mem_cgroup *memcg; int ret; @@ -3010,21 +3007,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && - !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { - - /* - * Enforce __GFP_NOFAIL allocation because callers are not - * prepared to see failures and likely do not have any failure - * handling code. - */ - if (gfp & __GFP_NOFAIL) { - page_counter_charge(&memcg->kmem, nr_pages); - goto out; - } - cancel_charge(memcg, nr_pages); - ret = -ENOMEM; - } + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_charge(&memcg->kmem, nr_pages); out: css_put(&memcg->css); @@ -3714,17 +3698,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) } #endif /* CONFIG_MEMCG_KMEM */ -static int memcg_update_kmem_max(struct mem_cgroup *memcg, - unsigned long max) -{ - int ret; - - mutex_lock(&memcg_max_mutex); - ret = page_counter_set_max(&memcg->kmem, max); - mutex_unlock(&memcg_max_mutex); - return ret; -} - static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; @@ -3790,10 +3763,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); - ret = memcg_update_kmem_max(memcg, nr_pages); + /* kmem.limit_in_bytes is deprecated. */ + ret = -EOPNOTSUPP; break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); From 60ec6a48eec24986a6414740a2481d22efc1b2f9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:47 -0700 Subject: [PATCH 315/512] mm: list_lru: remove holding lru lock Since commit e5bc3af7734f ("rcu: Consolidate PREEMPT and !PREEMPT synchronize_rcu()"), the critical section of spin lock can serve as an RCU read-side critical section which already allows readers that hold nlru->lock to avoid taking rcu lock. So just remove holding lock. 
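For context, a minimal sketch (not taken from the patch itself) of the publish/read pairing that makes the extra locking described above redundant; the names follow mm/list_lru.c, and the reader side is the reason the update path only needs the RCU publish plus a deferred free:

	/* writer: publish the new table, retire the old one after a grace period */
	rcu_assign_pointer(nlru->memcg_lrus, new);
	kvfree_rcu(old, rcu);

	/* reader (cf. list_lru_from_memcg_idx()): either rcu_read_lock() or a
	 * non-preemptible spin_lock(&nlru->lock) section, which consolidated
	 * RCU already treats as a read-side critical section, pins the table */
	memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
					   lockdep_is_held(&nlru->lock));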
Link: https://lkml.kernel.org/r/20211025124534.56345-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index a6031f1c5bd7..9a1f7df1afc9 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -398,18 +398,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, } memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size)); - - /* - * The locking below allows readers that hold nlru->lock avoid taking - * rcu_read_lock (see list_lru_from_memcg_idx). - * - * Since list_lru_{add,del} may be called under an IRQ-safe lock, - * we have to use IRQ-safe primitives here to avoid deadlock. - */ - spin_lock_irq(&nlru->lock); rcu_assign_pointer(nlru->memcg_lrus, new); - spin_unlock_irq(&nlru->lock); - kvfree_rcu(old, rcu); return 0; } From 41d17431df4aa7c57761e04f81c94fb3c3beedf4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:50 -0700 Subject: [PATCH 316/512] mm: list_lru: fix the return value of list_lru_count_one() Since commit 2788cf0c401c ("memcg: reparent list_lrus and free kmemcg_id on css offline"), ->nr_items can be negative during memory cgroup reparenting. In this case, list_lru_count_one() will return an unusual and huge value, which can surprise users. At least for now it hasn't affected any users. But it is better to let list_lru_count_ont() returns zero when ->nr_items is negative. Link: https://lkml.kernel.org/r/20211025124910.56433-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 9a1f7df1afc9..7572f8e70b86 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -176,13 +176,16 @@ unsigned long list_lru_count_one(struct list_lru *lru, { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - unsigned long count; + long count; rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); count = READ_ONCE(l->nr_items); rcu_read_unlock(); + if (unlikely(count < 0)) + count = 0; + return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); From 642688681133a501d149349ba1a824204f3540e1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:53 -0700 Subject: [PATCH 317/512] mm: memcontrol: remove kmemcg_id reparenting Since slab objects and kmem pages are charged to object cgroup instead of memory cgroup, memcg_reparent_objcgs() will reparent this cgroup and all its descendants to its parent cgroup. This already makes further list_lru_add()'s add elements to the parent's list. So it is unnecessary to change kmemcg_id of an offline cgroup to its parent's id. It just wastes CPU cycles. Just remove the redundant code. 
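A rough sketch (not code from the patch) of why the child's kmemcg_id stops mattering once the objcgs have been reparented: the lru slot used by list_lru_add() is derived from the memcg the object is currently charged to, which after memcg_reparent_objcgs() is already the parent, so lookups resolve through the parent's kmemcg_id without any per-descendant rewrite:

	/* roughly what list_lru_from_kmem() does for a charged object */
	memcg = mem_cgroup_from_obj(item);	/* the parent after reparenting */
	idx = memcg_cache_id(memcg);		/* i.e. parent->kmemcg_id */
	l = list_lru_from_memcg_idx(nlru, idx);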
Link: https://lkml.kernel.org/r/20211025125102.56533-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e4fcb6e55f75..1e2d3f378edd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3650,8 +3650,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; + struct mem_cgroup *parent; int kmemcg_id; if (memcg->kmem_state != KMEM_ONLINE) @@ -3669,21 +3668,11 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) BUG_ON(kmemcg_id < 0); /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by + * After we have finished memcg_reparent_objcgs(), all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. + * The ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). */ - rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - } - rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); From e80216d9f1f5c90b3cab834cd4fb492d731f70aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:56 -0700 Subject: [PATCH 318/512] mm: memcontrol: remove the kmem states Now the kmem states is only used to indicate whether the kmem is offline. However, we can set ->kmemcg_id to -1 to indicate whether the kmem is offline. Finally, we can remove the kmem states to simplify the code. 
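With the enum gone, the "is kmem online" question reduces to a sentinel check on kmemcg_id; a one-line sketch (the helper name here is illustrative, not something this patch adds):

	static inline bool memcg_kmem_is_online(struct mem_cgroup *memcg)
	{
		return memcg->kmemcg_id >= 0;
	}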
Link: https://lkml.kernel.org/r/20211025125259.56624-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- mm/memcontrol.c | 7 ++----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..f207f98bdb76 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,12 +180,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -enum memcg_kmem_state { - KMEM_NONE, - KMEM_ALLOCATED, - KMEM_ONLINE, -}; - #if defined(CONFIG_SMP) struct memcg_padding { char x[0]; @@ -318,7 +312,6 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; - enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; struct list_head objcg_list; /* list of inherited objcgs */ #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1e2d3f378edd..b4a17b7a10d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3626,7 +3626,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) @@ -3643,7 +3642,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); memcg->kmemcg_id = memcg_id; - memcg->kmem_state = KMEM_ONLINE; return 0; } @@ -3653,11 +3651,9 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) struct mem_cgroup *parent; int kmemcg_id; - if (memcg->kmem_state != KMEM_ONLINE) + if (memcg->kmemcg_id == -1) return; - memcg->kmem_state = KMEM_ALLOCATED; - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; @@ -3676,6 +3672,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); + memcg->kmemcg_id = -1; } #else static int memcg_online_kmem(struct mem_cgroup *memcg) From 3eef11279ba5936b1554a0dccb1cea8345e5e2a5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:59 -0700 Subject: [PATCH 319/512] mm: list_lru: only add memcg-aware lrus to the global lru list The non-memcg-aware lru is always skiped when traversing the global lru list, which is not efficient. We can only add the memcg-aware lru to the global lru list instead to make traversing more efficient. 
Link: https://lkml.kernel.org/r/20211025124353.55781-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 7572f8e70b86..0cd5e89ca063 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,18 +15,29 @@ #include "slab.h" #ifdef CONFIG_MEMCG_KMEM -static LIST_HEAD(list_lrus); +static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return lru->memcg_aware; +} + static void list_lru_register(struct list_lru *lru) { + if (!list_lru_memcg_aware(lru)) + return; + mutex_lock(&list_lrus_mutex); - list_add(&lru->list, &list_lrus); + list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { + if (!list_lru_memcg_aware(lru)) + return; + mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); @@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru) return lru->shrinker_id; } -static inline bool list_lru_memcg_aware(struct list_lru *lru) -{ - return lru->memcg_aware; -} - static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { @@ -457,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return 0; - for_each_node(i) { if (memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) @@ -482,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return; - for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); @@ -497,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size) int old_size = memcg_nr_cache_ids; mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &list_lrus, list) { + list_for_each_entry(lru, &memcg_list_lrus, list) { ret = memcg_update_list_lru(lru, old_size, new_size); if (ret) goto fail; @@ -506,7 +506,7 @@ out: mutex_unlock(&list_lrus_mutex); return ret; fail: - list_for_each_entry_continue_reverse(lru, &list_lrus, list) + list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list) memcg_cancel_update_list_lru(lru, old_size, new_size); goto out; } @@ -543,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return; - for_each_node(i) memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); } @@ -555,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) struct list_lru *lru; mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &list_lrus, list) + list_for_each_entry(lru, &memcg_list_lrus, list) memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } From 0b28179a6138a5edd9d82ad2687c05b3773c387b Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:38:02 -0700 Subject: [PATCH 320/512] mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks Patch series "memcg: prohibit unconditional exceeding the limit of dying tasks", v3. Memory cgroup charging allows killed or exiting tasks to exceed the hard limit. It can be misused and allowed to trigger global OOM from inside a memcg-limited container. 
On the other hand if memcg fails allocation, called from inside #PF handler it triggers global OOM from inside pagefault_out_of_memory(). To prevent these problems this patchset: (a) removes execution of out_of_memory() from pagefault_out_of_memory(), becasue nobody can explain why it is necessary. (b) allow memcg to fail allocation of dying/killed tasks. This patch (of 3): Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory which in turn executes out_out_memory() and can kill a random task. An allocation might fail when the current task is the oom victim and there are no memory reserves left. The OOM killer is already handled at the page allocator level for the global OOM and at the charging level for the memcg one. Both have much more information about the scope of allocation/charge request. This means that either the OOM killer has been invoked properly and didn't lead to the allocation success or it has been skipped because it couldn't have been invoked. In both cases triggering it from here is pointless and even harmful. It makes much more sense to let the killed task die rather than to wake up an eternally hungry oom-killer and send him to choose a fatter victim for breakfast. Link: https://lkml.kernel.org/r/0828a149-786e-7c06-b70a-52d086818ea3@virtuozzo.com Signed-off-by: Vasily Averin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tetsuo Handa Cc: Uladzislau Rezki Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 989f35a2bbb1..d89545d1a5b8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1137,6 +1137,9 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; + if (fatal_signal_pending(current)) + return; + if (!mutex_trylock(&oom_lock)) return; out_of_memory(&oc); From 60e2793d440a3ec95abb5d6d4fc034a4b480472d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Nov 2021 13:38:06 -0700 Subject: [PATCH 321/512] mm, oom: do not trigger out_of_memory from the #PF Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory. This can happen for 2 different reasons. a) Memcg is out of memory and we rely on mem_cgroup_oom_synchronize to perform the memcg OOM handling or b) normal allocation fails. The latter is quite problematic because allocation paths already trigger out_of_memory and the page allocator tries really hard to not fail allocations. Anyway, if the OOM killer has been already invoked there is no reason to invoke it again from the #PF path. Especially when the OOM condition might be gone by that time and we have no way to find out other than allocate. Moreover if the allocation failed and the OOM killer hasn't been invoked then we are unlikely to do the right thing from the #PF context because we have already lost the allocation context and restictions and therefore might oom kill a task from a different NUMA domain. This all suggests that there is no legitimate reason to trigger out_of_memory from pagefault_out_of_memory so drop it. Just to be sure that no #PF path returns with VM_FAULT_OOM without allocation print a warning that this is happening before we restart the #PF. [VvS: #PF allocation can hit into limit of cgroup v1 kmem controller. 
This is a local problem related to memcg, however, it causes unnecessary global OOM kills that are repeated over and over again and escalate into a real disaster. This has been broken since kmem accounting has been introduced for cgroup v1 (3.8). There was no kmem specific reclaim for the separate limit so the only way to handle kmem hard limit was to return with ENOMEM. In upstream the problem will be fixed by removing the outdated kmem limit, however stable and LTS kernels cannot do it and are still affected. This patch fixes the problem and should be backported into stable/LTS.] Link: https://lkml.kernel.org/r/f5fd8dd8-0ad4-c524-5f65-920b01972a42@virtuozzo.com Signed-off-by: Michal Hocko Signed-off-by: Vasily Averin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tetsuo Handa Cc: Uladzislau Rezki Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d89545d1a5b8..bfa9e348c3a3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1120,19 +1120,15 @@ bool out_of_memory(struct oom_control *oc) } /* - * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If oom_lock is held by somebody else, a parallel oom - * killing is already in progress so do nothing. + * The pagefault handler calls here because some allocation has failed. We have + * to take care of the memcg OOM here because this is the only safe context without + * any locks held but let the oom killer triggered from the allocation context care + * about the global OOM. */ void pagefault_out_of_memory(void) { - struct oom_control oc = { - .zonelist = NULL, - .nodemask = NULL, - .memcg = NULL, - .gfp_mask = 0, - .order = 0, - }; + static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (mem_cgroup_oom_synchronize(true)) return; @@ -1140,10 +1136,8 @@ void pagefault_out_of_memory(void) if (fatal_signal_pending(current)) return; - if (!mutex_trylock(&oom_lock)) - return; - out_of_memory(&oc); - mutex_unlock(&oom_lock); + if (__ratelimit(&pfoom_rs)) + pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); } SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) From a4ebf1b6ca1e011289677239a2a361fde4a88076 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:38:09 -0700 Subject: [PATCH 322/512] memcg: prohibit unconditional exceeding the limit of dying tasks Memory cgroup charging allows killed or exiting tasks to exceed the hard limit. It is assumed that the amount of the memory charged by those tasks is bound and most of the memory will get released while the task is exiting. This is resembling a heuristic for the global OOM situation when tasks get access to memory reserves. There is no global memory shortage at the memcg level so the memcg heuristic is more relieved. The above assumption is overly optimistic though. E.g. vmalloc can scale to really large requests and the heuristic would allow that. We used to have an early break in the vmalloc allocator for killed tasks but this has been reverted by commit b8c8a338f75e ("Revert "vmalloc: back off when the current task is killed""). There are likely other similar code paths which do not check for fatal signals in an allocation&charge loop. 
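To make the concern concrete, a hedged sketch (not code from this series) of the kind of charge-and-allocate loop the previous paragraph has in mind; without the fatal_signal_pending() check shown here, a task that has already been OOM-killed can keep charging accounted memory until the loop finishes:

	for (i = 0; i < nr_chunks; i++) {
		if (fatal_signal_pending(current))
			return -EINTR;
		buf[i] = kvmalloc(CHUNK_SIZE, GFP_KERNEL_ACCOUNT);
		if (!buf[i])
			return -ENOMEM;
	}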
Also there are some kernel objects charged to a memcg which are not bound to a process life time. It has been observed that it is not really hard to trigger these bypasses and cause global OOM situation. One potential way to address these runaways would be to limit the amount of excess (similar to the global OOM with limited oom reserves). This is certainly possible but it is not really clear how much of an excess is desirable and still protects from global OOMs as that would have to consider the overall memcg configuration. This patch is addressing the problem by removing the heuristic altogether. Bypass is only allowed for requests which either cannot fail or where the failure is not desirable while excess should be still limited (e.g. atomic requests). Implementation wise a killed or dying task fails to charge if it has passed the OOM killer stage. That should give all forms of reclaim chance to restore the limit before the failure (ENOMEM) and tell the caller to back off. In addition, this patch renames should_force_charge() helper to task_is_dying() because now its use is not associated witch forced charging. This patch depends on pagefault_out_of_memory() to not trigger out_of_memory(), because then a memcg failure can unwind to VM_FAULT_OOM and cause a global OOM killer. Link: https://lkml.kernel.org/r/8f5cebbb-06da-4902-91f0-6566fc4b4203@virtuozzo.com Signed-off-by: Vasily Averin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Vlastimil Babka Cc: Shakeel Butt Cc: Mel Gorman Cc: Tetsuo Handa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b4a17b7a10d3..cf0321d7a784 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -234,7 +234,7 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool should_force_charge(void) +static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || (current->flags & PF_EXITING); @@ -1624,7 +1624,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = should_force_charge() || out_of_memory(&oc); + ret = task_is_dying() || out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -2579,6 +2579,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; enum oom_status oom_status; unsigned long nr_reclaimed; + bool passed_oom = false; bool may_swap = true; bool drained = false; unsigned long pflags; @@ -2613,15 +2614,6 @@ retry: if (gfp_mask & __GFP_ATOMIC) goto force; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(should_force_charge())) - goto force; - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. 
This might exceed the limits temporarily, @@ -2679,8 +2671,9 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (fatal_signal_pending(current)) - goto force; + /* Avoid endless loop for tasks bypassed by the oom killer */ + if (passed_oom && task_is_dying()) + goto nomem; /* * keep retrying as long as the memcg oom killer is able to make @@ -2689,14 +2682,10 @@ retry: */ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); - switch (oom_status) { - case OOM_SUCCESS: + if (oom_status == OOM_SUCCESS) { + passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; - case OOM_FAILED: - goto force; - default: - goto nomem; } nomem: if (!(gfp_mask & __GFP_NOFAIL)) From 7866076b924ad4f285bd4596630a8ca7b8333319 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 5 Nov 2021 13:38:12 -0700 Subject: [PATCH 323/512] mm/mmap.c: fix a data race of mm->total_vm The variable mm->total_vm could be accessed concurrently during mmaping and system accounting as noticed by KCSAN, BUG: KCSAN: data-race in __acct_update_integrals / mmap_region read-write to 0xffffa40267bd14c8 of 8 bytes by task 15609 on cpu 3: mmap_region+0x6dc/0x1400 do_mmap+0x794/0xca0 vm_mmap_pgoff+0xdf/0x150 ksys_mmap_pgoff+0xe1/0x380 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffa40267bd14c8 of 8 bytes by interrupt on cpu 2: __acct_update_integrals+0x187/0x1d0 acct_account_cputime+0x3c/0x40 update_process_times+0x5c/0x150 tick_sched_timer+0x184/0x210 __run_hrtimer+0x119/0x3b0 hrtimer_interrupt+0x350/0xaa0 __sysvec_apic_timer_interrupt+0x7b/0x220 asm_call_irq_on_stack+0x12/0x20 sysvec_apic_timer_interrupt+0x4d/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 smp_call_function_single+0x192/0x2b0 perf_install_in_context+0x29b/0x4a0 __se_sys_perf_event_open+0x1a98/0x2550 __x64_sys_perf_event_open+0x63/0x70 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 2 PID: 15610 Comm: syz-executor.3 Not tainted 5.10.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 In vm_stat_account which called by mmap_region, increase total_vm, and __acct_update_integrals may read total_vm at the same time. This will cause a data race which lead to undefined behaviour. To avoid potential bad read/write, volatile property and barrier are both used to avoid undefined behaviour. Link: https://lkml.kernel.org/r/20210913105550.1569419-1-liupeng256@huawei.com Signed-off-by: Peng Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 257ffb993ea2..f00de83d0246 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk, * the rest of the math is done in xacct_add_tsk. 
*/ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; + tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10; } /** diff --git a/mm/mmap.c b/mm/mmap.c index 88dcc5c25225..b22a07f5e761 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3332,7 +3332,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { - mm->total_vm += npages; + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; From f1dc0db296bd25960273649fc6ef2ecbf5aaa0e0 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 5 Nov 2021 13:38:15 -0700 Subject: [PATCH 324/512] mm: use __pfn_to_section() instead of open coding it It is defined in the same file just a few lines above. Link: https://lkml.kernel.org/r/4598487.Rc0NezkW7i@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6a1d79d84675..832aa49d0d8e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1481,7 +1481,7 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - ms = __nr_to_section(pfn_to_section_nr(pfn)); + ms = __pfn_to_section(pfn); if (!valid_section(ms)) return 0; /* @@ -1496,7 +1496,7 @@ static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return present_section(__nr_to_section(pfn_to_section_nr(pfn))); + return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) From b063e374e7ae75589a36f4bcbfb47956ac197c57 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 5 Nov 2021 13:38:18 -0700 Subject: [PATCH 325/512] mm/memory.c: avoid unnecessary kernel/user pointer conversion Annotating a pointer from __user to kernel and then back again might confuse sparse. In copy_huge_page_from_user() it can be avoided by removing the intermediate variable since it is never used. Link: https://lkml.kernel.org/r/20210914150820.19326-1-amit.kachhap@arm.com Signed-off-by: Amit Daniel Kachhap Acked-by: Kirill A. Shutemov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c52be6d6b605..1b7032a30dd5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5421,7 +5421,6 @@ long copy_huge_page_from_user(struct page *dst_page, unsigned int pages_per_huge_page, bool allow_pagefault) { - void *src = (void *)usr_src; void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; @@ -5434,8 +5433,7 @@ long copy_huge_page_from_user(struct page *dst_page, else page_kaddr = kmap_atomic(subpage); rc = copy_from_user(page_kaddr, - (const void __user *)(src + i * PAGE_SIZE), - PAGE_SIZE); + usr_src + i * PAGE_SIZE, PAGE_SIZE); if (allow_pagefault) kunmap(subpage); else From 9ae0f87d009ca6c4aab2882641ddfc319727e3db Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:24 -0700 Subject: [PATCH 326/512] mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte Patch series "mm: A few cleanup patches around zap, shmem and uffd", v4. 
IMHO all of them are very nice cleanups to existing code already, they're all small and self-contained. They'll be needed by uffd-wp coming series. This patch (of 4): It was conditionally done previously, as there's one shmem special case that we use SetPageDirty() instead. However that's not necessary and it should be easier and cleaner to do it unconditionally in mfill_atomic_install_pte(). The most recent discussion about this is here, where Hugh explained the history of SetPageDirty() and why it's possible that it's not required at all: https://lore.kernel.org/lkml/alpine.LSU.2.11.2104121657050.1097@eggly.anvils/ Currently mfill_atomic_install_pte() has three callers: 1. shmem_mfill_atomic_pte 2. mcopy_atomic_pte 3. mcontinue_atomic_pte After the change: case (1) should have its SetPageDirty replaced by the dirty bit on pte (so we unify them together, finally), case (2) should have no functional change at all as it has page_in_cache==false, case (3) may add a dirty bit to the pte. However since case (3) is UFFDIO_CONTINUE for shmem, it's merely 100% sure the page is dirty after all because UFFDIO_CONTINUE normally requires another process to modify the page cache and kick the faulted thread, so should not make a real difference either. This should make it much easier to follow on which case will set dirty for uffd, as we'll simply set it all now for all uffd related ioctls. Meanwhile, no special handling of SetPageDirty() if there's no need. Link: https://lkml.kernel.org/r/20210915181456.10739-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210915181456.10739-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Liam Howlett Cc: Mike Rapoport Cc: Yang Shi Cc: David Hildenbrand Cc: "Kirill A . Shutemov" Cc: Jerome Glisse Cc: Alistair Popple Cc: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 - mm/userfaultfd.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7cd976b588ff..df2f0288b9ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2423,7 +2423,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - SetPageDirty(page); unlock_page(page); return 0; out_delete_from_cache: diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7a9008415534..caf6dfff2a60 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, pgoff_t offset, max_off; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable || !page_in_cache) - _dst_pte = pte_mkdirty(_dst_pte); if (writable) { if (wp_copy) _dst_pte = pte_mkuffd_wp(_dst_pte); From 2ca99358671ad3a2065389476701f69f39ab5e5f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:28 -0700 Subject: [PATCH 327/512] mm: clear vmf->pte after pte_unmap_same() returns pte_unmap_same() will always unmap the pte pointer. After the unmap, vmf->pte will not be valid any more, we should clear it. It was safe only because no one is accessing vmf->pte after pte_unmap_same() returns, since the only caller of pte_unmap_same() (so far) is do_swap_page(), where vmf->pte will in most cases be overwritten very soon. Directly pass in vmf into pte_unmap_same() and then we can also avoid the long parameter list too, which should be a nice cleanup. 
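As a small illustration of the hazard being closed (a sketch, not text from the patch): pte_unmap() tears down the temporary mapping, so any later dereference of a cached vmf->pte would go through a stale mapping (e.g. a stale kmap on CONFIG_HIGHPTE builds); clearing the pointer turns such a mistake into an obvious NULL dereference instead:

	if (!pte_unmap_same(vmf))
		goto out;
	/* vmf->pte is NULL from here on; it must be re-established with
	 * pte_offset_map_lock() before being dereferenced again */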
Link: https://lkml.kernel.org/r/20210915181533.11188-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Liam Howlett Acked-by: Hugh Dickins Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1b7032a30dd5..cf7b059b57a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2724,19 +2724,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range); * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) +static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); + spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(ptl); - same = pte_same(*page_table, orig_pte); + same = pte_same(*vmf->pte, vmf->orig_pte); spin_unlock(ptl); } #endif - pte_unmap(page_table); + pte_unmap(vmf->pte); + vmf->pte = NULL; return same; } @@ -3488,7 +3488,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vm_fault_t ret = 0; void *shadow = NULL; - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); From 232a6a1c0619d1b4d9cd8d21949b2f13821be0af Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:31 -0700 Subject: [PATCH 328/512] mm: drop first_index/last_index in zap_details The first_index/last_index parameters in zap_details are actually only used in unmap_mapping_range_tree(). At the meantime, this function is only called by unmap_mapping_pages() once. Instead of passing these two variables through the whole stack of page zapping code, remove them from zap_details and let them simply be parameters of unmap_mapping_range_tree(), which is inlined. Link: https://lkml.kernel.org/r/20210915181535.11238-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Reviewed-by: Liam Howlett Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jerome Glisse Cc: "Kirill A . 
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- mm/memory.c | 31 ++++++++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 32b2ecc47408..c5b600d7f85b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1688,8 +1688,6 @@ extern void user_shm_unlock(size_t, struct ucounts *); */ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ struct page *single_page; /* Locked page to be unmapped */ }; diff --git a/mm/memory.c b/mm/memory.c index cf7b059b57a7..d4ed701e3e5d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3321,20 +3321,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, + pgoff_t first_index, + pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; - vma_interval_tree_foreach(vma, root, - details->first_index, details->last_index) { - + vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - zba = details->first_index; + zba = first_index; if (zba < vba) zba = vba; - zea = details->last_index; + zea = last_index; if (zea > vea) zea = vea; @@ -3360,18 +3360,22 @@ void unmap_mapping_page(struct page *page) { struct address_space *mapping = page->mapping; struct zap_details details = { }; + pgoff_t first_index; + pgoff_t last_index; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageTail(page)); + first_index = page->index; + last_index = page->index + thp_nr_pages(page) - 1; + details.check_mapping = mapping; - details.first_index = page->index; - details.last_index = page->index + thp_nr_pages(page) - 1; details.single_page = page; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); i_mmap_unlock_write(mapping); } @@ -3391,16 +3395,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; + pgoff_t first_index = start; + pgoff_t last_index = start + nr - 1; details.check_mapping = even_cows ? NULL : mapping; - details.first_index = start; - details.last_index = start + nr - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; + if (last_index < first_index) + last_index = ULONG_MAX; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); From 91b61ef333cf43f96b3522a086c9ac925763d6e5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:34 -0700 Subject: [PATCH 329/512] mm: add zap_skip_check_mapping() helper Use the helper for the checks. Rename "check_mapping" into "zap_mapping" because "check_mapping" looks like a bool but in fact it stores the mapping itself. When it's set, we check the mapping (it must be non-NULL). When it's cleared we skip the check, which works like the old way. 
Move the duplicated comments to the helper too. Link: https://lkml.kernel.org/r/20210915181538.11288-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 +++++++++++++++- mm/memory.c | 29 ++++++----------------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c5b600d7f85b..6e29deb1e0d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1687,10 +1687,24 @@ extern void user_shm_unlock(size_t, struct ucounts *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct address_space *check_mapping; /* Check page->mapping if set */ + struct address_space *zap_mapping; /* Check page->mapping if set */ struct page *single_page; /* Locked page to be unmapped */ }; +/* + * We set details->zap_mappings when we want to unmap shared but keep private + * pages. Return true if skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index d4ed701e3e5d..48b2e048d267 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1333,16 +1333,8 @@ again: struct page *page; page = vm_normal_page(vma, addr, ptent); - if (unlikely(details) && page) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping && - details->check_mapping != page_rmapping(page)) - continue; - } + if (unlikely(zap_skip_check_mapping(details, page))) + continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); @@ -1375,17 +1367,8 @@ again: is_device_exclusive_entry(entry)) { struct page *page = pfn_swap_entry_to_page(entry); - if (unlikely(details && details->check_mapping)) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping != - page_rmapping(page)) - continue; - } - + if (unlikely(zap_skip_check_mapping(details, page))) + continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; @@ -3369,7 +3352,7 @@ void unmap_mapping_page(struct page *page) first_index = page->index; last_index = page->index + thp_nr_pages(page) - 1; - details.check_mapping = mapping; + details.zap_mapping = mapping; details.single_page = page; i_mmap_lock_write(mapping); @@ -3398,7 +3381,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.check_mapping = even_cows ? NULL : mapping; + details.zap_mapping = even_cows ? 
NULL : mapping; if (last_index < first_index) last_index = ULONG_MAX; From 03c4f20454e0231d2cdec4373841a3a25cf4efed Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 5 Nov 2021 13:38:38 -0700 Subject: [PATCH 330/512] mm: introduce pmd_install() helper Patch series "Do some code cleanups related to mm", v3. This patch (of 2): Currently we have three times the same few lines repeated in the code. Deduplicate them by newly introduced pmd_install() helper. Link: https://lkml.kernel.org/r/20210901102722.47686-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20210901102722.47686-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Mika Penttila Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 ++--------- mm/internal.h | 1 + mm/memory.c | 34 ++++++++++++++++------------------ 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 938f52702a7a..d05f8013a4f0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3211,15 +3211,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) } } - if (pmd_none(*vmf->pmd)) { - vmf->ptl = pmd_lock(mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(mm); - pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } + if (pmd_none(*vmf->pmd)) + pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); /* See comment in handle_pte_fault() */ if (pmd_devmap_trans_unstable(vmf->pmd)) { diff --git a/mm/internal.h b/mm/internal.h index cf3cb933eba3..6c3e1a9f8c5a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,6 +38,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { diff --git a/mm/memory.c b/mm/memory.c index 48b2e048d267..38d63f6d998b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -433,9 +433,20 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) +{ + spinlock_t *ptl = pmd_lock(mm, pmd); + + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + mm_inc_nr_ptes(mm); + pmd_populate(mm, pmd, *pte); + *pte = NULL; + } + spin_unlock(ptl); +} + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { - spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -455,13 +466,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - ptl = pmd_lock(mm, pmd); - if (likely(pmd_none(*pmd))) { /* Has another populated it ? 
*/ - mm_inc_nr_ptes(mm); - pmd_populate(mm, pmd, new); - new = NULL; - } - spin_unlock(ptl); + pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; @@ -4024,17 +4029,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { + if (vmf->prealloc_pte) + pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); + else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; - } } /* See comment in handle_pte_fault() */ From ed33b5a677da33d6e8f959879bb61e9791b80354 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 5 Nov 2021 13:38:41 -0700 Subject: [PATCH 331/512] mm: remove redundant smp_wmb() The smp_wmb() which is in the __pte_alloc() is used to ensure all ptes setup is visible before the pte is made visible to other CPUs by being put into page tables. We only need this when the pte is actually populated, so move it to pmd_install(). __pte_alloc_kernel(), __p4d_alloc(), __pud_alloc() and __pmd_alloc() are similar to this case. We can also defer smp_wmb() to the place where the pmd entry is really populated by preallocated pte. There are two kinds of user of preallocated pte, one is filemap & finish_fault(), another is THP. The former does not need another smp_wmb() because the smp_wmb() has been done by pmd_install(). Fortunately, the latter also does not need another smp_wmb() because there is already a smp_wmb() before populating the new pte when the THP uses a preallocated pte to split a huge pmd. Link: https://lkml.kernel.org/r/20210901102722.47686-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Acked-by: Kirill A. Shutemov Cc: Johannes Weiner Cc: Michal Hocko Cc: Mika Penttila Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 52 ++++++++++++++++++++------------------------- mm/sparse-vmemmap.c | 2 +- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 38d63f6d998b..5db36280950a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -439,6 +439,20 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_rmb() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } @@ -451,21 +465,6 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) if (!new) return -ENOMEM; - /* - * Ensure all pte setup (eg. pte page lock and page clearing) are - * visible before the pte is made visible to other CPUs by being - * put into page tables. 
- * - * The other side of the story is the pointer chasing in the page - * table walking code (when walking the page table without locking; - * ie. most of the time). Fortunately, these data accesses consist - * of a chain of data-dependent loads, meaning most CPUs (alpha - * being the notable exception) will already guarantee loads are - * seen in-order. See the alpha page table accessors for the - * smp_rmb() barriers in page table walking code. - */ - smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); @@ -478,10 +477,9 @@ int __pte_alloc_kernel(pmd_t *pmd) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } @@ -3845,7 +3843,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } ret = vma->vm_ops->fault(vmf); @@ -3916,7 +3913,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -4141,7 +4137,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); @@ -4815,13 +4810,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ + if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); - else + } else { + smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); + } spin_unlock(&mm->page_table_lock); return 0; } @@ -4838,11 +4833,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); @@ -4863,14 +4857,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); - } else /* Another has populated it */ + } else { /* Another has populated it */ pmd_free(mm, new); + } spin_unlock(ptl); return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bdce883f9286..db6df27c852a 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ + /* Make pte visible before pmd. See comment in pmd_install(). 
*/ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); From cbbb69d3c432da9d4afe734ca451fa2c012c05e2 Mon Sep 17 00:00:00 2001 From: Tiberiu A Georgescu Date: Fri, 5 Nov 2021 13:38:44 -0700 Subject: [PATCH 332/512] Documentation: update pagemap with shmem exceptions This patch follows the discussions on previous documentation patch threads [1][2]. It presents the exception case of shared memory management from the pagemap's point of view. It briefly describes what is missing, why it is missing and alternatives to the pagemap for page info retrieval in user space. In short, the kernel does not keep track of PTEs for swapped out shared pages within the processes that references them. Thus, the proc/pid/pagemap tool cannot print the swap destination of the shared memory pages, instead setting the pagemap entry to zero for both non-allocated and swapped out pages. This can create confusion for users who need information on swapped out pages. The reasons why maintaining the PTEs of all swapped out shared pages among all processes while maintaining similar performance is not a trivial task, or a desirable change, have been discussed extensively [1][3][4][5]. There are also arguments for why this arguably missing information should eventually be exposed to the user in either a future pagemap patch, or by an alternative tool. [1]: https://marc.info/?m=162878395426774 [2]: https://lore.kernel.org/lkml/20210920164931.175411-1-tiberiu.georgescu@nutanix.com/ [3]: https://lore.kernel.org/lkml/20210730160826.63785-1-tiberiu.georgescu@nutanix.com/ [4]: https://lore.kernel.org/lkml/20210807032521.7591-1-peterx@redhat.com/ [5]: https://lore.kernel.org/lkml/20210715201651.212134-1-peterx@redhat.com/ Mention the current missing information in the pagemap and alternatives on how to retrieve it, in case someone stumbles upon unexpected behaviour. Link: https://lkml.kernel.org/r/20210923064618.157046-1-tiberiu.georgescu@nutanix.com Link: https://lkml.kernel.org/r/20210923064618.157046-2-tiberiu.georgescu@nutanix.com Signed-off-by: Tiberiu A Georgescu Reviewed-by: Ivan Teterevkov Reviewed-by: Florian Schmidt Reviewed-by: Carl Waldspurger Reviewed-by: Jonathan Davies Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index fb578fbbb76c..4581527c07ae 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -196,6 +196,28 @@ you can go through every map in the process, find the PFNs, look those up in kpagecount, and tally up the number of pages that are only referenced once. +Exceptions for Shared Memory +============================ + +Page table entries for shared pages are cleared when the pages are zapped or +swapped out. This makes swapped out pages indistinguishable from never-allocated +ones. + +In kernel space, the swap location can still be retrieved from the page cache. +However, values stored only on the normal PTE get lost irretrievably when the +page is swapped out (i.e. SOFT_DIRTY). + +In user space, whether the page is present, swapped or none can be deduced with +the help of lseek and/or mincore system calls. 
+ +lseek() can differentiate between accessed pages (present or swapped out) and +holes (none/non-allocated) by specifying the SEEK_DATA flag on the file where +the pages are backed. For anonymous shared pages, the file can be found in +``/proc/pid/map_files/``. + +mincore() can differentiate between pages in memory (present, including swap +cache) and out of memory (swapped out or none/non-allocated). + Other notes =========== From e26e0cc30b48cad5f2704f6a22fa5cac9fdaaece Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 5 Nov 2021 13:39:00 -0700 Subject: [PATCH 333/512] memory: remove unused CONFIG_MEM_BLOCK_SIZE Commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") defines CONFIG_MEM_BLOCK_SIZE, but this has never been utilized anywhere. It is a good practice to keep the CONFIG_* defines exclusively for the Kbuild system. So, drop this unused definition. This issue was noticed due to running ./scripts/checkkconfigsymbols.py. Link: https://lkml.kernel.org/r/20211006120354.7468-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index 182c606adb06..053a530c7bdd 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -140,7 +140,6 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); -#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< Date: Fri, 5 Nov 2021 13:39:03 -0700 Subject: [PATCH 334/512] mm/mprotect.c: avoid repeated assignment in do_mprotect_pkey() After adjustment, the repeated assignment of "prev" is avoided, and the readability of the code is improved. Link: https://lkml.kernel.org/r/20211012152444.4127-1-fishland@aliyun.com Reviewed-by: Andrew Morton Signed-off-by: Liu Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 883e2cc85cad..e552f5e0ccbd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - prev = vma->vm_prev; + if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; @@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } } + if (start > vma->vm_start) prev = vma; + else + prev = vma->vm_prev; for (nstart = start ; ; ) { unsigned long mask_off_old_flags; From fdbef61491359947753c13581098878e8d038286 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 5 Nov 2021 13:39:06 -0700 Subject: [PATCH 335/512] mm/mremap: don't account pages in vma_to_resize() All this vm_unacct_memory(charged) dance seems to complicate the life without a good reason. Furthermore, it seems not always done right on error-pathes in mremap_to(). And worse than that: this `charged' difference is sometimes double-accounted for growing MREMAP_DONTUNMAP mremap()s in move_vma(): if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) Let's not do this. Account memory in mremap() fast-path for growing VMAs or in move_vma() for actually moving things. 
The same simpler way as it's done by vm_stat_account(), but with a difference to call security_vm_enough_memory_mm() before copying/adjusting VMA. Originally noticed by Chen Wandun: https://lkml.kernel.org/r/20210717101942.120607-1-chenwandun@huawei.com Link: https://lkml.kernel.org/r/20210721131320.522061-1-dima@arista.com Fixes: e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov Acked-by: Brian Geffon Cc: Alexander Viro Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Chen Wandun Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kefeng Wang Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Wei Yongjun Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index badfe17ade1f..c0b6c41b7b78 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -565,6 +565,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, bool *locked, unsigned long flags, struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap) { + long to_account = new_len - old_len; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; unsigned long vm_flags = vma->vm_flags; @@ -583,6 +584,9 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (unlikely(flags & MREMAP_DONTUNMAP)) + to_account = new_len; + if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); @@ -604,8 +608,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { - if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + if (vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT)) return -ENOMEM; } @@ -613,8 +617,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); if (!new_vma) { - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) - vm_unacct_memory(new_len >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT) + vm_unacct_memory(to_account >> PAGE_SHIFT); return -ENOMEM; } @@ -708,8 +712,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } static struct vm_area_struct *vma_to_resize(unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long flags, - unsigned long *p) + unsigned long old_len, unsigned long new_len, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -768,13 +771,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); - if (vma->vm_flags & VM_ACCOUNT) { - unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return ERR_PTR(-ENOMEM); - *p = charged; - } - return vma; } @@ -787,7 +783,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; unsigned long map_flags = 0; if (offset_in_page(new_addr)) @@ -830,7 +825,7 @@ static 
unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -853,7 +848,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); if (IS_ERR_VALUE(ret)) - goto out1; + goto out; /* We got a new mapping */ if (!(flags & MREMAP_FIXED)) @@ -862,12 +857,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, uf_unmap); - if (!(offset_in_page(ret))) - goto out; - -out1: - vm_unacct_memory(charged); - out: return ret; } @@ -899,7 +888,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; bool locked = false; bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; @@ -981,7 +969,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -992,10 +980,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + long pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma->vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, pages)) { + ret = -ENOMEM; + goto out; + } + } if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { + vm_unacct_memory(pages); ret = -ENOMEM; goto out; } @@ -1034,10 +1030,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, &locked, flags, &uf, &uf_unmap); } out: - if (offset_in_page(ret)) { - vm_unacct_memory(charged); + if (offset_in_page(ret)) locked = false; - } if (downgraded) mmap_read_unlock(current->mm); else From 2e86f78b117a019881a420705a9d0ef126253083 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 5 Nov 2021 13:39:10 -0700 Subject: [PATCH 336/512] include/linux/io-mapping.h: remove fallback for writecombine The fallback was introduced in commit 80c33624e472 ("io-mapping: Fixup for different names of writecombine") to fix the build on microblaze. 5 years later, it seems all archs now provide a pgprot_writecombine(), so just remove the other possible fallbacks. For microblaze, pgprot_writecombine() is available since commit 97ccedd793ac ("microblaze: Provide pgprot_device/writecombine macros for nommu"). 
This is build-tested on microblaze with a hack to always build mm/io-mapping.o and without DIYing on an x86-only macro (_PAGE_CACHE_MASK) Link: https://lkml.kernel.org/r/20211020204838.1142908-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi Cc: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index e9743cfd8585..66a774d2710e 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -132,13 +132,7 @@ io_mapping_init_wc(struct io_mapping *iomap, iomap->base = base; iomap->size = size; -#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ - iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); -#elif defined(pgprot_writecombine) iomap->prot = pgprot_writecombine(PAGE_KERNEL); -#else - iomap->prot = pgprot_noncached(PAGE_KERNEL); -#endif return iomap; } From f595e3411dcb664844eab807dfef61619eb9cf39 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 5 Nov 2021 13:39:13 -0700 Subject: [PATCH 337/512] mm: mmap_lock: remove redundant newline in TP_printk Ftrace core will add newline automatically on printing, so using it in TP_printkcreates a blank line. Link: https://lkml.kernel.org/r/20211009071105.69544-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Acked-by: Vlastimil Babka Reviewed-by: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Axel Rasmussen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmap_lock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index 0abff67b96f0..5f980c92e3e9 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -32,7 +32,7 @@ TRACE_EVENT_FN(mmap_lock_start_locking, ), TP_printk( - "mm=%p memcg_path=%s write=%s\n", + "mm=%p memcg_path=%s write=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" @@ -63,7 +63,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s\n", + "mm=%p memcg_path=%s write=%s success=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false", @@ -92,7 +92,7 @@ TRACE_EVENT_FN(mmap_lock_released, ), TP_printk( - "mm=%p memcg_path=%s write=%s\n", + "mm=%p memcg_path=%s write=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" From 627ae8284f50f220e8285119ca1716069c1c6a4d Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 5 Nov 2021 13:39:16 -0700 Subject: [PATCH 338/512] mm: mmap_lock: use DECLARE_EVENT_CLASS and DEFINE_EVENT_FN By using DECLARE_EVENT_CLASS and TRACE_EVENT_FN, we can save a lot of space from duplicate code. 
Link: https://lkml.kernel.org/r/20211009071243.70286-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Acked-by: Vlastimil Babka Reviewed-by: Steven Rostedt (VMware) Cc: Axel Rasmussen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmap_lock.h | 44 +++++++++----------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index 5f980c92e3e9..14db8044c1ff 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -13,7 +13,7 @@ struct mm_struct; extern int trace_mmap_lock_reg(void); extern void trace_mmap_lock_unreg(void); -TRACE_EVENT_FN(mmap_lock_start_locking, +DECLARE_EVENT_CLASS(mmap_lock, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), @@ -36,11 +36,19 @@ TRACE_EVENT_FN(mmap_lock_start_locking, __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg + ) ); +#define DEFINE_MMAP_LOCK_EVENT(name) \ + DEFINE_EVENT_FN(mmap_lock, name, \ + TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ + bool write), \ + TP_ARGS(mm, memcg_path, write), \ + trace_mmap_lock_reg, trace_mmap_lock_unreg) + +DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); +DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); + TRACE_EVENT_FN(mmap_lock_acquire_returned, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, @@ -73,34 +81,6 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, trace_mmap_lock_reg, trace_mmap_lock_unreg ); -TRACE_EVENT_FN(mmap_lock_released, - - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), - - TP_ARGS(mm, memcg_path, write), - - TP_STRUCT__entry( - __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) - __field(bool, write) - ), - - TP_fast_assign( - __entry->mm = mm; - __assign_str(memcg_path, memcg_path); - __entry->write = write; - ), - - TP_printk( - "mm=%p memcg_path=%s write=%s", - __entry->mm, - __get_str(memcg_path), - __entry->write ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg -); - #endif /* _TRACE_MMAP_LOCK_H */ /* This part must be outside protection */ From 228f778e973035185232ae745be0e3bc57dacea6 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:39:19 -0700 Subject: [PATCH 339/512] mm/vmalloc: repair warn_alloc()s in __vmalloc_area_node() Commit f255935b9767 ("mm: cleanup the gfp_mask handling in __vmalloc_area_node") added __GFP_NOWARN to gfp_mask unconditionally however it disabled all output inside warn_alloc() call. This patch saves original gfp_mask and provides it to all warn_alloc() calls. 
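For context on why the messages disappeared: warn_alloc() itself honours __GFP_NOWARN and returns early, so handing it the internally widened mask can never produce output. The sketch below is simplified and reproduced from memory (ratelimiting and the rest of the body are omitted); only the early return matters here.

	void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
	{
		if (gfp_mask & __GFP_NOWARN)	/* caller asked for silence */
			return;
		/* ... ratelimit check, then dump the failed allocation ... */
	}

Keeping the caller's flags in orig_gfp_mask therefore restores the reports while the actual page allocations stay quiet.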
Link: https://lkml.kernel.org/r/f4f3187b-9684-e426-565d-827c2a9bbb0e@virtuozzo.com Fixes: f255935b9767 ("mm: cleanup the gfp_mask handling in __vmalloc_area_node") Signed-off-by: Vasily Averin Reviewed-by: Christoph Hellwig Reviewed-by: Muchun Song Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8a807c78110..f43c88fa08cf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2887,6 +2887,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t orig_gfp_mask = gfp_mask; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; @@ -2907,7 +2908,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); @@ -2927,7 +2928,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via __vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; @@ -2935,7 +2936,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (vmap_pages_range(addr, addr + size, prot, area->pages, page_shift) < 0) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; From bd1a8fb2d43f7c293383f76691d7a55f7f89d9da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Nov 2021 13:39:22 -0700 Subject: [PATCH 340/512] mm/vmalloc: don't allow VM_NO_GUARD on vmap() The vmalloc guard pages are added on top of each allocation, thereby isolating any two allocations from one another. The top guard of the lower allocation is the bottom guard guard of the higher allocation etc. Therefore VM_NO_GUARD is dangerous; it breaks the basic premise of isolating separate allocations. There are only two in-tree users of this flag, neither of which use it through the exported interface. Ensure it stays this way. 
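The "your top guard is someone else's bottom guard" point follows from how the guard page is accounted: it is carved out of each area's own top. A sketch of get_vm_area_size() as it exists in include/linux/vmalloc.h (reproduced from memory) shows this:

	static inline size_t get_vm_area_size(const struct vm_struct *area)
	{
		if (!(area->flags & VM_NO_GUARD))
			/* return actual size without guard page */
			return area->size - PAGE_SIZE;
		return area->size;
	}

An area created with VM_NO_GUARD thus also removes the unmapped cushion sitting in front of its neighbour, which is why vmap() now warns and clears the flag.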
Link: https://lkml.kernel.org/r/YUMfdA36fuyZ+/xt@hirez.programming.kicks-ass.net Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Acked-by: Will Deacon Acked-by: Kees Cook Cc: Andrey Konovalov Cc: Mel Gorman Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- mm/vmalloc.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0ed56fc10c11..6e022cc712e6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,7 +22,7 @@ struct notifier_block; /* in notifier.h */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ -#define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f43c88fa08cf..4a11abd9e70f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2743,6 +2743,13 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + /* + * Your top guard is someone else's bottom guard. Not having a top + * guard compromises someone else's mappings too. + */ + if (WARN_ON_ONCE(flags & VM_NO_GUARD)) + flags &= ~VM_NO_GUARD; + if (count > totalram_pages()) return NULL; From 51e50b3a22937ab7b350f05af7e3b79b7ff73dd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 13:39:25 -0700 Subject: [PATCH 341/512] mm/vmalloc: make show_numa_info() aware of hugepage mappings show_numa_info() can be slightly faster, by skipping over hugepages directly. Link: https://lkml.kernel.org/r/20211001172725.105824-1-eric.dumazet@gmail.com Signed-off-by: Eric Dumazet Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4a11abd9e70f..6c1b97fd9b3d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3864,6 +3864,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; + unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; @@ -3875,9 +3876,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) memset(counters, 0, nr_node_ids * sizeof(unsigned int)); - for (nr = 0; nr < v->nr_pages; nr++) - counters[page_to_nid(v->pages[nr])]++; - + for (nr = 0; nr < v->nr_pages; nr += step) + counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); From 7cc7913e8e61ac436497d01a64963770d1600f5d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 13:39:28 -0700 Subject: [PATCH 342/512] mm/vmalloc: make sure to dump unpurged areas in /proc/vmallocinfo If last va found in vmap_area_list does not have a vm pointer, vmallocinfo.s_show() returns 0, and show_purge_info() is not called as it should. 
Link: https://lkml.kernel.org/r/20211001170815.73321-1-eric.dumazet@gmail.com Fixes: dd3b8353bae7 ("mm/vmalloc: do not keep unpurged areas in the busy tree") Signed-off-by: Eric Dumazet Cc: Uladzislau Rezki (Sony) Cc: Pengfei Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6c1b97fd9b3d..0a740fb055ec 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3913,7 +3913,7 @@ static int s_show(struct seq_file *m, void *p) (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); - return 0; + goto final; } v = va->vm; @@ -3954,6 +3954,7 @@ static int s_show(struct seq_file *m, void *p) /* * As a final step, dump "unpurged" areas. */ +final: if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); From 9f531973dff39c671219352573ad5df6d0d9a58c Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 5 Nov 2021 13:39:31 -0700 Subject: [PATCH 343/512] mm/vmalloc: do not adjust the search size for alignment overhead We used to include an alignment overhead into a search length, in that case we guarantee that a found area will definitely fit after applying a specific alignment that user specifies. From the other hand we do not guarantee that an area has the lowest address if an alignment is >= PAGE_SIZE. It means that, when a user specifies a special alignment together with a range that corresponds to an exact requested size then an allocation will fail. This is what happens to KASAN, it wants the free block that exactly matches a specified range during onlining memory banks: [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory82/state [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory83/state [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory85/state [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory84/state vmap allocation for size 16777216 failed: use vmalloc= to increase size bash: vmalloc: allocation failure: 16777216 bytes, mode:0x6000c0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 4 PID: 1644 Comm: bash Kdump: loaded Not tainted 4.18.0-339.el8.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8e/0xd0 warn_alloc.cold.90+0x8a/0x1b2 ? zone_watermark_ok_safe+0x300/0x300 ? slab_free_freelist_hook+0x85/0x1a0 ? __get_vm_area_node+0x240/0x2c0 ? kfree+0xdd/0x570 ? kmem_cache_alloc_node_trace+0x157/0x230 ? notifier_call_chain+0x90/0x160 __vmalloc_node_range+0x465/0x840 ? mark_held_locks+0xb7/0x120 Fix it by making sure that find_vmap_lowest_match() returns lowest start address with any given alignment value, i.e. for alignments bigger then PAGE_SIZE the algorithm rolls back toward parent nodes checking right sub-trees if the most left free block did not fit due to alignment overhead. 
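The failure is easiest to see with concrete numbers. The values below are illustrative only; the 2 MiB alignment is an assumption chosen to match the 16 MiB onlining case above:

	unsigned long size  = 16UL << 20;	/* requested size: 16 MiB     */
	unsigned long align =  2UL << 20;	/* requested alignment: 2 MiB */
	unsigned long block = 16UL << 20;	/* free block: exactly 16 MiB,
						   already suitably aligned  */

	unsigned long length = size + align - 1;	/* old search length */

The old check "block >= length" is false, so the exactly fitting block is rejected and the allocation fails. With the fix the tree is searched for "block >= size", and the walk only rolls back to a right sub-tree when applying the alignment really pushes the area past the block's end.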
Link: https://lkml.kernel.org/r/20211004142829.22222-1-urezki@gmail.com Fixes: 68ad4a330433 ("mm/vmalloc.c: keep track of free blocks for vmap allocation") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: Ping Fang Tested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0a740fb055ec..0d835bd7fb5f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1195,18 +1195,14 @@ find_vmap_lowest_match(unsigned long size, { struct vmap_area *va; struct rb_node *node; - unsigned long length; /* Start from the root. */ node = free_vmap_area_root.rb_node; - /* Adjust the search size for alignment overhead. */ - length = size + align - 1; - while (node) { va = rb_entry(node, struct vmap_area, rb_node); - if (get_subtree_max_size(node->rb_left) >= length && + if (get_subtree_max_size(node->rb_left) >= size && vstart < va->va_start) { node = node->rb_left; } else { @@ -1216,9 +1212,9 @@ find_vmap_lowest_match(unsigned long size, /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is - * equal or bigger to the requested search length. + * equal or bigger to the requested search size. */ - if (get_subtree_max_size(node->rb_right) >= length) { + if (get_subtree_max_size(node->rb_right) >= size) { node = node->rb_right; continue; } @@ -1226,15 +1222,23 @@ find_vmap_lowest_match(unsigned long size, /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen - * only once due to "vstart" restriction. + * due to "vstart" restriction or an alignment overhead + * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; - if (get_subtree_max_size(node->rb_right) >= length && + if (get_subtree_max_size(node->rb_right) >= size && vstart <= va->va_start) { + /* + * Shift the vstart forward. Please note, we update it with + * parent's start address adding "1" because we do not want + * to enter same sub-tree after it has already been checked + * and no suitable free block found there. + */ + vstart = va->va_start + 1; node = node->rb_right; break; } From 066fed59d8a1bab4f3213e8fe413c54e4a76b77a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 5 Nov 2021 13:39:34 -0700 Subject: [PATCH 344/512] mm/vmalloc: check various alignments when debugging Before we did not guarantee a free block with lowest start address for allocations with alignment >= PAGE_SIZE. Because an alignment overhead was included into a search length like below: length = size + align - 1; doing so we make sure that a bigger block would fit after applying an alignment adjustment. Now there is no such limitation, i.e. any alignment that user wants to apply will result to a lowest address of returned free area. 
Link: https://lkml.kernel.org/r/20211004142829.22222-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Hillf Danton Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Ping Fang Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0d835bd7fb5f..d4770396799a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1269,7 +1269,7 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size) +find_vmap_lowest_match_check(unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1278,8 +1278,8 @@ find_vmap_lowest_match_check(unsigned long size) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, 1, vstart); - va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + va_1 = find_vmap_lowest_match(size, align, vstart); + va_2 = find_vmap_lowest_linear_match(size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1458,7 +1458,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size); + find_vmap_lowest_match_check(size, align); #endif return nva_start_addr; From dd544141b9eb8f3c58dedc9c1dcc4803de0eed45 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:39:37 -0700 Subject: [PATCH 345/512] vmalloc: back off when the current task is OOM-killed Huge vmalloc allocation on heavy loaded node can lead to a global memory shortage. Task called vmalloc can have worst badness and be selected by OOM-killer, however taken fatal signal does not interrupt allocation cycle. Vmalloc repeat page allocaions again and again, exacerbating the crisis and consuming the memory freed up by another killed tasks. After a successful completion of the allocation procedure, a fatal signal will be processed and task will be destroyed finally. However it may not release the consumed memory, since the allocated object may have a lifetime unrelated to the completed task. In the worst case, this can lead to the host will panic due to "Out of memory and no killable processes..." This patch allows OOM-killer to break vmalloc cycle, makes OOM more effective and avoid host panic. It does not check oom condition directly, however, and breaks page allocation cycle when fatal signal was received. This may trigger some hidden problems, when caller does not handle vmalloc failures, or when rollaback after failed vmalloc calls own vmallocs inside. However all of these scenarios are incorrect: vmalloc does not guarantee successful allocation, it has never been called with __GFP_NOFAIL and threfore either should not be used for any rollbacks or should handle such errors correctly and not lead to critical failures. 
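Since this makes a NULL return more likely once a fatal signal is pending, the usual caller pattern is worth restating; a minimal example of the expected handling (buf and len are placeholders):

	buf = vmalloc(len);
	if (!buf)
		return -ENOMEM;	/* never assume vmalloc() succeeded */

Any rollback path that cannot tolerate such a failure was already broken, as noted above, because vmalloc() has never been a guaranteed allocation.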
Link: https://lkml.kernel.org/r/83efc664-3a65-2adb-d7c4-2885784cf109@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d4770396799a..2cd23fa00a13 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2871,6 +2871,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { + if (fatal_signal_pending(current)) + break; + if (nid == NUMA_NO_NODE) page = alloc_pages(gfp, order); else From 0eb68437a7f9dfef9c218873310c66c714f2fa99 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:39:41 -0700 Subject: [PATCH 346/512] vmalloc: choose a better start address in vm_area_register_early() Percpu embedded first chunk allocator is the firstly option, but it could fail on ARM64, eg, percpu: max_distance=0x5fcfdc640000 too large for vmalloc space 0x781fefff0000 percpu: max_distance=0x600000540000 too large for vmalloc space 0x7dffb7ff0000 percpu: max_distance=0x5fff9adb0000 too large for vmalloc space 0x5dffb7ff0000 then we could get to WARNING: CPU: 15 PID: 461 at vmalloc.c:3087 pcpu_get_vm_areas+0x488/0x838 and the system cannot boot successfully. Let's implement page mapping percpu first chunk allocator as a fallback to the embedding allocator to increase the robustness of the system. Also fix a crash when both NEED_PER_CPU_PAGE_FIRST_CHUNK and KASAN_VMALLOC enabled. Tested on ARM64 qemu with cmdline "percpu_alloc=page". This patch (of 3): There are some fixed locations in the vmalloc area be reserved in ARM(see iotable_init()) and ARM64(see map_kernel()), but for pcpu_page_first_chunk(), it calls vm_area_register_early() and choose VMALLOC_START as the start address of vmap area which could be conflicted with above address, then could trigger a BUG_ON in vm_area_add_early(). Let's choose a suit start address by traversing the vmlist. 
Link: https://lkml.kernel.org/r/20210910053354.26721-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210910053354.26721-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Catalin Marinas Cc: Will Deacon Cc: Andrey Ryabinin Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Marco Elver Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2cd23fa00a13..74da45a8beec 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2276,15 +2276,21 @@ void __init vm_area_add_early(struct vm_struct *vm) */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { - static size_t vm_init_off __initdata; - unsigned long addr; + unsigned long addr = ALIGN(VMALLOC_START, align); + struct vm_struct *cur, **p; - addr = ALIGN(VMALLOC_START + vm_init_off, align); - vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + BUG_ON(vmap_initialized); + for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { + if ((unsigned long)cur->addr - addr >= vm->size) + break; + addr = ALIGN((unsigned long)cur->addr + cur->size, align); + } + + BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; - - vm_area_add_early(vm); + vm->next = *p; + *p = vm; } static void vmap_init_free_space(void) From 09cea6195073ee1d0f076d907d9249045757245d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:39:44 -0700 Subject: [PATCH 347/512] arm64: support page mapping percpu first chunk allocator Percpu embedded first chunk allocator is the firstly option, but it could fails on ARM64, eg, percpu: max_distance=0x5fcfdc640000 too large for vmalloc space 0x781fefff0000 percpu: max_distance=0x600000540000 too large for vmalloc space 0x7dffb7ff0000 percpu: max_distance=0x5fff9adb0000 too large for vmalloc space 0x5dffb7ff0000 then we could get WARNING: CPU: 15 PID: 461 at vmalloc.c:3087 pcpu_get_vm_areas+0x488/0x838 and the system could not boot successfully. Let's implement page mapping percpu first chunk allocator as a fallback to the embedding allocator to increase the robustness of the system. 
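For reference, the page-based first chunk allocator can also be requested explicitly on the kernel command line via the existing percpu_alloc= parameter handled by mm/percpu.c, which is how the QEMU test above was run. The warning line below only illustrates the shape of the new fallback message; the allocator name and errno will vary:

	percpu_alloc=page

	PERCPU: embed allocator failed (-22), falling back to page size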
Link: https://lkml.kernel.org/r/20210910053354.26721-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Catalin Marinas Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Marco Elver Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 4 ++ drivers/base/arch_numa.c | 82 +++++++++++++++++++++++++++++++++++----- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fee914c716aa..67f24ee6cf57 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1042,6 +1042,10 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK def_bool y depends on NUMA +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + depends on NUMA + source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 00fb4120a5b3..28222688ace7 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -14,6 +14,7 @@ #include #include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -168,22 +169,83 @@ static void __init pcpu_fc_free(void *ptr, size_t size) memblock_free_early(__pa(ptr), size); } +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +static void __init pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); +} +#endif + void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; - int rc; + int rc = -EINVAL; - /* - * Always reserve area for module percpu variables. That's - * what the legacy allocator did. - */ - rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, - pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
+ */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + if (rc < 0) + pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); +#endif + } + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) - panic("Failed to initialize percpu areas."); + rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + pcpu_fc_alloc, + pcpu_fc_free, + pcpu_populate_pte); +#endif + if (rc < 0) + panic("Failed to initialize percpu areas (err=%d).", rc); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) From 3252b1d8309ea42bc6329d9341072ecf1c9505c0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:39:47 -0700 Subject: [PATCH 348/512] kasan: arm64: fix pcpu_page_first_chunk crash with KASAN_VMALLOC With KASAN_VMALLOC and NEED_PER_CPU_PAGE_FIRST_CHUNK the kernel crashes: Unable to handle kernel paging request at virtual address ffff7000028f2000 ... swapper pgtable: 64k pages, 48-bit VAs, pgdp=0000000042440000 [ffff7000028f2000] pgd=000000063e7c0003, p4d=000000063e7c0003, pud=000000063e7c0003, pmd=000000063e7b0003, pte=0000000000000000 Internal error: Oops: 96000007 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.0-rc4-00003-gc6e6e28f3f30-dirty #62 Hardware name: linux,dummy-virt (DT) pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO BTYPE=--) pc : kasan_check_range+0x90/0x1a0 lr : memcpy+0x88/0xf4 sp : ffff80001378fe20 ... Call trace: kasan_check_range+0x90/0x1a0 pcpu_page_first_chunk+0x3f0/0x568 setup_per_cpu_areas+0xb8/0x184 start_kernel+0x8c/0x328 The vm area used in vm_area_register_early() has no kasan shadow memory, Let's add a new kasan_populate_early_vm_area_shadow() function to populate the vm area shadow memory to fix the issue. 
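The crash happens because generic KASAN derives shadow addresses purely arithmetically, so an early vm area gets a shadow range that nothing has mapped yet. A sketch of the translation, reproduced from memory from include/linux/kasan.h (KASAN_SHADOW_SCALE_SHIFT is 3 for generic KASAN, i.e. one shadow byte per 8 bytes of memory):

	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;
	}

The area handed out by vm_area_register_early() lies in the vmalloc range, so its shadow span must be populated before instrumented memcpy()/memset() touch it, which is what the new kasan_populate_early_vm_area_shadow() hook does.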
[wangkefeng.wang@huawei.com: fix redefinition of 'kasan_populate_early_vm_area_shadow'] Link: https://lkml.kernel.org/r/20211011123211.3936196-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210910053354.26721-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Marco Elver [KASAN] Acked-by: Andrey Konovalov [KASAN] Acked-by: Catalin Marinas Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/kasan_init.c | 16 ++++++++++++++++ include/linux/kasan.h | 6 ++++++ mm/kasan/shadow.c | 5 +++++ mm/vmalloc.c | 1 + 4 files changed, 28 insertions(+) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 61b52a92b8b6..5b996ca4d996 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -287,6 +287,22 @@ static void __init kasan_init_depth(void) init_task.kasan_depth = 0; } +#ifdef CONFIG_KASAN_VMALLOC +void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size) +{ + unsigned long shadow_start, shadow_end; + + if (!is_vmalloc_or_module_addr(start)) + return; + + shadow_start = (unsigned long)kasan_mem_to_shadow(start); + shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow(start + size); + shadow_end = ALIGN(shadow_end, PAGE_SIZE); + kasan_map_populate(shadow_start, shadow_end, NUMA_NO_NODE); +} +#endif + void __init kasan_init(void) { kasan_init_shadow(); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 736d7b458996..65487a3b2a71 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -436,6 +436,8 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); + #else /* CONFIG_KASAN_VMALLOC */ static inline int kasan_populate_vmalloc(unsigned long start, @@ -453,6 +455,10 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_start, unsigned long free_region_end) {} +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } + #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 8d95ee52d019..4a4929b29a23 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init); #ifdef CONFIG_KASAN_VMALLOC +void __init __weak kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ +} + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 74da45a8beec..75fa100f5424 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2291,6 +2291,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm->addr = (void *)addr; vm->next = *p; *p = vm; + kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void vmap_init_free_space(void) From b7d90e7a5ea8d64e668d5685925900d33d3884d5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Nov 2021 13:39:50 -0700 Subject: [PATCH 349/512] mm/vmalloc: be more explicit about supported gfp flags The core of the vmalloc allocator __vmalloc_area_node doesn't say anything about gfp mask argument. Not all gfp flags are supported though. Be more explicit about constraints. 
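In caller terms the constraints documented above boil down to a short list. The calls below are only a sketch (p and size are placeholders, error handling elided) of what the comment says is and is not supported:

	p = __vmalloc(size, GFP_KERNEL);			/* preferred */
	p = __vmalloc(size, GFP_NOFS | __GFP_NOWARN);		/* fs reclaim context, quiet on failure */
	p = __vmalloc(size, GFP_KERNEL | __GFP_NOFAIL);		/* supported, may retry indefinitely */

	p = __vmalloc(size, GFP_NOWAIT);			/* no direct reclaim: not supported */
	p = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);	/* zone modifier: not supported */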
Link: https://lkml.kernel.org/r/20211020082545.4830-1-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Dave Chinner Cc: Neil Brown Cc: Christoph Hellwig Cc: Uladzislau Rezki Cc: Ilya Dryomov Cc: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75fa100f5424..c56720136c45 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2983,8 +2983,16 @@ fail: * @caller: caller's return address * * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * allocator with @gfp_mask flags. Please note that the full set of gfp + * flags are not supported. GFP_KERNEL would be a preferred allocation mode + * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not + * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka + * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka + * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). + * __GFP_NOWARN can be used to suppress error messages about failures. + * + * Map them into contiguous kernel virtual space, using a pagetable + * protection of @prot. * * Return: the address of the area or %NULL on failure */ From c00b6b9610991c042ff4c3153daaa3ea8522c210 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 5 Nov 2021 13:39:53 -0700 Subject: [PATCH 350/512] mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc, vmalloc will allocate pages with NUMA_NO_NODE by default, that will result in alloc page one by one; In order to solve this, __alloc_pages_bulk and mempolicy should be considered at the same time. 1) If node is specified in memory allocation request, it will alloc all pages by __alloc_pages_bulk. 2) If interleaving allocate memory, it will cauculate how many pages should be allocated in each node, and use __alloc_pages_bulk to alloc pages in each node. 
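The split in case 2) is plain integer arithmetic; a worked example with assumed numbers (10 pages, interleave policy over 3 nodes) makes the per-node batching concrete:

	unsigned long nr_pages = 10, nodes = 3;
	unsigned long nr_pages_per_node = nr_pages / nodes;	/* 3 */
	int delta = nr_pages - nodes * nr_pages_per_node;	/* 1 */

The first node returned by interleave_nodes() gets nr_pages_per_node + 1 = 4 pages (one extra for as long as delta is non-zero), the remaining two nodes get 3 each, and every per-node batch is a single __alloc_pages_bulk() call instead of page-by-page allocation.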
[1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun Reviewed-by: Uladzislau Rezki (Sony) Cc: Eric Dumazet Cc: Shakeel Butt Cc: Nicholas Piggin Cc: Kefeng Wang Cc: Hanjun Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++ mm/mempolicy.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ mm/vmalloc.c | 20 ++++++++--- 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fbd4abc33f24..c1b262725dca 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -535,6 +535,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct list_head *page_list, struct page **page_array); +unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + unsigned long nr_pages, + struct page **page_array); + /* Bulk allocate order-0 pages */ static inline unsigned long alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d12e0608fced..ce722333d9f6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2196,6 +2196,88 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages); +static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + int nodes; + unsigned long nr_pages_per_node; + int delta; + int i; + unsigned long nr_allocated; + unsigned long total_allocated = 0; + + nodes = nodes_weight(pol->nodes); + nr_pages_per_node = nr_pages / nodes; + delta = nr_pages - nodes * nr_pages_per_node; + + for (i = 0; i < nodes; i++) { + if (delta) { + nr_allocated = __alloc_pages_bulk(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node + 1, NULL, + page_array); + delta--; + } else { + nr_allocated = __alloc_pages_bulk(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node, NULL, page_array); + } + + page_array += nr_allocated; + total_allocated += nr_allocated; + } + + return total_allocated; +} + +static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + gfp_t preferred_gfp; + unsigned long nr_allocated = 0; + + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + + nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_pages, NULL, page_array); + + if (nr_allocated < nr_pages) + nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_pages - nr_allocated, NULL, + page_array + nr_allocated); + return nr_allocated; +} + +/* alloc pages bulk and mempolicy should be considered at the + * same time in some situation such as vmalloc. + * + * It can accelerate memory allocation especially interleaving + * allocate memory. 
+ */ +unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + unsigned long nr_pages, struct page **page_array) +{ + struct mempolicy *pol = &default_policy; + + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + if (pol->mode == MPOL_INTERLEAVE) + return alloc_pages_bulk_array_interleave(gfp, pol, + nr_pages, page_array); + + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_bulk_array_preferred_many(gfp, + numa_node_id(), pol, nr_pages, page_array); + + return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol), nr_pages, NULL, + page_array); +} + int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(vma_policy(src)); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c56720136c45..d2a00ad4e1dd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2843,7 +2843,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * to fails, fallback to a single page allocator that is * more permissive. */ - if (!order && nid != NUMA_NO_NODE) { + if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2855,8 +2855,20 @@ vm_area_alloc_pages(gfp_t gfp, int nid, */ nr_pages_request = min(100U, nr_pages - nr_allocated); - nr = alloc_pages_bulk_array_node(gfp, nid, - nr_pages_request, pages + nr_allocated); + /* memory allocation should consider mempolicy, we can't + * wrongly use nearest node when nid == NUMA_NO_NODE, + * otherwise memory may be allocated in only one node, + * but mempolcy want to alloc memory by interleaving. + */ + if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) + nr = alloc_pages_bulk_array_mempolicy(gfp, + nr_pages_request, + pages + nr_allocated); + + else + nr = alloc_pages_bulk_array_node(gfp, nid, + nr_pages_request, + pages + nr_allocated); nr_allocated += nr; cond_resched(); @@ -2868,7 +2880,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else if (order) + } else /* * Compound pages required for remap_vmalloc_page if * high-order pages. From 34b46efd6ec6a6f2d6a57be4216b820c879a0030 Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 5 Nov 2021 13:39:56 -0700 Subject: [PATCH 351/512] lib/test_vmalloc.c: use swap() to make code cleaner Use swap() in order to make code cleaner. Issue found by coccinelle. Link: https://lkml.kernel.org/r/20211028111443.15744-1-deng.changcheng@zte.com.cn Signed-off-by: Changcheng Deng Reported-by: Zeal Robot Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_vmalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index e14993bc84d2..cf41fd6df42a 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -393,7 +393,7 @@ static struct test_driver { static void shuffle_array(int *arr, int n) { unsigned int rnd; - int i, j, x; + int i, j; for (i = n - 1; i > 0; i--) { get_random_bytes(&rnd, sizeof(rnd)); @@ -402,9 +402,7 @@ static void shuffle_array(int *arr, int n) j = rnd % i; /* Swap indexes. 
*/ - x = arr[i]; - arr[i] = arr[j]; - arr[j] = x; + swap(arr[i], arr[j]); } } From 084f7e2377e89ccbc8375b5486c6b4c16682f602 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 13:39:59 -0700 Subject: [PATCH 352/512] mm/large system hash: avoid possible NULL deref in alloc_large_system_hash If __vmalloc() returned NULL, is_vm_area_hugepages(NULL) will fault if CONFIG_HAVE_ARCH_HUGE_VMALLOC=y Link: https://lkml.kernel.org/r/20210915212530.2321545-1-eric.dumazet@gmail.com Fixes: 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings") Signed-off-by: Eric Dumazet Reviewed-by: Andrew Morton Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23d3339ac4e8..133dad9f4669 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8762,7 +8762,8 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; - huge = is_vm_area_hugepages(table); + if (table) + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free From ea808b4efd15f6f019e9779617a166c9708856c1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:40:02 -0700 Subject: [PATCH 353/512] mm/page_alloc.c: remove meaningless VM_BUG_ON() in pindex_to_order() Patch series "Cleanups and fixup for page_alloc", v2. This series contains cleanups to remove meaningless VM_BUG_ON(), use helpers to simplify the code and remove obsolete comment. Also we avoid allocating highmem pages via alloc_pages_exact[_nid]. More details can be found in the respective changelogs. This patch (of 5): It's meaningless to VM_BUG_ON() order != pageblock_order just after setting order to pageblock_order. Remove it. Link: https://lkml.kernel.org/r/20210902121242.41607-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210902121242.41607-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Reviewed-by: David Hildenbrand Cc: Vlastimil Babka Cc: Stephen Rothwell Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 133dad9f4669..afaa544f129f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -677,10 +677,8 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (order > PAGE_ALLOC_COSTLY_ORDER) order = pageblock_order; - VM_BUG_ON(order != pageblock_order); - } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif From ff7ed9e4532d14e0478d192548e77d78d72387e9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:40:05 -0700 Subject: [PATCH 354/512] mm/page_alloc.c: simplify the code by using macro K() Use helper macro K() to convert the pages to the corresponding size. Minor readability improvement. 
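K() is the long-standing pages-to-kibibytes helper already defined locally in mm/page_alloc.c; quoted from memory, with a small worked value:

	#define K(x) ((x) << (PAGE_SHIFT - 10))		/* pages -> KiB */

With 4 KiB pages (PAGE_SHIFT == 12) this is a shift by 2, so K(300) == 1200 and the existing "Freeing %s memory: %ldK" line prints 1200K for 300 freed pages.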
Link: https://lkml.kernel.org/r/20210902121242.41607-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Reviewed-by: David Hildenbrand Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afaa544f129f..8224c6c302dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8130,8 +8130,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char } if (pages && s) - pr_info("Freeing %s memory: %ldK\n", - s, pages << (PAGE_SHIFT - 10)); + pr_info("Freeing %s memory: %ldK\n", s, K(pages)); return pages; } @@ -8176,14 +8175,13 @@ void __init mem_init_print_info(void) ", %luK highmem" #endif ")\n", - nr_free_pages() << (PAGE_SHIFT - 10), - physpages << (PAGE_SHIFT - 10), + K(nr_free_pages()), K(physpages), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10) + K(physpages - totalram_pages() - totalcma_pages), + K(totalcma_pages) #ifdef CONFIG_HIGHMEM - , totalhigh_pages() << (PAGE_SHIFT - 10) + , K(totalhigh_pages()) #endif ); } From 7cba630bd830472f88ada5fdd34bbfd5825d9217 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:40:08 -0700 Subject: [PATCH 355/512] mm/page_alloc.c: fix obsolete comment in free_pcppages_bulk() The second two paragraphs about "all pages pinned" and pages_scanned is obsolete. And There are PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP orders in pcp. So the same order assumption is not held now. Link: https://lkml.kernel.org/r/20210902121242.41607-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Cc: David Hildenbrand Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8224c6c302dd..c494c9c66f4f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1428,14 +1428,8 @@ static inline void prefetch_buddy(struct page *page) /* * Frees a number of pages from the PCP lists - * Assumes all pages on list are in same zone, and of same order. + * Assumes all pages on list are in same zone. * count is the number of pages to free. - * - * If the zone was previously in an "all pages pinned" state then look to - * see if this freeing clears that state. - * - * And clear the zone's pages_scanned counter, to hold off the "all pages are - * pinned" detection logic. */ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) From 86fb05b9cc1ac7cdcf37e5408b927dd3ad95db96 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:40:11 -0700 Subject: [PATCH 356/512] mm/page_alloc.c: use helper function zone_spans_pfn() Use helper function zone_spans_pfn() to check whether pfn is within a zone to simplify the code slightly. 
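The helper replaces the open-coded range check; its definition (reproduced from memory from include/linux/mmzone.h) makes the equivalence with the removed condition clear:

	static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
	{
		return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
	}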
Link: https://lkml.kernel.org/r/20210902121242.41607-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Mel Gorman Reviewed-by: David Hildenbrand Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c494c9c66f4f..26aebbbf1f66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1583,7 +1583,7 @@ static void __meminit init_reserved_page(unsigned long pfn) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct zone *zone = &pgdat->node_zones[zid]; - if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + if (zone_spans_pfn(zone, pfn)) break; } __init_single_page(pfn_to_page(pfn), pfn, zid, nid); From ba7f1b9e3fd9a2a64c5da04995919c9eb5bab974 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:40:15 -0700 Subject: [PATCH 357/512] mm/page_alloc.c: avoid allocating highmem pages via alloc_pages_exact[_nid] Don't use with __GFP_HIGHMEM because page_address() cannot represent highmem pages without kmap(). Newly allocated pages would leak as page_address() will return NULL for highmem pages here. But It works now because the callers do not specify __GFP_HIGHMEM now. Link: https://lkml.kernel.org/r/20210902121242.41607-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26aebbbf1f66..307ad23cb9e9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5610,8 +5610,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); unsigned long addr; - if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) - gfp_mask &= ~__GFP_COMP; + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); @@ -5635,8 +5635,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); struct page *p; - if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) - gfp_mask &= ~__GFP_COMP; + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); p = alloc_pages_node(nid, gfp_mask, order); if (!p) From 6cf253925df72e522c06dac09ede7e81a6e38121 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Fri, 5 Nov 2021 13:40:18 -0700 Subject: [PATCH 358/512] mm/page_alloc: print node fallback order Patch series "Fix NUMA nodes fallback list ordering". For a NUMA system that has multiple nodes at same distance from other nodes, the fallback list generation prefers same node order for them instead of round-robin thereby penalizing one node over others. This series fixes it. More description of the problem and the fix is present in the patch description. This patch (of 2): Print information message about the allocation fallback order for each NUMA node during boot. No functional changes here. This makes it easier to illustrate the problem in the node fallback list generation, which the next patch fixes. 
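For illustration only, not part of the patch: a user-space sketch of the logging loop added below, with a hand-filled node_order[] standing in for the order computed by build_zonelists_in_node_order() and printf() standing in for pr_info()/pr_cont().

  #include <stdio.h>

  int main(void)
  {
          int node_order[] = { 1, 0, 3, 2 };   /* assumed fallback order for node 1 */
          int nr_nodes = 4, local_node = 1, node;

          /* prints "Fallback order for Node 1: 1 0 3 2" */
          printf("Fallback order for Node %d: ", local_node);
          for (node = 0; node < nr_nodes; node++)
                  printf("%d ", node_order[node]);
          printf("\n");
          return 0;
  }
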
Link: https://lkml.kernel.org/r/20210830121603.1081-1-bharata@amd.com Link: https://lkml.kernel.org/r/20210830121603.1081-2-bharata@amd.com Signed-off-by: Bharata B Rao Acked-by: Mel Gorman Reviewed-by: Anshuman Khandual Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Krupa Ramakrishnan Cc: Sadagopan Srinivasan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 307ad23cb9e9..163f60dcac47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6262,6 +6262,10 @@ static void build_zonelists(pg_data_t *pgdat) build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + pr_info("Fallback order for Node %d: ", local_node); + for (node = 0; node < nr_nodes; node++) + pr_cont("%d ", node_order[node]); + pr_cont("\n"); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES From 54d032ced98378bcb9d32dd5e378b7e402b36ad8 Mon Sep 17 00:00:00 2001 From: Krupa Ramakrishnan Date: Fri, 5 Nov 2021 13:40:21 -0700 Subject: [PATCH 359/512] mm/page_alloc: use accumulated load when building node fallback list In build_zonelists(), when the fallback list is built for the nodes, the node load gets reinitialized during each iteration. This results in nodes with same distances occupying the same slot in different node fallback lists rather than appearing in the intended round- robin manner. This results in one node getting picked for allocation more compared to other nodes with the same distance. As an example, consider a 4 node system with the following distance matrix. Node 0 1 2 3 ---------------- 0 10 12 32 32 1 12 10 32 32 2 32 32 10 12 3 32 32 12 10 For this case, the node fallback list gets built like this: Node Fallback list --------------------- 0 0 1 2 3 1 1 0 3 2 2 2 3 0 1 3 3 2 0 1 <-- Unexpected fallback order In the fallback list for nodes 2 and 3, the nodes 0 and 1 appear in the same order which results in more allocations getting satisfied from node 0 compared to node 1. The effect of this on remote memory bandwidth as seen by stream benchmark is shown below: Case 1: Bandwidth from cores on nodes 2 & 3 to memory on nodes 0 & 1 (numactl -m 0,1 ./stream_lowOverhead ... --cores ) Case 2: Bandwidth from cores on nodes 0 & 1 to memory on nodes 2 & 3 (numactl -m 2,3 ./stream_lowOverhead ... --cores ) ---------------------------------------- BANDWIDTH (MB/s) TEST Case 1 Case 2 ---------------------------------------- COPY 57479.6 110791.8 SCALE 55372.9 105685.9 ADD 50460.6 96734.2 TRIADD 50397.6 97119.1 ---------------------------------------- The bandwidth drop in Case 1 occurs because most of the allocations get satisfied by node 0 as it appears first in the fallback order for both nodes 2 and 3. This can be fixed by accumulating the node load in build_zonelists() rather than reinitializing it during each iteration. With this the nodes with the same distance rightly get assigned in the round robin manner. In fact this was how it was originally until commit f0c0b2b808f2 ("change zonelist order: zonelist order selection logic") dropped the load accumulation and resorted to initializing the load during each iteration. While zonelist ordering was removed by commit c9bff3eebc09 ("mm, page_alloc: rip out ZONELIST_ORDER_ZONE"), the change to the node load accumulation in build_zonelists() remained. So essentially this patch reverts back to the accumulated node load logic. 
After this fix, the fallback order gets built like this: Node Fallback list ------------------ 0 0 1 2 3 1 1 0 3 2 2 2 3 0 1 3 3 2 1 0 <-- Note the change here The bandwidth in Case 1 improves and matches Case 2 as shown below. ---------------------------------------- BANDWIDTH (MB/s) TEST Case 1 Case 2 ---------------------------------------- COPY 110438.9 110107.2 SCALE 105930.5 105817.5 ADD 97005.1 96159.8 TRIADD 97441.5 96757.1 ---------------------------------------- The correctness of the fallback list generation has been verified for the above node configuration where the node 3 starts as memory-less node and comes up online only during memory hotplug. [bharata@amd.com: Added changelog, review, test validation] Link: https://lkml.kernel.org/r/20210830121603.1081-3-bharata@amd.com Fixes: f0c0b2b808f2 ("change zonelist order: zonelist order selection logic") Signed-off-by: Krupa Ramakrishnan Co-developed-by: Sadagopan Srinivasan Signed-off-by: Sadagopan Srinivasan Signed-off-by: Bharata B Rao Acked-by: Mel Gorman Reviewed-by: Anshuman Khandual Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 163f60dcac47..1eab16b4006d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6253,7 +6253,7 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] = load; + node_load[node] += load; node_order[nr_nodes++] = node; prev_node = node; From 61bb6cd2f765b90cfc5f0f91696889c366a6a13d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 5 Nov 2021 13:40:24 -0700 Subject: [PATCH 360/512] mm: move node_reclaim_distance to fix NUMA without SMP Patch series "Fix NUMA without SMP". SuperH is the only architecture which still supports NUMA without SMP, for good reasons (various memories scattered around the address space, each with varying latencies). This series fixes two build errors due to variables and functions used by the NUMA code being provided by SMP-only source files or sections. This patch (of 2): If CONFIG_NUMA=y, but CONFIG_SMP=n (e.g. sh/migor_defconfig): sh4-linux-gnu-ld: mm/page_alloc.o: in function `get_page_from_freelist': page_alloc.c:(.text+0x2c24): undefined reference to `node_reclaim_distance' Fix this by moving the declaration of node_reclaim_distance from an SMP-only to a generic file. 
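For illustration only, not part of the patch: a single-file user-space sketch of the failure mode, reusing the kernel symbol name as a plain variable. Building it with -DCONFIG_SMP links fine; building it without the define fails with an undefined reference, just as CONFIG_NUMA=y, CONFIG_SMP=n kernels did before this change.

  /* numa_without_smp.c - models the breakage in plain user-space C */
  #include <stdio.h>

  #ifdef CONFIG_SMP
  int node_reclaim_distance = 30;     /* definition lives in "SMP-only" code */
  #endif

  extern int node_reclaim_distance;   /* generic code references it regardless */

  int main(void)
  {
          /* undefined reference here when compiled without -DCONFIG_SMP */
          printf("node_reclaim_distance = %d\n", node_reclaim_distance);
          return 0;
  }
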
Link: https://lkml.kernel.org/r/cover.1631781495.git.geert+renesas@glider.be Link: https://lkml.kernel.org/r/6432666a648dde85635341e6c918cee97c97d264.1631781495.git.geert+renesas@glider.be Fixes: a55c7454a8c887b2 ("sched/topology: Improve load balancing on AMD EPYC systems") Signed-off-by: Geert Uytterhoeven Suggested-by: Matt Fleming Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Vlastimil Babka Cc: Yoshinori Sato Cc: Rich Felker Cc: Gon Solo Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/topology.c | 1 - mm/page_alloc.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4e8698e62f07..738ee7fa7972 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1481,7 +1481,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1eab16b4006d..e4aef907abe9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3960,6 +3960,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= From ebeac3ea995b59e823510937ab92825004e3fbaa Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 5 Nov 2021 13:40:28 -0700 Subject: [PATCH 361/512] mm: move fold_vm_numa_events() to fix NUMA without SMP If CONFIG_NUMA=y, but CONFIG_SMP=n (e.g. sh/migor_defconfig): sh4-linux-gnu-ld: mm/vmstat.o: in function `vmstat_start': vmstat.c:(.text+0x97c): undefined reference to `fold_vm_numa_events' sh4-linux-gnu-ld: drivers/base/node.o: in function `node_read_vmstat': node.c:(.text+0x140): undefined reference to `fold_vm_numa_events' sh4-linux-gnu-ld: drivers/base/node.o: in function `node_read_numastat': node.c:(.text+0x1d0): undefined reference to `fold_vm_numa_events' Fix this by moving fold_vm_numa_events() outside the SMP-only section. 
Link: https://lkml.kernel.org/r/9d16ccdd9ef32803d7100c84f737de6a749314fb.1631781495.git.geert+renesas@glider.be Fixes: f19298b9516c1a03 ("mm/vmstat: convert NUMA statistics to basic NUMA counters") Signed-off-by: Geert Uytterhoeven Acked-by: Mel Gorman Cc: Gon Solo Cc: Ingo Molnar Cc: Juri Lelli Cc: Matt Fleming Cc: Peter Zijlstra Cc: Rich Felker Cc: Vincent Guittot Cc: Vlastimil Babka Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 56 ++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 8ce2620344b2..5db54581feab 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); EXPORT_SYMBOL(vm_node_stat); +#ifdef CONFIG_NUMA +static void fold_vm_zone_numa_events(struct zone *zone) +{ + unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; + int cpu; + enum numa_stat_item item; + + for_each_online_cpu(cpu) { + struct per_cpu_zonestat *pzstats; + + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); + } + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_event_add(zone_numa_events[item], zone, item); +} + +void fold_vm_numa_events(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + fold_vm_zone_numa_events(zone); +} +#endif + #ifdef CONFIG_SMP int calculate_pressure_threshold(struct zone *zone) @@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff) return changes; } -#ifdef CONFIG_NUMA -static void fold_vm_zone_numa_events(struct zone *zone) -{ - unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; - int cpu; - enum numa_stat_item item; - - for_each_online_cpu(cpu) { - struct per_cpu_zonestat *pzstats; - - pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); - for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) - zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); - } - - for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) - zone_numa_event_add(zone_numa_events[item], zone, item); -} - -void fold_vm_numa_events(void) -{ - struct zone *zone; - - for_each_populated_zone(zone) - fold_vm_zone_numa_events(zone); -} -#endif - /* * Update the zone counters for the current cpu. * From 8446b59baaf45e83e1187cdb174ac78ac5d7d0ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 13:40:31 -0700 Subject: [PATCH 362/512] mm/page_alloc.c: do not acquire zone lock in is_free_buddy_page() Grabbing zone lock in is_free_buddy_page() gives a wrong sense of safety, and has potential performance implications when zone is experiencing lock contention. In any case, if a caller needs a stable result, it should grab zone lock before calling this function. Link: https://lkml.kernel.org/r/20210922152833.4023972-1-eric.dumazet@gmail.com Signed-off-by: Eric Dumazet Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4aef907abe9..e891560e0a80 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9356,21 +9356,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif +/* + * This function returns a stable result only if called under zone lock. 
+ */ bool is_free_buddy_page(struct page *page) { - struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); - unsigned long flags; unsigned int order; - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && buddy_order(page_head) >= order) + if (PageBuddy(page_head) && + buddy_order_unsafe(page_head) >= order) break; } - spin_unlock_irqrestore(&zone->lock, flags); return order < MAX_ORDER; } From 8ca1b5a49885f0c0c486544da46a9e0ac790831d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 5 Nov 2021 13:40:34 -0700 Subject: [PATCH 363/512] mm/page_alloc: detect allocation forbidden by cpuset and bail out early There was a report that starting an Ubuntu in docker while using cpuset to bind it to movable nodes (a node only has movable zone, like a node for hotplug or a Persistent Memory node in normal usage) will fail due to memory allocation failure, and then OOM is involved and many other innocent processes got killed. It can be reproduced with command: $ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status" (where node 4 is a movable node) runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0 CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased) Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020 Call Trace: dump_stack+0x6b/0x88 dump_header+0x4a/0x1e2 oom_kill_process.cold+0xb/0x10 out_of_memory.part.0+0xaf/0x230 out_of_memory+0x3d/0x80 __alloc_pages_slowpath.constprop.0+0x954/0xa20 __alloc_pages_nodemask+0x2d3/0x300 pipe_write+0x322/0x590 new_sync_write+0x196/0x1b0 vfs_write+0x1c3/0x1f0 ksys_write+0xa7/0xe0 do_syscall_64+0x52/0xd0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Mem-Info: active_anon:392832 inactive_anon:182 isolated_anon:0 active_file:68130 inactive_file:151527 isolated_file:0 unevictable:2701 dirty:0 writeback:7 slab_reclaimable:51418 slab_unreclaimable:116300 mapped:45825 shmem:735 pagetables:2540 bounce:0 free:159849484 free_pcp:73 free_cma:0 Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? 
no Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 0 Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0 Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0 oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB The reason is that in this case, the target cpuset nodes only have movable zone, while the creation of an OS in docker sometimes needs to allocate memory in non-movable zones (dma/dma32/normal) like GFP_HIGHUSER, and the cpuset limit forbids the allocation, then out-of-memory killing is involved even when normal nodes and movable nodes both have many free memory. The OOM killer cannot help to resolve the situation as there is no usable memory for the request in the cpuset scope. The only reasonable measure to take is to fail the allocation right away and have the caller to deal with it. So add a check for cases like this in the slowpath of allocation, and bail out early returning NULL for the allocation. As page allocation is one of the hottest path in kernel, this check will hurt all users with sane cpuset configuration, add a static branch check and detect the abnormal config in cpuset memory binding setup so that the extra check cost in page allocation is not paid by everyone. 
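For illustration only, not part of the patch: a user-space sketch of the guard pattern described above, with a plain bool standing in for the static branch and a hard-coded helper standing in for the real zonelist walk; the function names here are made up.

  #include <stdbool.h>
  #include <stdio.h>

  /* stand-in for the static branch: flipped once when a bad config is seen */
  static bool insane_config_detected;

  static bool zonelist_has_usable_zone(void)
  {
          /* placeholder for the real walk over the cpuset's zonelist */
          return false;
  }

  /* hot path: the expensive check runs only after the flag was raised */
  static bool allocation_must_fail(bool hardwall)
  {
          if (!insane_config_detected || !hardwall)
                  return false;
          return !zonelist_has_usable_zone();
  }

  int main(void)
  {
          printf("%d\n", allocation_must_fail(true));   /* 0: flag not set */
          insane_config_detected = true;                /* set at cpuset-update time */
          printf("%d\n", allocation_must_fail(true));   /* 1: flagged and no zone */
          return 0;
  }
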
[thanks to Micho Hocko and David Rientjes for suggesting not handling it inside OOM code, adding cpuset check, refining comments] Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: David Rientjes Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 17 +++++++++++++++++ include/linux/mmzone.h | 22 ++++++++++++++++++++++ kernel/cgroup/cpuset.c | 23 +++++++++++++++++++++++ mm/page_alloc.c | 13 +++++++++++++ 4 files changed, 75 insertions(+) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d2b9c41c8edf..d58e0476ee8e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -34,6 +34,8 @@ */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; +extern struct static_key_false cpusets_insane_config_key; + static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); @@ -51,6 +53,19 @@ static inline void cpuset_dec(void) static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } +/* + * This will get enabled whenever a cpuset configuration is considered + * unsupportable in general. E.g. movable only node which cannot satisfy + * any non movable allocations (see update_nodemask). Page allocator + * needs to make additional checks for those configurations and this + * check is meant to guard those checks without any overhead for sane + * configurations. + */ +static inline bool cpusets_insane_config(void) +{ + return static_branch_unlikely(&cpusets_insane_config_key); +} + extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); @@ -167,6 +182,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline bool cpusets_enabled(void) { return false; } +static inline bool cpusets_insane_config(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 832aa49d0d8e..fb36a29e3aae 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1220,6 +1220,28 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) +/* Whether the 'nodes' are all movable nodes */ +static inline bool movable_only_nodes(nodemask_t *nodes) +{ + struct zonelist *zonelist; + struct zoneref *z; + int nid; + + if (nodes_empty(*nodes)) + return false; + + /* + * We can chose arbitrary node from the nodemask to get a + * zonelist as they are interlinked. We just need to find + * at least one zone that can satisfy kernel allocations. + */ + nid = first_node(*nodes); + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; + z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); + return (!z->zone) ? 
true : false; +} + + #ifdef CONFIG_SPARSEMEM #include #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2a9695ccb65f..d0e163a02099 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -69,6 +69,13 @@ DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); +/* + * There could be abnormal cpuset configurations for cpu or memory + * node binding, add this key to provide a quick low-cost judgement + * of the situation. + */ +DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); + /* See "Frequency meter" comments, below. */ struct fmeter { @@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); +static inline void check_insane_mems_config(nodemask_t *nodes) +{ + if (!cpusets_insane_config() && + movable_only_nodes(nodes)) { + static_branch_enable(&cpusets_insane_config_key); + pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" + "Cpuset allocations might fail even with a lot of memory available.\n", + nodemask_pr_args(nodes)); + } +} + /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting @@ -1870,6 +1888,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; + check_insane_mems_config(&trialcs->mems_allowed); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; spin_unlock_irq(&callback_lock); @@ -3173,6 +3193,9 @@ update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); + if (mems_updated) + check_insane_mems_config(&new_mems); + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e891560e0a80..e493d7da2614 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4910,6 +4910,19 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; + /* + * Check for insane configurations where the cpuset doesn't contain + * any suitable zone to satisfy the request - e.g. non-movable + * GFP_HIGHUSER allocations from MOVABLE nodes only. + */ + if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { + struct zoneref *z = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, + &cpuset_current_mems_allowed); + if (!z->zone) + goto nopage; + } + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); From a6ea8b5b9f1ce3403a1c8516035d653006741e80 Mon Sep 17 00:00:00 2001 From: Liangcai Fan Date: Fri, 5 Nov 2021 13:40:37 -0700 Subject: [PATCH 364/512] mm/page_alloc.c: show watermark_boost of zone in zoneinfo min/low/high_wmark_pages(z) is defined as (z->_watermark[WMARK_MIN/LOW/HIGH] + z->watermark_boost) If kswapd is frequently woken up due to the increase of min/low/high_wmark_pages, printing watermark_boost can quickly locate whether watermark_boost or _watermark[WMARK_MIN/LOW/HIGH] caused min/low/high_wmark_pages to increase. 
Link: https://lkml.kernel.org/r/1632472566-12246-1-git-send-email-liangcaifan19@gmail.com Signed-off-by: Liangcai Fan Cc: Chunyan Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ mm/vmstat.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e493d7da2614..6714039cddf4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5993,6 +5993,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "%s" " free:%lukB" + " boost:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" @@ -6013,6 +6014,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->watermark_boost), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/vmstat.c b/mm/vmstat.c index 5db54581feab..ae87c90e0b4e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1656,6 +1656,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n pages free %lu" + "\n boost %lu" "\n min %lu" "\n low %lu" "\n high %lu" @@ -1664,6 +1665,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n managed %lu" "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), + zone->watermark_boost, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), From d2635f2012a44e3d469ab9a4022162dbe0e53f21 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Nov 2021 13:40:40 -0700 Subject: [PATCH 365/512] mm: create a new system state and fix core_kernel_text() core_kernel_text() considers that until system_state in at least SYSTEM_RUNNING, init memory is valid. But init memory is freed a few lines before setting SYSTEM_RUNNING, so we have a small period of time when core_kernel_text() is wrong. Create an intermediate system state called SYSTEM_FREEING_INIT that is set before starting freeing init memory, and use it in core_kernel_text() to report init memory invalid earlier. 
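For illustration only, not part of the patch: a user-space sketch of the ordering the new state relies on. The enum mirrors the ordering added in the diff below; in_init_text() and its address bounds are made-up stand-ins for init_kernel_text().

  #include <stdbool.h>
  #include <stdio.h>

  /* mirrors the ordering that the comparison below relies on */
  enum system_states {
          SYSTEM_BOOTING,
          SYSTEM_SCHEDULING,
          SYSTEM_FREEING_INITMEM,   /* new intermediate state */
          SYSTEM_RUNNING,
  } system_state = SYSTEM_BOOTING;

  /* stand-in for init_kernel_text(): is addr inside the init sections? */
  static bool in_init_text(unsigned long addr)
  {
          return addr >= 0x1000 && addr < 0x2000;   /* made-up bounds */
  }

  static bool init_text_is_valid(unsigned long addr)
  {
          /* init memory only counts as kernel text until we start freeing it */
          return system_state < SYSTEM_FREEING_INITMEM && in_init_text(addr);
  }

  int main(void)
  {
          printf("%d\n", init_text_is_valid(0x1800));   /* 1 while booting */
          system_state = SYSTEM_FREEING_INITMEM;        /* set just before freeing */
          printf("%d\n", init_text_is_valid(0x1800));   /* 0 once freeing starts */
          return 0;
  }
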
Link: https://lkml.kernel.org/r/9ecfdee7dd4d741d172cb93ff1d87f1c58127c9a.1633001016.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Gerald Schaefer Cc: Kefeng Wang Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + init/main.c | 2 ++ kernel/extable.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2776423a587e..471bc0593679 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -248,6 +248,7 @@ extern bool early_boot_irqs_disabled; extern enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, + SYSTEM_FREEING_INITMEM, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/init/main.c b/init/main.c index 3c4054a95545..767ee2672176 100644 --- a/init/main.c +++ b/init/main.c @@ -1506,6 +1506,8 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + + system_state = SYSTEM_FREEING_INITMEM; kprobe_free_init_mem(); ftrace_free_init_mem(); kgdb_free_init_mem(); diff --git a/kernel/extable.c b/kernel/extable.c index b0ea5eb0c3b4..290661f68e6b 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -76,7 +76,7 @@ int notrace core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state < SYSTEM_RUNNING && + if (system_state < SYSTEM_FREEING_INITMEM && init_kernel_text(addr)) return 1; return 0; From e5ae3728327fda5209454d738eebdb20443fdfac Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Nov 2021 13:40:43 -0700 Subject: [PATCH 366/512] mm: make generic arch_is_kernel_initmem_freed() do what it says Commit 7a5da02de8d6 ("locking/lockdep: check for freed initmem in static_obj()") added arch_is_kernel_initmem_freed() which is supposed to report whether an object is part of already freed init memory. For the time being, the generic version of arch_is_kernel_initmem_freed() always reports 'false', allthough free_initmem() is generically called on all architectures. Therefore, change the generic version of arch_is_kernel_initmem_freed() to check whether free_initmem() has been called. If so, then check if a given address falls into init memory. To ease the use of system_state, move it out of line into its only caller which is lockdep.c Link: https://lkml.kernel.org/r/1d40783e676e07858be97d881f449ee7ea8adfb1.1633001016.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Gerald Schaefer Cc: Kefeng Wang Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/sections.h | 14 -------------- kernel/locking/lockdep.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d16302d3eb59..596ab2092289 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -80,20 +80,6 @@ static inline int arch_is_kernel_data(unsigned long addr) } #endif -/* - * Check if an address is part of freed initmem. This is needed on architectures - * with virt == phys kernel mapping, for code that wants to check if an address - * is part of a static object within [_stext, _end]. After initmem is freed, - * memory can be allocated from it, and such allocations would then have - * addresses within the range [_stext, _end]. 
- */ -#ifndef arch_is_kernel_initmem_freed -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - return 0; -} -#endif - /** * memory_contains - checks if an object is contained within a memory region * @begin: virtual address of the beginning of the memory region diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf1c00c881e4..8e118caf835e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ +/* + * Check if an address is part of freed initmem. After initmem is freed, + * memory can be allocated from it, and such allocations would then have + * addresses within the range [_stext, _end]. + */ +#ifndef arch_is_kernel_initmem_freed +static int arch_is_kernel_initmem_freed(unsigned long addr) +{ + if (system_state < SYSTEM_FREEING_INITMEM) + return 0; + + return init_section_contains((void *)addr, 1); +} +#endif + static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext, From e012a25d81a12fb332e862b51bfb59321acf96e4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Nov 2021 13:40:46 -0700 Subject: [PATCH 367/512] powerpc: use generic version of arch_is_kernel_initmem_freed() The generic version of arch_is_kernel_initmem_freed() now does the same as powerpc version. Remove the powerpc version. Link: https://lkml.kernel.org/r/c53764eb45d41491e2b21da2e7812239897dbebb.1633001016.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Kefeng Wang Cc: Benjamin Herrenschmidt Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/sections.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 6e4af4492a14..79cb7a25a5fb 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -6,21 +6,8 @@ #include #include -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include -extern bool init_mem_is_free; - -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (!init_mem_is_free) - return 0; - - return addr >= (unsigned long)__init_begin && - addr < (unsigned long)__init_end; -} - extern char __head_end[]; #ifdef __powerpc64__ From 564f6ea1a689b06284641c12e164f3c5b57f3aaf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Nov 2021 13:40:49 -0700 Subject: [PATCH 368/512] s390: use generic version of arch_is_kernel_initmem_freed() The generic version of arch_is_kernel_initmem_freed() now does the same as s390 version. Remove the s390 version. 
Link: https://lkml.kernel.org/r/b6feb5dfe611a322de482762fc2df3a9eece70c7.1633001016.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Acked-by: Heiko Carstens Cc: Gerald Schaefer Cc: Kefeng Wang Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/sections.h | 12 ------------ arch/s390/mm/init.c | 3 --- 2 files changed, 15 deletions(-) diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 85881dd48022..3fecaa4e8b74 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,20 +2,8 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include -extern bool initmem_freed; - -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (!initmem_freed) - return 0; - return addr >= (unsigned long)__init_begin && - addr < (unsigned long)__init_end; -} - /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. The decompressor will store values in them, and diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a04faf49001a..8c6f258a6183 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -58,8 +58,6 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -bool initmem_freed; - static void __init setup_zero_pages(void) { unsigned int order; @@ -214,7 +212,6 @@ void __init mem_init(void) void free_initmem(void) { - initmem_freed = true; __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); From 9c25cbfcb38462803a3d68f5d88e66a587f5f045 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 5 Nov 2021 13:40:52 -0700 Subject: [PATCH 369/512] mm: page_alloc: use migrate_disable() in drain_local_pages_wq() drain_local_pages_wq() disables preemption to avoid CPU migration during CPU hotplug and can't use cpus_read_lock(). Using migrate_disable() works here, too. The scheduler won't take the CPU offline until the task left the migrate-disable section. The problem with disabled preemption here is that drain_local_pages() acquires locks which are turned into sleeping locks on PREEMPT_RT and can't be acquired with disabled preemption. Use migrate_disable() in drain_local_pages_wq(). Link: https://lkml.kernel.org/r/20211015210933.viw6rjvo64qtqxn4@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6714039cddf4..2aafc337d068 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3141,9 +3141,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is alright but we also have to make sure to not move to * a different one. */ - preempt_disable(); + migrate_disable(); drain_local_pages(drain->zone); - preempt_enable(); + migrate_enable(); } /* From 59d336bdf6931a6a8c2ba41e533267d1cc799fc9 Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Fri, 5 Nov 2021 13:40:55 -0700 Subject: [PATCH 370/512] mm/page_alloc: use clamp() to simplify code This patch uses clamp() to simplify code in init_per_zone_wmark_min(). 
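For illustration only, not part of the patch: a user-space sketch of the simplification, with naive stand-ins for the kernel's clamp() and int_sqrt() helpers and a made-up lowmem size.

  #include <stdio.h>

  /* naive user-space stand-ins for the kernel helpers */
  #define clamp(val, lo, hi)  ((val) < (lo) ? (lo) : (val) > (hi) ? (hi) : (val))

  static unsigned long int_sqrt(unsigned long x)
  {
          unsigned long r = 0;

          while ((r + 1) * (r + 1) <= x)
                  r++;
          return r;
  }

  int main(void)
  {
          /* assumption: ~16 GiB of lowmem, expressed in kilobytes */
          unsigned long lowmem_kbytes = 16UL * 1024 * 1024;
          int new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

          /* one clamp() expression replaces the two open-coded range checks */
          printf("min_free_kbytes = %d\n", clamp(new_min_free_kbytes, 128, 262144));
          return 0;
  }
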
Link: https://lkml.kernel.org/r/20211021034830.1049150-1-bobo.shaobowang@huawei.com Signed-off-by: Wang ShaoBo Reviewed-by: David Hildenbrand Cc: Wei Yongjun Cc: Li Bin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2aafc337d068..a7035467bf6d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8477,16 +8477,12 @@ int __meminit init_per_zone_wmark_min(void) lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (new_min_free_kbytes > user_min_free_kbytes) { - min_free_kbytes = new_min_free_kbytes; - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 262144) - min_free_kbytes = 262144; - } else { + if (new_min_free_kbytes > user_min_free_kbytes) + min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); + else pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); - } + setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); From 477d01fce8dacb41cae9363136ea56812e8baa59 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:40:58 -0700 Subject: [PATCH 371/512] mm: fix data race in PagePoisoned() PagePoisoned() accesses page->flags which can be updated concurrently: | BUG: KCSAN: data-race in next_uptodate_page / unlock_page | | write (marked) to 0xffffea00050f37c0 of 8 bytes by task 1872 on cpu 1: | instrument_atomic_write include/linux/instrumented.h:87 [inline] | clear_bit_unlock_is_negative_byte include/asm-generic/bitops/instrumented-lock.h:74 [inline] | unlock_page+0x102/0x1b0 mm/filemap.c:1465 | filemap_map_pages+0x6c6/0x890 mm/filemap.c:3057 | ... | read to 0xffffea00050f37c0 of 8 bytes by task 1873 on cpu 0: | PagePoisoned include/linux/page-flags.h:204 [inline] | PageReadahead include/linux/page-flags.h:382 [inline] | next_uptodate_page+0x456/0x830 mm/filemap.c:2975 | ... | CPU: 0 PID: 1873 Comm: systemd-udevd Not tainted 5.11.0-rc4-00001-gf9ce0be71d1f #1 To avoid the compiler tearing or otherwise optimizing the access, use READ_ONCE() to access flags. Link: https://lore.kernel.org/all/20210826144157.GA26950@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20210913113542.2658064-1-elver@google.com Reported-by: kernel test robot Signed-off-by: Marco Elver Acked-by: Kirill A. Shutemov Acked-by: Will Deacon Cc: Marco Elver Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fbfd3fad48f2..dd80cf0bb579 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -215,7 +215,7 @@ static __always_inline int PageCompound(struct page *page) #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return page->flags == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM From ba9eb3cef9e699e259f9ceefdbcd3ee83d3529e2 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 5 Nov 2021 13:41:01 -0700 Subject: [PATCH 372/512] mm/memory_failure: constify static mm_walk_ops The only usage of hwp_walk_ops is to pass its address to walk_page_range() which takes a pointer to const mm_walk_ops as argument. 
Make it const to allow the compiler to put it in read-only memory. Link: https://lkml.kernel.org/r/20211014075042.17174-3-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Acked-by: Naoya Horiguchi Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8376cfe0efce..54a5d28ea601 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -674,7 +674,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, #define hwpoison_hugetlb_range NULL #endif -static struct mm_walk_ops hwp_walk_ops = { +static const struct mm_walk_ops hwp_walk_ops = { .pmd_entry = hwpoison_pte_range, .hugetlb_entry = hwpoison_hugetlb_range, }; From e0f43fa50605f89d45708bce3b94e408ef5c4342 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:04 -0700 Subject: [PATCH 373/512] mm: filemap: coding style cleanup for filemap_map_pmd() Patch series "Solve silent data loss caused by poisoned page cache (shmem/tmpfs)", v5. When discussing the patch that splits page cache THP in order to offline the poisoned page, Noaya mentioned there is a bigger problem [1] that prevents this from working since the page cache page will be truncated if uncorrectable errors happen. By looking this deeper it turns out this approach (truncating poisoned page) may incur silent data loss for all non-readonly filesystems if the page is dirty. It may be worse for in-memory filesystem, e.g. shmem/tmpfs since the data blocks are actually gone. To solve this problem we could keep the poisoned dirty page in page cache then notify the users on any later access, e.g. page fault, read/write, etc. The clean page could be truncated as is since they can be reread from disk later on. The consequence is the filesystems may find poisoned page and manipulate it as healthy page since all the filesystems actually don't check if the page is poisoned or not in all the relevant paths except page fault. In general, we need make the filesystems be aware of poisoned page before we could keep the poisoned page in page cache in order to solve the data loss problem. To make filesystems be aware of poisoned page we should consider: - The page should be not written back: clearing dirty flag could prevent from writeback. - The page should not be dropped (it shows as a clean page) by drop caches or other callers: the refcount pin from hwpoison could prevent from invalidating (called by cache drop, inode cache shrinking, etc), but it doesn't avoid invalidation in DIO path. - The page should be able to get truncated/hole punched/unlinked: it works as it is. - Notify users when the page is accessed, e.g. read/write, page fault and other paths (compression, encryption, etc). The scope of the last one is huge since almost all filesystems need do it once a page is returned from page cache lookup. There are a couple of options to do it: 1. Check hwpoison flag for every path, the most straightforward way. 2. Return NULL for poisoned page from page cache lookup, the most callsites check if NULL is returned, this should have least work I think. But the error handling in filesystems just return -ENOMEM, the error code will incur confusion to the users obviously. 3. To improve #2, we could return error pointer, e.g. ERR_PTR(-EIO), but this will involve significant amount of code change as well since all the paths need check if the pointer is ERR or not just like option #1. 
I did prototypes for both #1 and #3, but it seems #3 may require more changes than #1. For #3 ERR_PTR will be returned so all the callers need to check the return value otherwise invalid pointer may be dereferenced, but not all callers really care about the content of the page, for example, partial truncate which just sets the truncated range in one page to 0. So for such paths it needs additional modification if ERR_PTR is returned. And if the callers have their own way to handle the problematic pages we need to add a new FGP flag to tell FGP functions to return the pointer to the page. It may happen very rarely, but once it happens the consequence (data corruption) could be very bad and it is very hard to debug. It seems this problem had been slightly discussed before, but seems no action was taken at that time. [2] As the aforementioned investigation, it needs huge amount of work to solve the potential data loss for all filesystems. But it is much easier for in-memory filesystems and such filesystems actually suffer more than others since even the data blocks are gone due to truncating. So this patchset starts from shmem/tmpfs by taking option #1. TODO: * The unpoison has been broken since commit 0ed950d1f281 ("mm,hwpoison: make get_hwpoison_page() call get_any_page()"), and this patch series make refcount check for unpoisoning shmem page fail. * Expand to other filesystems. But I haven't heard feedback from filesystem developers yet. Patch breakdown: Patch #1: cleanup, depended by patch #2 Patch #2: fix THP with hwpoisoned subpage(s) PMD map bug Patch #3: coding style cleanup Patch #4: refactor and preparation. Patch #5: keep the poisoned page in page cache and handle such case for all the paths. Patch #6: the previous patches unblock page cache THP split, so this patch add page cache THP split support. This patch (of 4): A minor cleanup to the indent. Link: https://lkml.kernel.org/r/20211020210755.23964-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20211020210755.23964-4-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index d05f8013a4f0..6f2c4bd6a571 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3203,12 +3203,12 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) } if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { - vm_fault_t ret = do_set_pmd(vmf, page); - if (!ret) { - /* The page is mapped successfully, reference consumed. */ - unlock_page(page); - return true; - } + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. */ + unlock_page(page); + return true; + } } if (pmd_none(*vmf->pmd)) From dd0f230a0a80ff396c7ce587f16429f2a8131344 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:07 -0700 Subject: [PATCH 374/512] mm: hwpoison: refactor refcount check handling Memory failure will report failure if the page still has extra pinned refcount other than from hwpoison after the handler is done. Actually the check is not necessary for all handlers, so move the check into specific handlers. This would make the following keeping shmem page in page cache patch easier. There may be expected extra pin for some cases, for example, when the page is dirty and in swapcache. 
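For illustration only, not part of the patch: a user-space model of the refcount rule described above. The page count is passed as a plain integer here; in the kernel it comes from page_count(p), and the one-off adjustments follow the has_extra_refcount() helper introduced in the diff below.

  #include <stdbool.h>
  #include <stdio.h>

  /*
   * Model of the check: hwpoison itself holds one reference, and some
   * handlers legitimately leave one more pin behind.
   */
  static bool has_extra_refcount(int page_count, bool extra_pins)
  {
          int count = page_count - 1;          /* ignore the hwpoison reference */

          if (extra_pins)
                  count -= 1;                  /* one expected extra pin */

          return count > 0;                    /* anything left is unexpected */
  }

  int main(void)
  {
          printf("%d\n", has_extra_refcount(2, false));   /* 1: unexpected user */
          printf("%d\n", has_extra_refcount(2, true));    /* 0: the pin was expected */
          return 0;
  }
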
Link: https://lkml.kernel.org/r/20211020210755.23964-5-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Naoya Horiguchi Suggested-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 93 +++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 54a5d28ea601..bbd433e96790 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -807,12 +807,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn, return ret; } +struct page_state { + unsigned long mask; + unsigned long res; + enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ + int (*action)(struct page_state *ps, struct page *p); +}; + +/* + * Return true if page is still referenced by others, otherwise return + * false. + * + * The extra_pins is true when one extra refcount is expected. + */ +static bool has_extra_refcount(struct page_state *ps, struct page *p, + bool extra_pins) +{ + int count = page_count(p) - 1; + + if (extra_pins) + count -= 1; + + if (count > 0) { + pr_err("Memory failure: %#lx: %s still referenced by %d users\n", + page_to_pfn(p), action_page_types[ps->type], count); + return true; + } + + return false; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we * could be more sophisticated. */ -static int me_kernel(struct page *p, unsigned long pfn) +static int me_kernel(struct page_state *ps, struct page *p) { unlock_page(p); return MF_IGNORED; @@ -821,9 +853,9 @@ static int me_kernel(struct page *p, unsigned long pfn) /* * Page in unknown state. Do nothing. */ -static int me_unknown(struct page *p, unsigned long pfn) +static int me_unknown(struct page_state *ps, struct page *p) { - pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p)); unlock_page(p); return MF_FAILED; } @@ -831,7 +863,7 @@ static int me_unknown(struct page *p, unsigned long pfn) /* * Clean (or cleaned) page cache page. */ -static int me_pagecache_clean(struct page *p, unsigned long pfn) +static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; @@ -868,9 +900,13 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_rwsem or not for this? Right now we don't. */ - ret = truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, page_to_pfn(p), mapping); out: unlock_page(p); + + if (has_extra_refcount(ps, p, false)) + ret = MF_FAILED; + return ret; } @@ -879,7 +915,7 @@ out: * Issues: when the error hit a hole page the error is not properly * propagated. */ -static int me_pagecache_dirty(struct page *p, unsigned long pfn) +static int me_pagecache_dirty(struct page_state *ps, struct page *p) { struct address_space *mapping = page_mapping(p); @@ -923,7 +959,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) mapping_set_error(mapping, -EIO); } - return me_pagecache_clean(p, pfn); + return me_pagecache_clean(ps, p); } /* @@ -945,9 +981,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * Clean swap cache pages can be directly isolated. A later page fault will * bring in the known good data from disk. 
*/ -static int me_swapcache_dirty(struct page *p, unsigned long pfn) +static int me_swapcache_dirty(struct page_state *ps, struct page *p) { int ret; + bool extra_pins = false; ClearPageDirty(p); /* Trigger EIO in shmem: */ @@ -955,10 +992,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; unlock_page(p); + + if (ret == MF_DELAYED) + extra_pins = true; + + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + return ret; } -static int me_swapcache_clean(struct page *p, unsigned long pfn) +static int me_swapcache_clean(struct page_state *ps, struct page *p) { int ret; @@ -966,6 +1010,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; unlock_page(p); + + if (has_extra_refcount(ps, p, false)) + ret = MF_FAILED; + return ret; } @@ -975,7 +1023,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. */ -static int me_huge_page(struct page *p, unsigned long pfn) +static int me_huge_page(struct page_state *ps, struct page *p) { int res; struct page *hpage = compound_head(p); @@ -986,7 +1034,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { - res = truncate_error_page(hpage, pfn, mapping); + res = truncate_error_page(hpage, page_to_pfn(p), mapping); unlock_page(hpage); } else { res = MF_FAILED; @@ -1004,6 +1052,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) } } + if (has_extra_refcount(ps, p, false)) + res = MF_FAILED; + return res; } @@ -1029,14 +1080,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) -static struct page_state { - unsigned long mask; - unsigned long res; - enum mf_action_page_type type; - - /* Callback ->action() has to unlock the relevant page inside it. */ - int (*action)(struct page *p, unsigned long pfn); -} error_states[] = { +static struct page_state error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: @@ -1096,19 +1140,10 @@ static int page_action(struct page_state *ps, struct page *p, unsigned long pfn) { int result; - int count; /* page p should be unlocked after returning from ps->action(). */ - result = ps->action(p, pfn); + result = ps->action(ps, p); - count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == MF_DELAYED) - count--; - if (count > 0) { - pr_err("Memory failure: %#lx: %s still referenced by %d users\n", - pfn, action_page_types[ps->type], count); - result = MF_FAILED; - } action_result(pfn, ps->type, result); /* Could do more checks here if page looks ok */ From b9d02f1bdd98f38e6e5ecacc9786a8f58f3f8b2c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:10 -0700 Subject: [PATCH 375/512] mm: shmem: don't truncate page if memory failure happens The current behavior of memory failure is to truncate the page cache regardless of dirty or clean. If the page is dirty the later access will get the obsolete data from disk without any notification to the users. This may cause silent data loss. It is even worse for shmem since shmem is in-memory filesystem, truncating page cache means discarding data blocks. The later read would return all zero. 
The right approach is to keep the corrupted page in page cache, any later access would return error for syscalls or SIGBUS for page fault, until the file is truncated, hole punched or removed. The regular storage backed filesystems would be more complicated so this patch is focused on shmem. This also unblock the support for soft offlining shmem THP. [arnd@arndb.de: fix uninitialized variable use in me_pagecache_clean()] Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 +++++++++++--- mm/shmem.c | 38 +++++++++++++++++++++++++++++++++++--- mm/userfaultfd.c | 5 +++++ 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bbd433e96790..952a41d568c4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -867,6 +868,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -895,18 +897,24 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) goto out; } + /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. 
*/ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index df2f0288b9ca..6a7bb46019e2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2454,6 +2454,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2464,7 +2465,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (*pagep && PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + ret = -EIO; + } + + return ret; } static int @@ -2551,6 +2560,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -3112,7 +3127,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3120,6 +3136,11 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (page && PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3770,6 +3791,13 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@ -3780,7 +3808,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -4191,6 +4219,10 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, page = ERR_PTR(error); else unlock_page(page); + + if (PageHWPoison(page)) + page = ERR_PTR(-EIO); + return page; #else /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index caf6dfff2a60..77fce86371a9 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) From 4966455d9100236fd6dd72b0cd00818435fdb25d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:41:14 -0700 Subject: [PATCH 376/512] mm: hwpoison: handle non-anonymous THP correctly Currently hwpoison doesn't handle non-anonymous THP, but since v4.8 THP support for tmpfs and read-only file cache has been added. They could be offlined by split THP, just like anonymous THP. 
Link: https://lkml.kernel.org/r/20211020210755.23964-7-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 952a41d568c4..f38b7b42a508 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1444,14 +1444,11 @@ static int identify_page_state(unsigned long pfn, struct page *p, static int try_to_split_thp_page(struct page *page, const char *msg) { lock_page(page); - if (!PageAnon(page) || unlikely(split_huge_page(page))) { + if (unlikely(split_huge_page(page))) { unsigned long pfn = page_to_pfn(page); unlock_page(page); - if (!PageAnon(page)) - pr_info("%s: %#lx: non anonymous thp\n", msg, pfn); - else - pr_info("%s: %#lx: thp split failed\n", msg, pfn); + pr_info("%s: %#lx: thp split failed\n", msg, pfn); put_page(page); return -EBUSY; } From 73c54763482b841d9c14bad87eec98f80f700e0b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:41:17 -0700 Subject: [PATCH 377/512] mm/hugetlb: drop __unmap_hugepage_range definition from hugetlb.h Remove __unmap_hugepage_range() from the header file, because it is only used in hugetlb.c. Link: https://lkml.kernel.org/r/20210917165108.9341-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 10 ---------- mm/hugetlb.c | 6 +++--- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1faebe1cd0ed..3cbf60464398 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -143,9 +143,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page); -void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct page *ref_page); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo(void); @@ -385,13 +382,6 @@ static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, BUG(); } -static inline void __unmap_hugepage_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) -{ - BUG(); -} - static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 95dc7b83381f..d394d9545c4e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4426,9 +4426,9 @@ again: return ret; } -void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct page *ref_page) +static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) { struct mm_struct *mm = vma->vm_mm; unsigned long address; From 79dfc695525f60c8410015362b8aa4eb5c57210c Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:20 -0700 Subject: [PATCH 378/512] hugetlb: add demote hugetlb page sysfs interfaces Patch 
series "hugetlb: add demote/split page functionality", v4. The concurrent use of multiple hugetlb page sizes on a single system is becoming more common. One of the reasons is better TLB support for gigantic page sizes on x86 hardware. In addition, hugetlb pages are being used to back VMs in hosting environments. When using hugetlb pages to back VMs, it is often desirable to preallocate hugetlb pools. This avoids the delay and uncertainty of allocating hugetlb pages at VM startup. In addition, preallocating huge pages minimizes the issue of memory fragmentation that increases the longer the system is up and running. In such environments, a combination of larger and smaller hugetlb pages are preallocated in anticipation of backing VMs of various sizes. Over time, the preallocated pool of smaller hugetlb pages may become depleted while larger hugetlb pages still remain. In such situations, it is desirable to convert larger hugetlb pages to smaller hugetlb pages. Converting larger to smaller hugetlb pages can be accomplished today by first freeing the larger page to the buddy allocator and then allocating the smaller pages. For example, to convert 50 GB pages on x86: gb_pages=`cat .../hugepages-1048576kB/nr_hugepages` m2_pages=`cat .../hugepages-2048kB/nr_hugepages` echo $(($gb_pages - 50)) > .../hugepages-1048576kB/nr_hugepages echo $(($m2_pages + 25600)) > .../hugepages-2048kB/nr_hugepages On an idle system this operation is fairly reliable and results are as expected. The number of 2MB pages is increased as expected and the time of the operation is a second or two. However, when there is activity on the system the following issues arise: 1) This process can take quite some time, especially if allocation of the smaller pages is not immediate and requires migration/compaction. 2) There is no guarantee that the total size of smaller pages allocated will match the size of the larger page which was freed. This is because the area freed by the larger page could quickly be fragmented. In a test environment with a load that continually fills the page cache with clean pages, results such as the following can be observed: Unexpected number of 2MB pages allocated: Expected 25600, have 19944 real 0m42.092s user 0m0.008s sys 0m41.467s To address these issues, introduce the concept of hugetlb page demotion. Demotion provides a means of 'in place' splitting of a hugetlb page to pages of a smaller size. This avoids freeing pages to buddy and then trying to allocate from buddy. Page demotion is controlled via sysfs files that reside in the per-hugetlb page size and per node directories. - demote_size Target page size for demotion, a smaller huge page size. File can be written to chose a smaller huge page size if multiple are available. - demote Writable number of hugetlb pages to be demoted To demote 50 GB huge pages, one would: cat .../hugepages-1048576kB/free_hugepages /* optional, verify free pages */ cat .../hugepages-1048576kB/demote_size /* optional, verify target size */ echo 50 > .../hugepages-1048576kB/demote Only hugetlb pages which are free at the time of the request can be demoted. Demotion does not add to the complexity of surplus pages and honors reserved huge pages. Therefore, when a value is written to the sysfs demote file, that value is only the maximum number of pages which will be demoted. It is possible fewer will actually be demoted. The recently introduced per-hstate mutex is used to synchronize demote operations with other operations that modify hugetlb pools. 
Real world use cases -------------------- The above scenario describes a real world use case where hugetlb pages are used to back VMs on x86. Both issues of long allocation times and not necessarily getting the expected number of smaller huge pages after a free and allocate cycle have been experienced. The occurrence of these issues is dependent on other activity within the host and can not be predicted. This patch (of 5): Two new sysfs files are added to demote hugtlb pages. These files are both per-hugetlb page size and per node. Files are: demote_size - The size in Kb that pages are demoted to. (read-write) demote - The number of huge pages to demote. (write-only) By default, demote_size is the next smallest huge page size. Valid huge page sizes less than huge page size may be written to this file. When huge pages are demoted, they are demoted to this size. Writing a value to demote will result in an attempt to demote that number of hugetlb pages to an appropriate number of demote_size pages. NOTE: Demote interfaces are only provided for huge page sizes if there is a smaller target demote huge page size. For example, on x86 1GB huge pages will have demote interfaces. 2MB huge pages will not have demote interfaces. This patch does not provide full demote functionality. It only provides the sysfs interfaces. It also provides documentation for the new interfaces. [mike.kravetz@oracle.com: n_mask initialization does not need to be protected by the mutex] Link: https://lkml.kernel.org/r/0530e4ef-2492-5186-f919-5db68edea654@oracle.com Link: https://lkml.kernel.org/r/20211007181918.136982-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Michal Hocko Cc: Zi Yan Cc: Muchun Song Cc: Naoya Horiguchi Cc: David Rientjes Cc: "Aneesh Kumar K . V" Cc: Nghia Le Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/hugetlbpage.rst | 30 +++- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 155 ++++++++++++++++++- 3 files changed, 183 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 8abaeb144e44..bb90de3885d1 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -234,8 +234,12 @@ will exist, of the form:: hugepages-${size}kB -Inside each of these directories, the same set of files will exist:: +Inside each of these directories, the set of files contained in ``/proc`` +will exist. In addition, two additional interfaces for demoting huge +pages may exist:: + demote + demote_size nr_hugepages nr_hugepages_mempolicy nr_overcommit_hugepages @@ -243,7 +247,29 @@ Inside each of these directories, the same set of files will exist:: resv_hugepages surplus_hugepages -which function as described above for the default huge page-sized case. +The demote interfaces provide the ability to split a huge page into +smaller huge pages. For example, the x86 architecture supports both +1GB and 2MB huge pages sizes. A 1GB huge page can be split into 512 +2MB huge pages. Demote interfaces are not available for the smallest +huge page size. The demote interfaces are: + +demote_size + is the size of demoted pages. When a page is demoted a corresponding + number of huge pages of demote_size will be created. By default, + demote_size is set to the next smaller huge page size. If there are + multiple smaller huge page sizes, demote_size can be set to any of + these smaller sizes. 
Only huge page sizes less than the current huge + pages size are allowed. + +demote + is used to demote a number of huge pages. A user with root privileges + can write to this file. It may not be possible to demote the + requested number of huge pages. To determine how many pages were + actually demoted, compare the value of nr_hugepages before and after + writing to the demote interface. demote is a write only interface. + +The interfaces which are the same as in ``/proc`` (all except demote and +demote_size) function as described above for the default huge page-sized case. .. _mem_policy_and_hp_alloc: diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3cbf60464398..2bddd6c38204 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -586,6 +586,7 @@ struct hstate { int next_nid_to_alloc; int next_nid_to_free; unsigned int order; + unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d394d9545c4e..1a18ff2f0001 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2986,7 +2986,7 @@ free: static void __init hugetlb_init_hstates(void) { - struct hstate *h; + struct hstate *h, *h2; for_each_hstate(h) { if (minimum_order > huge_page_order(h)) @@ -2995,6 +2995,22 @@ static void __init hugetlb_init_hstates(void) /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); + + /* + * Set demote order for each hstate. Note that + * h->demote_order is initially 0. + * - We can not demote gigantic pages if runtime freeing + * is not supported, so skip this. + */ + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + continue; + for_each_hstate(h2) { + if (h2 == h) + continue; + if (h2->order < h->order && + h2->order > h->demote_order) + h->demote_order = h2->order; + } } VM_BUG_ON(minimum_order == UINT_MAX); } @@ -3235,9 +3251,31 @@ out: return 0; } +static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) + __must_hold(&hugetlb_lock) +{ + int rc = 0; + + lockdep_assert_held(&hugetlb_lock); + + /* We should never get here if no demote order */ + if (!h->demote_order) { + pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); + return -EINVAL; /* internal error */ + } + + /* + * TODO - demote fucntionality will be added in subsequent patch + */ + return rc; +} + #define HSTATE_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define HSTATE_ATTR_WO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_WO(_name) + #define HSTATE_ATTR(_name) \ static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) @@ -3433,6 +3471,105 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj, } HSTATE_ATTR_RO(surplus_hugepages); +static ssize_t demote_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_demote; + unsigned long nr_available; + nodemask_t nodes_allowed, *n_mask; + struct hstate *h; + int err = 0; + int nid; + + err = kstrtoul(buf, 10, &nr_demote); + if (err) + return err; + h = kobj_to_hstate(kobj, &nid); + + if (nid != NUMA_NO_NODE) { + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } else { + n_mask = &node_states[N_MEMORY]; + } + + /* Synchronize with other sysfs operations modifying huge pages */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); + + while (nr_demote) { + /* + * Check for available 
pages to demote each time thorough the + * loop as demote_pool_huge_page will drop hugetlb_lock. + * + * NOTE: demote_pool_huge_page does not yet drop hugetlb_lock + * but will when full demote functionality is added in a later + * patch. + */ + if (nid != NUMA_NO_NODE) + nr_available = h->free_huge_pages_node[nid]; + else + nr_available = h->free_huge_pages; + nr_available -= h->resv_huge_pages; + if (!nr_available) + break; + + err = demote_pool_huge_page(h, n_mask); + if (err) + break; + + nr_demote--; + } + + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + + if (err) + return err; + return len; +} +HSTATE_ATTR_WO(demote); + +static ssize_t demote_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int nid; + struct hstate *h = kobj_to_hstate(kobj, &nid); + unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; + + return sysfs_emit(buf, "%lukB\n", demote_size); +} + +static ssize_t demote_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hstate *h, *demote_hstate; + unsigned long demote_size; + unsigned int demote_order; + int nid; + + demote_size = (unsigned long)memparse(buf, NULL); + + demote_hstate = size_to_hstate(demote_size); + if (!demote_hstate) + return -EINVAL; + demote_order = demote_hstate->order; + + /* demote order must be smaller than hstate order */ + h = kobj_to_hstate(kobj, &nid); + if (demote_order >= h->order) + return -EINVAL; + + /* resize_lock synchronizes access to demote size and writes */ + mutex_lock(&h->resize_lock); + h->demote_order = demote_order; + mutex_unlock(&h->resize_lock); + + return count; +} +HSTATE_ATTR(demote_size); + static struct attribute *hstate_attrs[] = { &nr_hugepages_attr.attr, &nr_overcommit_hugepages_attr.attr, @@ -3449,6 +3586,16 @@ static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; +static struct attribute *hstate_demote_attrs[] = { + &demote_size_attr.attr, + &demote_attr.attr, + NULL, +}; + +static const struct attribute_group hstate_demote_attr_group = { + .attrs = hstate_demote_attrs, +}; + static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, const struct attribute_group *hstate_attr_group) @@ -3466,6 +3613,12 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, hstate_kobjs[hi] = NULL; } + if (h->demote_order) { + if (sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group)) + pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + } + return retval; } From 9871e2ded6c1ff61a59988d7a0e975f012105d52 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:23 -0700 Subject: [PATCH 379/512] mm/cma: add cma_pages_valid to determine if pages are in CMA Add new interface cma_pages_valid() which indicates if the specified pages are part of a CMA region. This interface will be used in a subsequent patch by hugetlb code. In order to keep the same amount of DEBUG information, a pr_debug() call was added to cma_pages_valid(). In the case where the page passed to cma_release is not in cma region, the debug message will be printed from cma_pages_valid as opposed to cma_release. Link: https://lkml.kernel.org/r/20211007181918.136982-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . 
V" Cc: David Rientjes Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nghia Le Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 1 + mm/cma.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 53fd8c3cdbd0..bd801023504b 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -46,6 +46,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); +extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/mm/cma.c b/mm/cma.c index 995e15480937..11152c3fb23c 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -524,6 +524,25 @@ out: return page; } +bool cma_pages_valid(struct cma *cma, const struct page *pages, + unsigned long count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) { + pr_debug("%s(page %p, count %lu)\n", __func__, + (void *)pages, count); + return false; + } + + return true; +} + /** * cma_release() - release allocated pages * @cma: Contiguous memory region for which the allocation is performed. @@ -539,16 +558,13 @@ bool cma_release(struct cma *cma, const struct page *pages, { unsigned long pfn; - if (!cma || !pages) + if (!cma_pages_valid(cma, pages, count)) return false; pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); free_contig_range(pfn, count); From a01f43901cfb93b7b63f2739774b80469b21128d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:27 -0700 Subject: [PATCH 380/512] hugetlb: be sure to free demoted CMA pages to CMA When huge page demotion is fully implemented, gigantic pages can be demoted to a smaller huge page size. For example, on x86 a 1G page can be demoted to 512 2M pages. However, gigantic pages can potentially be allocated from CMA. If a gigantic page which was allocated from CMA is demoted, the corresponding demoted pages needs to be returned to CMA. Use the new interface cma_pages_valid() to determine if a non-gigantic hugetlb page should be freed to CMA. Also, clear mapping field of these pages as expected by cma_release. This also requires a change to CMA region creation for gigantic pages. CMA uses a per-region bit map to track allocations. When setting up the region, you specify how many pages each bit represents. Currently, only gigantic pages are allocated/freed from CMA so the region is set up such that one bit represents a gigantic page size allocation. With demote, a gigantic page (allocation) could be split into smaller size pages. And, these smaller size pages will be freed to CMA. So, since the per-region bit map needs to be set up to represent the smallest allocation/free size, it now needs to be set to the smallest huge page size which can be freed to CMA. Unfortunately, we set up the CMA region for huge pages before we set up huge pages sizes (hstates). 
So, technically we do not know the smallest huge page size as this can change via command line options and architecture specific code. Therefore, at region setup time we use HUGETLB_PAGE_ORDER as the smallest possible huge page size that can be given back to CMA. It is possible that this value is sub-optimal for some architectures/config options. If needed, this can be addressed in follow on work. Link: https://lkml.kernel.org/r/20211007181918.136982-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: "Aneesh Kumar K . V" Cc: David Hildenbrand Cc: David Rientjes Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nghia Le Cc: Oscar Salvador Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a18ff2f0001..6662a99e6dab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -50,6 +50,16 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; +static bool hugetlb_cma_page(struct page *page, unsigned int order) +{ + return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + 1 << order); +} +#else +static bool hugetlb_cma_page(struct page *page, unsigned int order) +{ + return false; +} #endif static unsigned long hugetlb_cma_size __initdata; @@ -1272,6 +1282,7 @@ static void destroy_compound_gigantic_page(struct page *page, atomic_set(compound_pincount_ptr(page), 0); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + p->mapping = NULL; clear_compound_head(p); set_page_refcounted(p); } @@ -1476,7 +1487,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - if (hstate_is_gigantic(h)) { + + /* + * Non-gigantic pages demoted from CMA allocated gigantic pages + * need to be given back to CMA in free_gigantic_page. + */ + if (hstate_is_gigantic(h) || + hugetlb_cma_page(page, huge_page_order(h))) { destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { @@ -3001,9 +3018,13 @@ static void __init hugetlb_init_hstates(void) * h->demote_order is initially 0. * - We can not demote gigantic pages if runtime freeing * is not supported, so skip this. + * - If CMA allocation is possible, we can not demote + * HUGETLB_PAGE_ORDER or smaller size pages. */ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) continue; + if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) + continue; for_each_hstate(h2) { if (h2 == h) continue; @@ -3555,6 +3576,8 @@ static ssize_t demote_size_store(struct kobject *kobj, if (!demote_hstate) return -EINVAL; demote_order = demote_hstate->order; + if (demote_order < HUGETLB_PAGE_ORDER) + return -EINVAL; /* demote order must be smaller than hstate order */ h = kobj_to_hstate(kobj, &nid); @@ -6543,6 +6566,7 @@ void __init hugetlb_cma_reserve(int order) if (hugetlb_cma_size < (PAGE_SIZE << order)) { pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size = 0; return; } @@ -6563,7 +6587,13 @@ void __init hugetlb_cma_reserve(int order) size = round_up(size, PAGE_SIZE << order); snprintf(name, sizeof(name), "hugetlb%d", nid); - res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, + /* + * Note that 'order per bit' is based on smallest size that + * may be returned to CMA allocator in the case of + * huge page demotion. 
+ */ + res = cma_declare_contiguous_nid(0, size, 0, + PAGE_SIZE << HUGETLB_PAGE_ORDER, 0, false, name, &hugetlb_cma[nid], nid); if (res) { @@ -6579,6 +6609,13 @@ void __init hugetlb_cma_reserve(int order) if (reserved >= hugetlb_cma_size) break; } + + if (!reserved) + /* + * hugetlb_cma_size is used to determine if allocations from + * cma are possible. Set to zero if no cma regions are set up. + */ + hugetlb_cma_size = 0; } void __init hugetlb_cma_check(void) From 34d9e35b13d5607d6d5105b263adaaba90bdafeb Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:30 -0700 Subject: [PATCH 381/512] hugetlb: add demote bool to gigantic page routines The routines remove_hugetlb_page and destroy_compound_gigantic_page will remove a gigantic page and make the set of base pages ready to be returned to a lower level allocator. In the process of doing this, they make all base pages reference counted. The routine prep_compound_gigantic_page creates a gigantic page from a set of base pages. It assumes that all these base pages are reference counted. During demotion, a gigantic page will be split into huge pages of a smaller size. This logically involves use of the routines, remove_hugetlb_page, and destroy_compound_gigantic_page followed by prep_compound*_page for each smaller huge page. When pages are reference counted (ref count >= 0), additional speculative ref counts could be taken as described in previous commits [1] and [2]. This could result in errors while demoting a huge page. Quite a bit of code would need to be created to handle all possible issues. Instead of dealing with the possibility of speculative ref counts, avoid the possibility by keeping ref counts at zero during the demote process. Add a boolean 'demote' to the routines remove_hugetlb_page, destroy_compound_gigantic_page and prep_compound_gigantic_page. If the boolean is set, the remove and destroy routines will not reference count pages and the prep routine will not expect reference counted pages. '*_for_demote' wrappers of the routines will be added in a subsequent patch where this functionality is used. [1] https://lore.kernel.org/linux-mm/20210622021423.154662-3-mike.kravetz@oracle.com/ [2] https://lore.kernel.org/linux-mm/20210809184832.18342-3-mike.kravetz@oracle.com/ Link: https://lkml.kernel.org/r/20211007181918.136982-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . 
V" Cc: David Hildenbrand Cc: David Rientjes Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nghia Le Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 54 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6662a99e6dab..c00dd2d1e5f4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1271,8 +1271,8 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) nr_nodes--) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_page(struct page *page, - unsigned int order) +static void __destroy_compound_gigantic_page(struct page *page, + unsigned int order, bool demote) { int i; int nr_pages = 1 << order; @@ -1284,7 +1284,8 @@ static void destroy_compound_gigantic_page(struct page *page, for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { p->mapping = NULL; clear_compound_head(p); - set_page_refcounted(p); + if (!demote) + set_page_refcounted(p); } set_compound_order(page, 0); @@ -1292,6 +1293,12 @@ static void destroy_compound_gigantic_page(struct page *page, __ClearPageHead(page); } +static void destroy_compound_gigantic_page(struct page *page, + unsigned int order) +{ + __destroy_compound_gigantic_page(page, order, false); +} + static void free_gigantic_page(struct page *page, unsigned int order) { /* @@ -1364,12 +1371,15 @@ static inline void destroy_compound_gigantic_page(struct page *page, /* * Remove hugetlb page from lists, and update dtor so that page appears - * as just a compound page. A reference is held on the page. + * as just a compound page. + * + * A reference is held on the page, except in the case of demote. * * Must be called with hugetlb lock held. */ -static void remove_hugetlb_page(struct hstate *h, struct page *page, - bool adjust_surplus) +static void __remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus, + bool demote) { int nid = page_to_nid(page); @@ -1407,8 +1417,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, * * This handles the case where more than one ref is held when and * after update_and_free_page is called. + * + * In the case of demote we do not ref count the page as it will soon + * be turned into a page of smaller size. */ - set_page_refcounted(page); + if (!demote) + set_page_refcounted(page); if (hstate_is_gigantic(h)) set_compound_page_dtor(page, NULL_COMPOUND_DTOR); else @@ -1418,6 +1432,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->nr_huge_pages_node[nid]--; } +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + __remove_hugetlb_page(h, page, adjust_surplus, false); +} + static void add_hugetlb_page(struct hstate *h, struct page *page, bool adjust_surplus) { @@ -1681,7 +1701,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_unlock_irq(&hugetlb_lock); } -static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, + bool demote) { int i, j; int nr_pages = 1 << order; @@ -1719,10 +1740,16 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order) * the set of pages can not be converted to a gigantic page. * The caller who allocated the pages should then discard the * pages using the appropriate free interface. + * + * In the case of demote, the ref count will be zero. 
*/ - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; + if (!demote) { + if (!page_ref_freeze(p, 1)) { + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + goto out_error; + } + } else { + VM_BUG_ON_PAGE(page_count(p), p); } set_page_count(p, 0); set_compound_head(p, page); @@ -1747,6 +1774,11 @@ out_error: return false; } +static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +{ + return __prep_compound_gigantic_page(page, order, false); +} + /* * PageHuge() only returns true for hugetlbfs pages, but not for normal or * transparent huge pages. See the PageTransHuge() documentation for more From 8531fc6f52f5fc201f43d8c36c2606e25748b1c4 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:33 -0700 Subject: [PATCH 382/512] hugetlb: add hugetlb demote page support Demote page functionality will split a huge page into a number of huge pages of a smaller size. For example, on x86 a 1GB huge page can be demoted into 512 2M huge pages. Demotion is done 'in place' by simply splitting the huge page. Added '*_for_demote' wrappers for remove_hugetlb_page, destroy_compound_hugetlb_page and prep_compound_gigantic_page for use by demote code. [mike.kravetz@oracle.com: v4] Link: https://lkml.kernel.org/r/6ca29b8e-527c-d6ec-900e-e6a43e4f8b73@oracle.com Link: https://lkml.kernel.org/r/20211007181918.136982-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: David Hildenbrand Cc: David Rientjes Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nghia Le Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c00dd2d1e5f4..1835d548fecc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1270,7 +1270,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +/* used to demote non-gigantic_huge pages as well */ static void __destroy_compound_gigantic_page(struct page *page, unsigned int order, bool demote) { @@ -1293,6 +1293,13 @@ static void __destroy_compound_gigantic_page(struct page *page, __ClearPageHead(page); } +static void destroy_compound_hugetlb_page_for_demote(struct page *page, + unsigned int order) +{ + __destroy_compound_gigantic_page(page, order, true); +} + +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1438,6 +1445,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, __remove_hugetlb_page(h, page, adjust_surplus, false); } +static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + __remove_hugetlb_page(h, page, adjust_surplus, true); +} + static void add_hugetlb_page(struct hstate *h, struct page *page, bool adjust_surplus) { @@ -1779,6 +1792,12 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order) return __prep_compound_gigantic_page(page, order, false); } +static bool prep_compound_gigantic_page_for_demote(struct page *page, + unsigned int order) +{ + return __prep_compound_gigantic_page(page, order, true); +} + /* * PageHuge() only returns true for hugetlbfs pages, but not for normal or * 
transparent huge pages. See the PageTransHuge() documentation for more @@ -3304,9 +3323,72 @@ out: return 0; } +static int demote_free_huge_page(struct hstate *h, struct page *page) +{ + int i, nid = page_to_nid(page); + struct hstate *target_hstate; + int rc = 0; + + target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); + + remove_hugetlb_page_for_demote(h, page, false); + spin_unlock_irq(&hugetlb_lock); + + rc = alloc_huge_page_vmemmap(h, page); + if (rc) { + /* Allocation of vmemmmap failed, we can not demote page */ + spin_lock_irq(&hugetlb_lock); + set_page_refcounted(page); + add_hugetlb_page(h, page, false); + return rc; + } + + /* + * Use destroy_compound_hugetlb_page_for_demote for all huge page + * sizes as it will not ref count pages. + */ + destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + + /* + * Taking target hstate mutex synchronizes with set_max_huge_pages. + * Without the mutex, pages added to target hstate could be marked + * as surplus. + * + * Note that we already hold h->resize_lock. To prevent deadlock, + * use the convention of always taking larger size hstate mutex first. + */ + mutex_lock(&target_hstate->resize_lock); + for (i = 0; i < pages_per_huge_page(h); + i += pages_per_huge_page(target_hstate)) { + if (hstate_is_gigantic(target_hstate)) + prep_compound_gigantic_page_for_demote(page + i, + target_hstate->order); + else + prep_compound_page(page + i, target_hstate->order); + set_page_private(page + i, 0); + set_page_refcounted(page + i); + prep_new_huge_page(target_hstate, page + i, nid); + put_page(page + i); + } + mutex_unlock(&target_hstate->resize_lock); + + spin_lock_irq(&hugetlb_lock); + + /* + * Not absolutely necessary, but for consistency update max_huge_pages + * based on pool changes for the demoted page. + */ + h->max_huge_pages--; + target_hstate->max_huge_pages += pages_per_huge_page(h); + + return rc; +} + static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { + int nr_nodes, node; + struct page *page; int rc = 0; lockdep_assert_held(&hugetlb_lock); @@ -3317,9 +3399,15 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) return -EINVAL; /* internal error */ } - /* - * TODO - demote fucntionality will be added in subsequent patch - */ + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (!list_empty(&h->hugepage_freelists[node])) { + page = list_entry(h->hugepage_freelists[node].next, + struct page, lru); + rc = demote_free_huge_page(h, page); + break; + } + } + return rc; } @@ -3554,10 +3642,6 @@ static ssize_t demote_store(struct kobject *kobj, /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. - * - * NOTE: demote_pool_huge_page does not yet drop hugetlb_lock - * but will when full demote functionality is added in a later - * patch. */ if (nid != NUMA_NO_NODE) nr_available = h->free_huge_pages_node[nid]; From bd3400ea173fb611cdf2030d03620185ff6c0b0e Mon Sep 17 00:00:00 2001 From: Liangcai Fan Date: Fri, 5 Nov 2021 13:41:36 -0700 Subject: [PATCH 383/512] mm: khugepaged: recalculate min_free_kbytes after stopping khugepaged When initializing transparent huge pages, min_free_kbytes would be calculated according to what khugepaged expected. So when transparent huge pages get disabled, min_free_kbytes should be recalculated instead of the higher value set by khugepaged. 
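For illustration, the effect of this change can be observed from userspace roughly as follows. This sketch assumes the standard transparent_hugepage sysfs knob, that khugepaged was running, and that min_free_kbytes has not been set by the administrator (a user-supplied value is preserved either way):

  cat /proc/sys/vm/min_free_kbytes        # raised value recommended by khugepaged
  echo never > /sys/kernel/mm/transparent_hugepage/enabled   # stops khugepaged
  cat /proc/sys/vm/min_free_kbytes        # with this patch, falls back to the
                                          # default calculated from lowmem size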
Link: https://lkml.kernel.org/r/1633937809-16558-1-git-send-email-liangcaifan19@gmail.com Signed-off-by: Liangcai Fan Signed-off-by: Chunyan Zhang Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/khugepaged.c | 10 ++++++++-- mm/page_alloc.c | 7 ++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e29deb1e0d4..7e37c726b9db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2453,6 +2453,7 @@ extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); +extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8a8b3aa92937..629961966854 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2299,6 +2299,11 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; + if (!khugepaged_enabled()) { + calculate_min_free_kbytes(); + goto update_wmarks; + } + for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of @@ -2334,6 +2339,8 @@ static void set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } + +update_wmarks: setup_per_zone_wmarks(); } @@ -2355,12 +2362,11 @@ int start_stop_khugepaged(void) if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); - - set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } + set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a7035467bf6d..09a0f1c5d5d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8469,7 +8469,7 @@ void setup_per_zone_wmarks(void) * 8192MB: 11584k * 16384MB: 16384k */ -int __meminit init_per_zone_wmark_min(void) +void calculate_min_free_kbytes(void) { unsigned long lowmem_kbytes; int new_min_free_kbytes; @@ -8483,6 +8483,11 @@ int __meminit init_per_zone_wmark_min(void) pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); +} + +int __meminit init_per_zone_wmark_min(void) +{ + calculate_min_free_kbytes(); setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); From 550a7d60bd5e35a56942dba6d8a26752beb26c9f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 5 Nov 2021 13:41:40 -0700 Subject: [PATCH 384/512] mm, hugepages: add mremap() support for hugepage backed vma Support mremap() for hugepage backed vma segment by simply repositioning page table entries. The page table entries are repositioned to the new virtual address on mremap(). Hugetlb mremap() support is of course generic; my motivating use case is a library (hugepage_text), which reloads the ELF text of executables in hugepages. This significantly increases the execution performance of said executables. Restrict the mremap operation on hugepages to up to the size of the original mapping as the underlying hugetlb reservation is not yet capable of handling remapping to a larger size. During the mremap() operation we detect pmd_share'd mappings and we unshare those during the mremap(). On access and fault the sharing is established again. 
Link: https://lkml.kernel.org/r/20211013195825.3058275-1-almasrymina@google.com Signed-off-by: Mina Almasry Reviewed-by: Mike Kravetz Cc: Ken Chen Cc: Chris Kennelly Cc: Michal Hocko Cc: Vlastimil Babka Cc: Kirill Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 19 +++++++ mm/hugetlb.c | 111 +++++++++++++++++++++++++++++++++++++--- mm/mremap.c | 36 +++++++++++-- 3 files changed, 157 insertions(+), 9 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2bddd6c38204..c0a20781b28e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, loff_t *); @@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, unsigned long new_addr, + unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, @@ -215,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } +static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) +{ +} + static inline unsigned long hugetlb_total_pages(void) { return 0; @@ -262,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst, return 0; } +static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, + unsigned long new_addr, + unsigned long len) +{ + BUG(); + return 0; +} + static inline void hugetlb_report_meminfo(struct seq_file *m) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1835d548fecc..8028fb7677eb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1014,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) vma->vm_private_data = (void *)0; } +/* + * Reset and decrement one ref on hugepage private reservation. + * Called with mm->mmap_sem writer semaphore held. + * This function should be only used by move_vma() and operate on + * same sized vma. It should never come here with last ref on the + * reservation. + */ +void clear_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + /* + * Clear the old hugetlb private page reservation. + * It has already been transferred to new_vma. + * + * During a mremap() operation of a hugetlb vma we call move_vma() + * which copies vma into new_vma and unmaps vma. After the copy + * operation both new_vma and vma share a reference to the resv_map + * struct, and at that point vma is about to be unmapped. We don't + * want to return the reservation to the pool at unmap of vma because + * the reservation still lives on in new_vma, so simply decrement the + * ref here and remove the resv_map reference from this vma. 
+ */ + struct resv_map *reservations = vma_resv_map(vma); + + if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&reservations->refs, resv_map_release); + + reset_vma_resv_huge_pages(vma); +} + /* Returns true if the VMA has associated reserve pages */ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { @@ -4718,6 +4747,82 @@ again: return ret; } +static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pte_t *src_pte) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + pte_t *dst_pte, pte; + spinlock_t *src_ptl, *dst_ptl; + + dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h)); + dst_ptl = huge_pte_lock(h, mm, dst_pte); + src_ptl = huge_pte_lockptr(h, mm, src_pte); + + /* + * We don't have to worry about the ordering of src and dst ptlocks + * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock. + */ + if (src_ptl != dst_ptl) + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + set_huge_pte_at(mm, new_addr, dst_pte, pte); + + if (src_ptl != dst_ptl) + spin_unlock(src_ptl); + spin_unlock(dst_ptl); +} + +int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, unsigned long new_addr, + unsigned long len) +{ + struct hstate *h = hstate_vma(vma); + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + unsigned long old_end = old_addr + len; + unsigned long old_addr_copy; + pte_t *src_pte, *dst_pte; + struct mmu_notifier_range range; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + old_end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); + /* Prevent race with file truncation */ + i_mmap_lock_write(mapping); + for (; old_addr < old_end; old_addr += sz, new_addr += sz) { + src_pte = huge_pte_offset(mm, old_addr, sz); + if (!src_pte) + continue; + if (huge_pte_none(huge_ptep_get(src_pte))) + continue; + + /* old_addr arg to huge_pmd_unshare() is a pointer and so the + * arg may be modified. Pass a copy instead to preserve the + * value in old_addr. + */ + old_addr_copy = old_addr; + + if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) + continue; + + dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); + if (!dst_pte) + break; + + move_huge_pte(vma, old_addr, new_addr, src_pte); + } + i_mmap_unlock_write(mapping); + flush_tlb_range(vma, old_end - len, old_end); + mmu_notifier_invalidate_range_end(&range); + + return len + old_addr - old_end; +} + static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) @@ -6257,12 +6362,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * sharing is possible. For hugetlbfs, this prevents removal of any page * table entries associated with the address space. This is important as we * are setting up sharing based on existing page table entries (mappings). - * - * NOTE: This routine is only called from huge_pte_alloc. Some callers of - * huge_pte_alloc know that sharing is not possible and do not take - * i_mmap_rwsem as a performance optimization. This is handled by the - * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is - * only required for subsequent processing. 
*/ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) diff --git a/mm/mremap.c b/mm/mremap.c index c0b6c41b7b78..002eec83e91e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); + if (is_vm_hugetlb_page(vma)) + return move_hugetlb_page_tables(vma, new_vma, old_addr, + new_addr, len); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); @@ -646,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, mremap_userfaultfd_prep(new_vma, uf); } + if (is_vm_hugetlb_page(vma)) { + clear_vma_resv_huge_pages(vma); + } + /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; @@ -739,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return ERR_PTR(-EINVAL); - if (is_vm_hugetlb_page(vma)) - return ERR_PTR(-EINVAL); - /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) return ERR_PTR(-EFAULT); @@ -937,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (mmap_write_lock_killable(current->mm)) return -EINTR; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) { + ret = EFAULT; + goto out; + } + + if (is_vm_hugetlb_page(vma)) { + struct hstate *h __maybe_unused = hstate_vma(vma); + + old_len = ALIGN(old_len, huge_page_size(h)); + new_len = ALIGN(new_len, huge_page_size(h)); + + /* addrs must be huge page aligned */ + if (addr & ~huge_page_mask(h)) + goto out; + if (new_addr & ~huge_page_mask(h)) + goto out; + + /* + * Don't allow remap expansion, because the underlying hugetlb + * reservation is not yet capable to handle split reservation. 
+ */ + if (new_len > old_len) + goto out; + } if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { ret = mremap_to(addr, old_len, new_addr, new_len, From 12b613206474cea36671d6e3a7be7d1db7eb8741 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 5 Nov 2021 13:41:43 -0700 Subject: [PATCH 385/512] mm, hugepages: add hugetlb vma mremap() test [almasrymina@google.com: v8] Link: https://lkml.kernel.org/r/20211014200542.4126947-2-almasrymina@google.com [wanjiabing@vivo.com: remove duplicated include in hugepage-mremap] Link: https://lkml.kernel.org/r/20211021122944.8857-1-wanjiabing@vivo.com Link: https://lkml.kernel.org/r/20211013195825.3058275-2-almasrymina@google.com Signed-off-by: Mina Almasry Signed-off-by: Wan Jiabing Acked-by: Mike Kravetz Cc: Ken Chen Cc: Chris Kennelly Cc: Michal Hocko Cc: Vlastimil Babka Cc: Kirill Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/hugepage-mremap.c | 160 +++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 11 ++ 4 files changed, 173 insertions(+) create mode 100644 tools/testing/selftests/vm/hugepage-mremap.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b02eac613fdd..2e7e86e85282 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap +hugepage-mremap hugepage-shm khugepaged map_hugetlb diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d9605bd10f2d..1607322a112c 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -29,6 +29,7 @@ TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += khugepaged TEST_GEN_FILES += madv_populate diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c new file mode 100644 index 000000000000..68b8b8c4b2e6 --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mremap: + * + * Example of remapping huge page memory in a user application using the + * mremap system call. Code assumes a hugetlbfs filesystem is mounted + * at './huge'. The code will use 10MB worth of huge pages. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include /* Definition of O_* constants */ +#include /* Definition of SYS_* constants */ +#include +#include +#include + +#define LENGTH (1UL * 1024 * 1024 * 1024) + +#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) +#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +static void register_region_with_uffd(char *addr, size_t len) +{ + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + /* Create and enable userfaultfd object. 
*/ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd"); + exit(1); + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API"); + exit(1); + } + + /* Create a private anonymous mapping. The memory will be + * demand-zero paged--that is, not yet allocated. When we + * actually touch the memory, it will be allocated via + * the userfaultfd. + */ + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Address returned by mmap() = %p\n", addr); + + /* Register the memory range of the mapping we just created for + * handling by the userfaultfd object. In mode, we request to track + * missing pages (i.e., pages that have not yet been faulted in). + */ + + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(1); + } +} + +int main(void) +{ + int ret = 0; + + int fd = open("/huge/test", O_CREAT | O_RDWR, 0755); + + if (fd < 0) { + perror("Open failed"); + exit(1); + } + + /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ + unsigned long suggested_addr = 0x7eaa40000000; + void *haddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map haddr: Returned address is %p\n", haddr); + if (haddr == MAP_FAILED) { + perror("mmap1"); + exit(1); + } + + /* mmap again to a dummy address to hopefully trigger pmd sharing. */ + suggested_addr = 0x7daa40000000; + void *daddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map daddr: Returned address is %p\n", daddr); + if (daddr == MAP_FAILED) { + perror("mmap3"); + exit(1); + } + + suggested_addr = 0x7faa40000000; + void *vaddr = + mmap((void *)suggested_addr, LENGTH, PROTECTION, FLAGS, -1, 0); + printf("Map vaddr: Returned address is %p\n", vaddr); + if (vaddr == MAP_FAILED) { + perror("mmap2"); + exit(1); + } + + register_region_with_uffd(haddr, LENGTH); + + void *addr = mremap(haddr, LENGTH, LENGTH, + MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); + if (addr == MAP_FAILED) { + perror("mremap"); + exit(1); + } + + printf("Mremap: Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + ret = read_bytes(addr); + + munmap(addr, LENGTH); + + return ret; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 45e803af7c77..a24d30af3094 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -108,6 +108,17 @@ else echo "[PASS]" fi +echo "-----------------------" +echo "running hugepage-mremap" +echo "-----------------------" +./hugepage-mremap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." 
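
The selftest above boils down to one core operation: moving an existing mapping to a caller-chosen destination with mremap(MREMAP_MAYMOVE | MREMAP_FIXED). Below is a minimal standalone sketch of that pattern; it is not part of the patch, and the names and the 4 MB size are illustrative. It uses ordinary anonymous pages so it runs without a hugetlbfs mount:

  /* Sketch: move a mapping to a fixed destination with mremap(). */
  #define _GNU_SOURCE
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mman.h>

  #define LEN (4UL * 1024 * 1024)

  int main(void)
  {
          /* Source mapping, filled with a known pattern. */
          char *src = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          /* Placeholder destination reserving the target address range. */
          char *dst = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (src == MAP_FAILED || dst == MAP_FAILED) {
                  perror("mmap");
                  exit(1);
          }
          memset(src, 0xab, LEN);

          /* MREMAP_FIXED moves src onto dst, atomically replacing it. */
          char *moved = mremap(src, LEN, LEN,
                               MREMAP_MAYMOVE | MREMAP_FIXED, dst);
          if (moved == MAP_FAILED) {
                  perror("mremap");
                  exit(1);
          }

          printf("moved to %p, first byte 0x%x\n",
                 moved, (unsigned char)moved[0]);
          munmap(moved, LEN);
          return 0;
  }

The test in the patch performs the same move, but on a MAP_HUGETLB mapping that has first been registered with userfaultfd, which is what exercises the hugetlb mremap path being added.
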
From 38e719ab26735aa2c5d9d422fc4b741cbd36e700 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 5 Nov 2021 13:41:46 -0700 Subject: [PATCH 386/512] hugetlb: support node specified when using cma for gigantic hugepages Now the size of CMA area for gigantic hugepages runtime allocation is balanced for all online nodes, but we also want to specify the size of CMA per-node, or only one node in some cases, which are similar with patch [1]. For example, on some multi-nodes systems, each node's memory can be different, allocating the same size of CMA for each node is not suitable for the low-memory nodes. Meanwhile some workloads like DPDK mentioned by Zhenguo in patch [1] only need hugepages in one node. On the other hand, we have some machines with multiple types of memory, like DRAM and PMEM (persistent memory). On this system, we may want to specify all the hugepages only on DRAM node, or specify the proportion of DRAM node and PMEM node, to tuning the performance of the workloads. Thus this patch adds node format for 'hugetlb_cma' parameter to support specifying the size of CMA per-node. An example is as follows: hugetlb_cma=0:5G,2:5G which means allocating 5G size of CMA area on node 0 and node 2 respectively. And the users should use the node specific sysfs file to allocate the gigantic hugepages if specified the CMA size on that node. Link: https://lkml.kernel.org/r/20211005054729.86457-1-yaozhenguo1@gmail.com [1] Link: https://lkml.kernel.org/r/bb790775ca60bb8f4b26956bb3f6988f74e075c7.1634261144.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Michal Hocko Cc: Roman Gushchin Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 6 +- mm/hugetlb.c | 86 +++++++++++++++++-- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 43dc35fe5bc0..e7f7904edf20 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1587,8 +1587,10 @@ registers. Default set by CONFIG_HPET_MMAP_DEFAULT. hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation - of gigantic hugepages. - Format: nn[KMGTPE] + of gigantic hugepages. Or using node format, the size + of a CMA area per node can be specified. + Format: nn[KMGTPE] or (node format) + :nn[KMGTPE][,:nn[KMGTPE]] Reserve a CMA area of given size and allocate gigantic hugepages using the CMA allocator. 
If enabled, the diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8028fb7677eb..b86d27870c54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -50,6 +50,7 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; +static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; static bool hugetlb_cma_page(struct page *page, unsigned int order) { return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, @@ -6762,7 +6763,38 @@ static bool cma_reserve_called __initdata; static int __init cmdline_parse_hugetlb_cma(char *p) { - hugetlb_cma_size = memparse(p, &p); + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + nid = tmp; + if (nid < 0 || nid >= MAX_NUMNODES) + break; + + s += count + 1; + tmp = memparse(s, &s); + hugetlb_cma_size_in_node[nid] = tmp; + hugetlb_cma_size += tmp; + + /* + * Skip the separator if have one, otherwise + * break the parsing. + */ + if (*s == ',') + s++; + else + break; + } else { + hugetlb_cma_size = memparse(p, &p); + break; + } + } + return 0; } @@ -6771,10 +6803,36 @@ early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); void __init hugetlb_cma_reserve(int order) { unsigned long size, reserved, per_node; + bool node_specific_cma_alloc = false; int nid; cma_reserve_called = true; + if (!hugetlb_cma_size) + return; + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + if (!node_state(nid, N_ONLINE)) { + pr_warn("hugetlb_cma: invalid node %d specified\n", nid); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + continue; + } + + if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", + nid, (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + } else { + node_specific_cma_alloc = true; + } + } + + /* Validate the CMA size again in case some invalid nodes specified. */ if (!hugetlb_cma_size) return; @@ -6785,20 +6843,30 @@ void __init hugetlb_cma_reserve(int order) return; } - /* - * If 3 GB area is requested on a machine with 4 numa nodes, - * let's allocate 1 GB on first three nodes and ignore the last one. - */ - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); - pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", - hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + if (!node_specific_cma_alloc) { + /* + * If 3 GB area is requested on a machine with 4 numa nodes, + * let's allocate 1 GB on first three nodes and ignore the last one. 
+ */ + per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); + pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", + hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + } reserved = 0; for_each_node_state(nid, N_ONLINE) { int res; char name[CMA_MAX_NAME]; - size = min(per_node, hugetlb_cma_size - reserved); + if (node_specific_cma_alloc) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + size = hugetlb_cma_size_in_node[nid]; + } else { + size = min(per_node, hugetlb_cma_size - reserved); + } + size = round_up(size, PAGE_SIZE << order); snprintf(name, sizeof(name), "hugetlb%d", nid); From b65c23f72e77f87b31cf978fb0915d80875e26a0 Mon Sep 17 00:00:00 2001 From: Ran Jianping Date: Fri, 5 Nov 2021 13:41:49 -0700 Subject: [PATCH 387/512] mm: remove duplicate include in hugepage-mremap.c Remove duplicate includes 'unistd.h' included in '/tools/testing/selftests/vm/hugepage-mremap.c' is duplicated.It is also included on 23 line. Link: https://lkml.kernel.org/r/20211018102336.869726-1-ran.jianping@zte.com.cn Signed-off-by: Ran Jianping Reported-by: Zeal Robot Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/hugepage-mremap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 68b8b8c4b2e6..257df94697a5 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -15,7 +15,6 @@ #include #include /* Definition of O_* constants */ #include /* Definition of SYS_* constants */ -#include #include #include From df8931c89d2edc143fa7f59e049696dce6b151ef Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 5 Nov 2021 13:41:52 -0700 Subject: [PATCH 388/512] hugetlb_cgroup: remove unused hugetlb_cgroup_from_counter macro Patch series "Some cleanups and improvements for hugetlb". This patchset does some cleanups and improvements for hugetlb and hugetlb_cgroup. This patch (of 4): Since commit 726b7bbeafd4 ("hugetlb_cgroup: fix illegal access to memory"), the hugetlb_cgroup_from_counter() macro is not used any more, remove it. Link: https://lkml.kernel.org/r/cover.1634797639.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/f03b29b801fa9942466ab15334ec09988e124ae6.1634797639.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 5383023d0cca..79d93534ef1e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -27,9 +27,6 @@ #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -#define hugetlb_cgroup_from_counter(counter, idx) \ - container_of(counter, struct hugetlb_cgroup, hugepage[idx]) - static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct page_counter * From aa6d2e8cba2dc6f1f3dd39f7a7cc8ac788ad6c1a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 5 Nov 2021 13:41:55 -0700 Subject: [PATCH 389/512] hugetlb: replace the obsolete hugetlb_instantiation_mutex in the comments After commit 8382d914ebf7 ("mm, hugetlb: improve page-fault scalability"), the hugetlb_instantiation_mutex lock had been replaced by hugetlb_fault_mutex_table to serializes faults on the same logical page. Thus update the obsolete hugetlb_instantiation_mutex related comments. 
Link: https://lkml.kernel.org/r/4b3febeae37455ff7b74aa0aad16cc6909cf0926.1634797639.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b86d27870c54..f9c1ec1180d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5014,7 +5014,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, /* * Hugetlb_cow() should be called with page lock of the original hugepage held. - * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ From 0739eb437f3d397eb39b5dc653aa250ee7b453f0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 5 Nov 2021 13:41:58 -0700 Subject: [PATCH 390/512] hugetlb: remove redundant validation in has_same_uncharge_info() The callers of has_same_uncharge_info() has accessed the original file_region and new file_region, and they are impossible to be NULL now. So we can remove the file_region validation in has_same_uncharge_info() to simplify the code. Link: https://lkml.kernel.org/r/97fc68d3f8d34f63c204645e10d7a718997e50b7.1634797639.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9c1ec1180d3..1869ca9f21f3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -332,8 +332,7 @@ static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB - return rg && org && - rg->reservation_counter == org->reservation_counter && + return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else From 76efc67a5e7a3dc1226c4ad1b266a15741347031 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 5 Nov 2021 13:42:01 -0700 Subject: [PATCH 391/512] hugetlb: remove redundant VM_BUG_ON() in add_reservation_in_range() When calling hugetlb_resv_map_add(), we've guaranteed that the parameter 'to' is always larger than 'from', so it never returns a negative value from hugetlb_resv_map_add(). Thus remove the redundant VM_BUG_ON(). 
Link: https://lkml.kernel.org/r/2b565552f3d06753da1e8dda439c0d96d6d9a5a3.1634797639.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Mike Kravetz Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1869ca9f21f3..e304326dba63 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -445,7 +445,6 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); - VM_BUG_ON(add < 0); return add; } From 2c0078a7d820e55ce6a176721a94f3c585833436 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:42:04 -0700 Subject: [PATCH 392/512] hugetlb: remove unnecessary set_page_count in prep_compound_gigantic_page In commit 7118fc2906e2 ("hugetlb: address ref count racing in prep_compound_gigantic_page"), page_ref_freeze is used to atomically zero the ref count of tail pages iff they are 1. The unconditional call to set_page_count(0) was left in the code. This call is after page_ref_freeze so it is really a noop. Remove redundant and unnecessary set_page_count call. Link: https://lkml.kernel.org/r/20211026220635.35187-1-mike.kravetz@oracle.com Fixes: 7118fc2906e29 ("hugetlb: address ref count racing in prep_compound_gigantic_page") Signed-off-by: Mike Kravetz Suggested-by: Pasha Tatashin Reviewed-by: Pasha Tatashin Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e304326dba63..5eab627a170b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1792,7 +1792,6 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, } else { VM_BUG_ON_PAGE(page_count(p), p); } - set_page_count(p, 0); set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); From 1c10e674b35e2c2566b621ceca74064817c249f0 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 5 Nov 2021 13:42:07 -0700 Subject: [PATCH 393/512] userfaultfd/selftests: don't rely on GNU extensions for random numbers Patch series "Small userfaultfd selftest fixups", v2. This patch (of 3): Two arguments for doing this: First, and maybe most importantly, the resulting code is significantly shorter / simpler. Then, we avoid using GNU libc extensions. Why does this matter? It makes testing userfaultfd with the selftest easier e.g. on distros which use something other than glibc (e.g., Alpine, which uses musl); basically, it makes the test more portable. 
Link: https://lkml.kernel.org/r/20210930212309.4001967-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 26 ++++-------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 60aa1a4fc69b..d2acab4411cf 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -518,22 +519,10 @@ static void continue_range(int ufd, __u64 start, __u64 len) static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; - struct random_data rand; unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ - int32_t rand_nr; unsigned long long count; - char randstate[64]; - unsigned int seed; - if (bounces & BOUNCE_RANDOM) { - seed = (unsigned int) time(NULL) - bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - seed += cpu; - bzero(&rand, sizeof(rand)); - bzero(&randstate, sizeof(randstate)); - if (initstate_r(seed, randstate, sizeof(randstate), &rand)) - err("initstate_r failed"); - } else { + if (!(bounces & BOUNCE_RANDOM)) { page_nr = -bounces; if (!(bounces & BOUNCE_RACINGFAULTS)) page_nr += cpu * nr_pages_per_cpu; @@ -541,15 +530,8 @@ static void *locking_thread(void *arg) while (!finished) { if (bounces & BOUNCE_RANDOM) { - if (random_r(&rand, &rand_nr)) - err("random_r failed"); - page_nr = rand_nr; - if (sizeof(page_nr) > sizeof(rand_nr)) { - if (random_r(&rand, &rand_nr)) - err("random_r failed"); - page_nr |= (((unsigned long) rand_nr) << 16) << - 16; - } + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); } else page_nr += 1; page_nr %= nr_pages; From 1042a53d0ec3fb617bcff395ce24b0df90d5a99b Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 5 Nov 2021 13:42:10 -0700 Subject: [PATCH 394/512] userfaultfd/selftests: fix feature support detection Before any tests are run, in set_test_type, we decide what feature(s) we are going to be testing, based upon our command line arguments. However, the supported features are not just a function of the memory type being used, so this is broken. For instance, consider writeprotect support. It is "normally" supported for anonymous memory, but furthermore it requires that the kernel has CONFIG_HAVE_ARCH_USERFAULTFD_WP. So, it is *not* supported at all on aarch64, for example. So, this fixes this by querying the kernel for the set of features it supports in set_test_type, by opening a userfaultfd and issuing a UFFDIO_API ioctl. Based upon the reported features, we toggle what tests are enabled. 
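
The feature probe described above can be reproduced in isolation. The sketch below is an illustration only, not code from the patch; probe_uffd_features() is a hypothetical helper, and UFFD_FEATURE_MINOR_HUGETLBFS / UFFD_FEATURE_PAGEFAULT_FLAG_WP are only defined in reasonably recent linux/userfaultfd.h headers. It opens a userfaultfd, issues UFFDIO_API with no features requested, and reads back the feature mask the kernel reports:

  /* Sketch: ask the kernel which userfaultfd features it supports. */
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/userfaultfd.h>

  static uint64_t probe_uffd_features(void)
  {
          struct uffdio_api api = { .api = UFFD_API, .features = 0 };
          long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

          if (uffd == -1) {
                  perror("userfaultfd");
                  exit(1);
          }
          /* Request no optional features; the returned mask reports
           * everything this kernel supports. */
          if (ioctl(uffd, UFFDIO_API, &api) == -1) {
                  perror("ioctl-UFFDIO_API");
                  exit(1);
          }
          close(uffd);
          return api.features;
  }

  int main(void)
  {
          uint64_t features = probe_uffd_features();

          printf("minor hugetlbfs faults: %s\n",
                 (features & UFFD_FEATURE_MINOR_HUGETLBFS) ? "yes" : "no");
          printf("write-protect faults:   %s\n",
                 (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) ? "yes" : "no");
          return 0;
  }

The patch does the same handshake inside set_test_type() and then masks test_uffdio_wp and test_uffdio_minor against the reported features.
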
Link: https://lkml.kernel.org/r/20210930212309.4001967-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 54 ++++++++++++++---------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index d2acab4411cf..4efdaff2b563 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -346,6 +346,16 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = { static struct uffd_test_ops *uffd_test_ops; +static inline uint64_t uffd_minor_feature(void) +{ + if (test_type == TEST_HUGETLB && map_shared) + return UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + return UFFD_FEATURE_MINOR_SHMEM; + else + return 0; +} + static void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; @@ -406,7 +416,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_dst_alias); } -static void uffd_test_ctx_init_ext(uint64_t *features) +static void uffd_test_ctx_init(uint64_t features) { unsigned long nr, cpu; @@ -415,7 +425,7 @@ static void uffd_test_ctx_init_ext(uint64_t *features) uffd_test_ops->allocate_area((void **)&area_src); uffd_test_ops->allocate_area((void **)&area_dst); - userfaultfd_open(features); + userfaultfd_open(&features); count_verify = malloc(nr_pages * sizeof(unsigned long long)); if (!count_verify) @@ -463,11 +473,6 @@ static void uffd_test_ctx_init_ext(uint64_t *features) err("pipe"); } -static inline void uffd_test_ctx_init(uint64_t features) -{ - uffd_test_ctx_init_ext(&features); -} - static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -1208,7 +1213,6 @@ static int userfaultfd_minor_test(void) void *expected_page; char c; struct uffd_stats stats = { 0 }; - uint64_t req_features, features_out; if (!test_uffdio_minor) return 0; @@ -1216,21 +1220,7 @@ static int userfaultfd_minor_test(void) printf("testing minor faults: "); fflush(stdout); - if (test_type == TEST_HUGETLB) - req_features = UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - req_features = UFFD_FEATURE_MINOR_SHMEM; - else - return 1; - - features_out = req_features; - uffd_test_ctx_init_ext(&features_out); - /* If kernel reports required features aren't supported, skip test. */ - if ((features_out & req_features) != req_features) { - printf("skipping test due to lack of feature support\n"); - fflush(stdout); - return 0; - } + uffd_test_ctx_init(uffd_minor_feature()); uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; @@ -1591,6 +1581,8 @@ unsigned long default_huge_page_size(void) static void set_test_type(const char *type) { + uint64_t features = UFFD_API_FEATURES; + if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; @@ -1624,6 +1616,22 @@ static void set_test_type(const char *type) if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 > page_size) err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. 
+ */ + + userfaultfd_open(&features); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + test_uffdio_minor = test_uffdio_minor && + (features & uffd_minor_feature()); + + close(uffd); + uffd = -1; } static void sigalrm(int sig) From ad0ce23ed099492d7ed1f87cd8cf39a68b9f20a0 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 5 Nov 2021 13:42:13 -0700 Subject: [PATCH 395/512] userfaultfd/selftests: fix calculation of expected ioctls Today, we assert that the ioctls the kernel reports as supported for a registration match a precomputed list. We decide which ioctls are supported by examining the memory type. Then, in several locations we "fix up" this list by adding or removing things this initial decision got wrong. What ioctls the kernel reports is actually a function of several things: - The memory type - Kernel feature support (e.g., no writeprotect on aarch64) - The registration type (e.g., CONTINUE only supported for MINOR mode) So, we can't fully compute this at the start, in set_test_type. It varies per test, depending on what registration mode(s) those tests use. Instead, introduce a new function which computes the correct list. This centralizes the add/remove of ioctls depending on these function inputs in one place, so we don't have to repeat ourselves in various tests. Not only is the resulting code a bit shorter, but it fixes a real bug in the existing code: previously, we would incorrectly require the writeprotect ioctl to be present on aarch64, where it isn't actually supported. Link: https://lkml.kernel.org/r/20210930212309.4001967-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 77 ++++++++++++------------ 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 4efdaff2b563..8a09057d2f22 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -308,37 +308,24 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) } struct uffd_test_ops { - unsigned long expected_ioctls; void (*allocate_area)(void **alloc_area); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); }; -#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE)) - -#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE) | \ - (1 << _UFFDIO_WRITEPROTECT)) - static struct uffd_test_ops anon_uffd_test_ops = { - .expected_ioctls = ANON_EXPECTED_IOCTLS, .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, }; static struct uffd_test_ops shmem_uffd_test_ops = { - .expected_ioctls = SHMEM_EXPECTED_IOCTLS, .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = shmem_alias_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { - .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE), .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, @@ -356,6 +343,33 @@ static inline uint64_t uffd_minor_feature(void) return 0; } +static uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if 
(test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + static void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; @@ -1017,11 +1031,9 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) { struct uffdio_zeropage uffdio_zeropage; int ret; - unsigned long has_zeropage; + bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); __s64 res; - has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); - if (offset >= nr_pages * page_size) err("unexpected offset %lu", offset); uffdio_zeropage.range.start = (unsigned long) area_dst + offset; @@ -1061,7 +1073,6 @@ static int uffdio_zeropage(int ufd, unsigned long offset) static int userfaultfd_zeropage_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); @@ -1076,9 +1087,8 @@ static int userfaultfd_zeropage_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (uffdio_zeropage(uffd, 0)) if (my_bcmp(area_dst, zeropage, page_size)) @@ -1091,7 +1101,6 @@ static int userfaultfd_zeropage_test(void) static int userfaultfd_events_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; pthread_t uffd_mon; int err, features; pid_t pid; @@ -1115,9 +1124,8 @@ static int userfaultfd_events_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) err("uffd_poll_thread create"); @@ -1145,7 +1153,6 @@ static int userfaultfd_events_test(void) static int userfaultfd_sig_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; unsigned long userfaults; pthread_t uffd_mon; int err, features; @@ -1169,9 +1176,8 @@ static int userfaultfd_sig_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (faulting_process(1)) err("faulting process failed"); @@ -1206,7 +1212,6 @@ static int userfaultfd_sig_test(void) static int userfaultfd_minor_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; unsigned long p; pthread_t uffd_mon; uint8_t expected_byte; @@ -1228,10 +1233,8 @@ static int 
userfaultfd_minor_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - expected_ioctls |= 1 << _UFFDIO_CONTINUE; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl(s)"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); /* * After registering with UFFD, populate the non-UFFD-registered side of @@ -1428,8 +1431,6 @@ static int userfaultfd_stress(void) pthread_attr_setstacksize(&attr, 16*1024*1024); while (bounces--) { - unsigned long expected_ioctls; - printf("bounces: %d, mode:", bounces); if (bounces & BOUNCE_RANDOM) printf(" rnd"); @@ -1457,10 +1458,8 @@ static int userfaultfd_stress(void) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (area_dst_alias) { uffdio_register.range.start = (unsigned long) From e1d8c966dbf11afcac1d115f3fca5a29060bba18 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:42:16 -0700 Subject: [PATCH 396/512] mm/page_isolation: fix potential missing call to unset_migratetype_isolate() In start_isolate_page_range() undo path, pfn_to_online_page() just checks the first pfn in a pageblock while __first_valid_page() will traverse the pageblock until the first online pfn is found. So we may miss the call to unset_migratetype_isolate() in undo path and pages will remain isolated unexpectedly. Fix this by calling undo_isolate_page_range() and this will also help to simplify the code further. Note we shouldn't ever trigger it because MAX_ORDER-1 aligned pfn ranges shouldn't contain memory holes now. 
Link: https://lkml.kernel.org/r/20210914114348.15569-1-linmiaohe@huawei.com Fixes: 2ce13640b3f4 ("mm: __first_valid_page skip over offline pages") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a95c2c6562d0..f93cc63d8fa1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -183,7 +183,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, int flags) { unsigned long pfn; - unsigned long undo_pfn; struct page *page; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); @@ -193,25 +192,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page) { - if (set_migratetype_isolate(page, migratetype, flags)) { - undo_pfn = pfn; - goto undo; - } + if (page && set_migratetype_isolate(page, migratetype, flags)) { + undo_isolate_page_range(start_pfn, pfn, migratetype); + return -EBUSY; } } return 0; -undo: - for (pfn = start_pfn; - pfn < undo_pfn; - pfn += pageblock_nr_pages) { - struct page *page = pfn_to_online_page(pfn); - if (!page) - continue; - unset_migratetype_isolate(page, migratetype); - } - - return -EBUSY; } /* From a500cb342c84a4c4696850304124bc801331c4a8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:42:19 -0700 Subject: [PATCH 397/512] mm/page_isolation: guard against possible putback unisolated page Isolating a free page in an isolated pageblock is expected to always work as watermarks don't apply here. But if __isolate_free_page() failed, due to condition changes, the page will be left on the free list. And the page will be put back to free list again via __putback_isolated_page(). This may trigger VM_BUG_ON_PAGE() on page->flags checking in __free_one_page() if PageReported is set. Or we will corrupt the free list because list_add() will be called for pages already on another list. Add a VM_WARN_ON() to complain about this change. Link: https://lkml.kernel.org/r/20210914114508.23725-1-linmiaohe@huawei.com Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: John Hubbard Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f93cc63d8fa1..f67c4c70f17f 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy = page + (buddy_pfn - pfn); if (!is_migrate_isolate_page(buddy)) { - __isolate_free_page(page, order); - isolated_page = true; + isolated_page = !!__isolate_free_page(page, order); + /* + * Isolating a free page in an isolated pageblock + * is expected to always work as watermarks don't + * apply here. 
+ */ + VM_WARN_ON(!isolated_page); } } } From cb75463ca73483e79fbbad60f6fb704ee0489856 Mon Sep 17 00:00:00 2001 From: Kai Song Date: Fri, 5 Nov 2021 13:42:22 -0700 Subject: [PATCH 398/512] mm/vmscan.c: fix -Wunused-but-set-variable warning We fix the following warning when building kernel with W=1: mm/vmscan.c:1362:6: warning: variable 'err' set but not used [-Wunused-but-set-variable] Link: https://lkml.kernel.org/r/20210924181218.21165-1-songkai01@inspur.com Signed-off-by: Kai Song Reviewed-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 74296c2d1fed..9483dd6af550 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1337,7 +1337,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages, { int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; - int err; if (list_empty(demote_pages)) return 0; @@ -1346,7 +1345,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return 0; /* Demotion ignores all cpuset and mempolicy settings */ - err = migrate_pages(demote_pages, alloc_demote_page, NULL, + migrate_pages(demote_pages, alloc_demote_page, NULL, target_nid, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); From 8cd7c588decf470bf7e14f2be93b709f839a965e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:25 -0700 Subject: [PATCH 399/512] mm/vmscan: throttle reclaim until some writeback completes if congested Patch series "Remove dependency on congestion_wait in mm/", v5. This series that removes all calls to congestion_wait in mm/ and deletes wait_iff_congested. It's not a clever implementation but congestion_wait has been broken for a long time [1]. Even if congestion throttling worked, it was never a great idea. While excessive dirty/writeback pages at the tail of the LRU is one possibility that reclaim may be slow, there is also the problem of too many pages being isolated and reclaim failing for other reasons (elevated references, too many pages isolated, excessive LRU contention etc). This series replaces the "congestion" throttling with 3 different types. - If there are too many dirty/writeback pages, sleep until a timeout or enough pages get cleaned - If too many pages are isolated, sleep until enough isolated pages are either reclaimed or put back on the LRU - If no progress is being made, direct reclaim tasks sleep until another task makes progress with acceptable efficiency. This was initially tested with a mix of workloads that used to trigger corner cases that no longer work. A new test case was created called "stutterp" (pagereclaim-stutterp-noreaders in mmtests) using a freshly created XFS filesystem. Note that it may be necessary to increase the timeout of ssh if executing remotely as ssh itself can get throttled and the connection may timeout. stutterp varies the number of "worker" processes from 4 up to NR_CPUS*4 to check the impact as the number of direct reclaimers increase. It has four types of worker. - One "anon latency" worker creates small mappings with mmap() and times how long it takes to fault the mapping reading it 4K at a time - X file writers which is fio randomly writing X files where the total size of the files add up to the allowed dirty_ratio. fio is allowed to run for a warmup period to allow some file-backed pages to accumulate. The duration of the warmup is based on the best-case linear write speed of the storage. 
- Y file readers which is fio randomly reading small files - Z anon memory hogs which continually map (100-dirty_ratio)% of memory - Total estimated WSS = (100+dirty_ration) percentage of memory X+Y+Z+1 == NR_WORKERS varying from 4 up to NR_CPUS*4 The intent is to maximise the total WSS with a mix of file and anon memory where some anonymous memory must be swapped and there is a high likelihood of dirty/writeback pages reaching the end of the LRU. The test can be configured to have no background readers to stress dirty/writeback pages. The results below are based on having zero readers. The short summary of the results is that the series works and stalls until some event occurs but the timeouts may need adjustment. The test results are not broken down by patch as the series should be treated as one block that replaces a broken throttling mechanism with a working one. Finally, three machines were tested but I'm reporting the worst set of results. The other two machines had much better latencies for example. First the results of the "anon latency" latency stutterp 5.15.0-rc1 5.15.0-rc1 vanilla mm-reclaimcongest-v5r4 Amean mmap-4 31.4003 ( 0.00%) 2661.0198 (-8374.52%) Amean mmap-7 38.1641 ( 0.00%) 149.2891 (-291.18%) Amean mmap-12 60.0981 ( 0.00%) 187.8105 (-212.51%) Amean mmap-21 161.2699 ( 0.00%) 213.9107 ( -32.64%) Amean mmap-30 174.5589 ( 0.00%) 377.7548 (-116.41%) Amean mmap-48 8106.8160 ( 0.00%) 1070.5616 ( 86.79%) Stddev mmap-4 41.3455 ( 0.00%) 27573.9676 (-66591.66%) Stddev mmap-7 53.5556 ( 0.00%) 4608.5860 (-8505.23%) Stddev mmap-12 171.3897 ( 0.00%) 5559.4542 (-3143.75%) Stddev mmap-21 1506.6752 ( 0.00%) 5746.2507 (-281.39%) Stddev mmap-30 557.5806 ( 0.00%) 7678.1624 (-1277.05%) Stddev mmap-48 61681.5718 ( 0.00%) 14507.2830 ( 76.48%) Max-90 mmap-4 31.4243 ( 0.00%) 83.1457 (-164.59%) Max-90 mmap-7 41.0410 ( 0.00%) 41.0720 ( -0.08%) Max-90 mmap-12 66.5255 ( 0.00%) 53.9073 ( 18.97%) Max-90 mmap-21 146.7479 ( 0.00%) 105.9540 ( 27.80%) Max-90 mmap-30 193.9513 ( 0.00%) 64.3067 ( 66.84%) Max-90 mmap-48 277.9137 ( 0.00%) 591.0594 (-112.68%) Max mmap-4 1913.8009 ( 0.00%) 299623.9695 (-15555.96%) Max mmap-7 2423.9665 ( 0.00%) 204453.1708 (-8334.65%) Max mmap-12 6845.6573 ( 0.00%) 221090.3366 (-3129.64%) Max mmap-21 56278.6508 ( 0.00%) 213877.3496 (-280.03%) Max mmap-30 19716.2990 ( 0.00%) 216287.6229 (-997.00%) Max mmap-48 477923.9400 ( 0.00%) 245414.8238 ( 48.65%) For most thread counts, the time to mmap() is unfortunately increased. In earlier versions of the series, this was lower but a large number of throttling events were reaching their timeout increasing the amount of inefficient scanning of the LRU. There is no prioritisation of reclaim tasks making progress based on each tasks rate of page allocation versus progress of reclaim. The variance is also impacted for high worker counts but in all cases, the differences in latency are not statistically significant due to very large maximum outliers. Max-90 shows that 90% of the stalls are comparable but the Max results show the massive outliers which are increased to to stalling. It is expected that this will be very machine dependant. Due to the test design, reclaim is difficult so allocations stall and there are variances depending on whether THPs can be allocated or not. The amount of memory will affect exactly how bad the corner cases are and how often they trigger. 
The warmup period calculation is not ideal as it's based on linear writes where as fio is randomly writing multiple files from multiple tasks so the start state of the test is variable. For example, these are the latencies on a single-socket machine that had more memory Amean mmap-4 42.2287 ( 0.00%) 49.6838 * -17.65%* Amean mmap-7 216.4326 ( 0.00%) 47.4451 * 78.08%* Amean mmap-12 2412.0588 ( 0.00%) 51.7497 ( 97.85%) Amean mmap-21 5546.2548 ( 0.00%) 51.8862 ( 99.06%) Amean mmap-30 1085.3121 ( 0.00%) 72.1004 ( 93.36%) The overall system CPU usage and elapsed time is as follows 5.15.0-rc3 5.15.0-rc3 vanilla mm-reclaimcongest-v5r4 Duration User 6989.03 983.42 Duration System 7308.12 799.68 Duration Elapsed 2277.67 2092.98 The patches reduce system CPU usage by 89% as the vanilla kernel is rarely stalling. The high-level /proc/vmstats show 5.15.0-rc1 5.15.0-rc1 vanilla mm-reclaimcongest-v5r2 Ops Direct pages scanned 1056608451.00 503594991.00 Ops Kswapd pages scanned 109795048.00 147289810.00 Ops Kswapd pages reclaimed 63269243.00 31036005.00 Ops Direct pages reclaimed 10803973.00 6328887.00 Ops Kswapd efficiency % 57.62 21.07 Ops Kswapd velocity 48204.98 57572.86 Ops Direct efficiency % 1.02 1.26 Ops Direct velocity 463898.83 196845.97 Kswapd scanned less pages but the detailed pattern is different. The vanilla kernel scans slowly over time where as the patches exhibits burst patterns of scan activity. Direct reclaim scanning is reduced by 52% due to stalling. The pattern for stealing pages is also slightly different. Both kernels exhibit spikes but the vanilla kernel when reclaiming shows pages being reclaimed over a period of time where as the patches tend to reclaim in spikes. The difference is that vanilla is not throttling and instead scanning constantly finding some pages over time where as the patched kernel throttles and reclaims in spikes. Ops Percentage direct scans 90.59 77.37 For direct reclaim, vanilla scanned 90.59% of pages where as with the patches, 77.37% were direct reclaim due to throttling Ops Page writes by reclaim 2613590.00 1687131.00 Page writes from reclaim context are reduced. Ops Page writes anon 2932752.00 1917048.00 And there is less swapping. Ops Page reclaim immediate 996248528.00 107664764.00 The number of pages encountered at the tail of the LRU tagged for immediate reclaim but still dirty/writeback is reduced by 89%. Ops Slabs scanned 164284.00 153608.00 Slab scan activity is similar. ftrace was used to gather stall activity Vanilla ------- 1 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=16000 2 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=12000 8 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=8000 29 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=4000 82394 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=0 The fast majority of wait_iff_congested calls do not stall at all. What is likely happening is that cond_resched() reschedules the task for a short period when the BDI is not registering congestion (which it never will in this test setup). 1 writeback_congestion_wait: usec_timeout=100000 usec_delayed=120000 2 writeback_congestion_wait: usec_timeout=100000 usec_delayed=132000 4 writeback_congestion_wait: usec_timeout=100000 usec_delayed=112000 380 writeback_congestion_wait: usec_timeout=100000 usec_delayed=108000 778 writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000 congestion_wait if called always exceeds the timeout as there is no trigger to wake it up. 
Bottom line: Vanilla will throttle but it's not effective. Patch series ------------ Kswapd throttle activity was always due to scanning pages tagged for immediate reclaim at the tail of the LRU 1 usec_timeout=100000 usect_delayed=72000 reason=VMSCAN_THROTTLE_WRITEBACK 4 usec_timeout=100000 usect_delayed=20000 reason=VMSCAN_THROTTLE_WRITEBACK 5 usec_timeout=100000 usect_delayed=12000 reason=VMSCAN_THROTTLE_WRITEBACK 6 usec_timeout=100000 usect_delayed=16000 reason=VMSCAN_THROTTLE_WRITEBACK 11 usec_timeout=100000 usect_delayed=100000 reason=VMSCAN_THROTTLE_WRITEBACK 11 usec_timeout=100000 usect_delayed=8000 reason=VMSCAN_THROTTLE_WRITEBACK 94 usec_timeout=100000 usect_delayed=0 reason=VMSCAN_THROTTLE_WRITEBACK 112 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK The majority of events did not stall or stalled for a short period. Roughly 16% of stalls reached the timeout before expiry. For direct reclaim, the number of times stalled for each reason were 6624 reason=VMSCAN_THROTTLE_ISOLATED 93246 reason=VMSCAN_THROTTLE_NOPROGRESS 96934 reason=VMSCAN_THROTTLE_WRITEBACK The most common reason to stall was due to excessive pages tagged for immediate reclaim at the tail of the LRU followed by a failure to make forward. A relatively small number were due to too many pages isolated from the LRU by parallel threads For VMSCAN_THROTTLE_ISOLATED, the breakdown of delays was 9 usec_timeout=20000 usect_delayed=4000 reason=VMSCAN_THROTTLE_ISOLATED 12 usec_timeout=20000 usect_delayed=16000 reason=VMSCAN_THROTTLE_ISOLATED 83 usec_timeout=20000 usect_delayed=20000 reason=VMSCAN_THROTTLE_ISOLATED 6520 usec_timeout=20000 usect_delayed=0 reason=VMSCAN_THROTTLE_ISOLATED Most did not stall at all. A small number reached the timeout. For VMSCAN_THROTTLE_NOPROGRESS, the breakdown of stalls were all over the map 1 usec_timeout=500000 usect_delayed=324000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usec_timeout=500000 usect_delayed=332000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usec_timeout=500000 usect_delayed=348000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usec_timeout=500000 usect_delayed=360000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=228000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=260000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=340000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=364000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=372000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=428000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=460000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usec_timeout=500000 usect_delayed=464000 reason=VMSCAN_THROTTLE_NOPROGRESS 3 usec_timeout=500000 usect_delayed=244000 reason=VMSCAN_THROTTLE_NOPROGRESS 3 usec_timeout=500000 usect_delayed=252000 reason=VMSCAN_THROTTLE_NOPROGRESS 3 usec_timeout=500000 usect_delayed=272000 reason=VMSCAN_THROTTLE_NOPROGRESS 4 usec_timeout=500000 usect_delayed=188000 reason=VMSCAN_THROTTLE_NOPROGRESS 4 usec_timeout=500000 usect_delayed=268000 reason=VMSCAN_THROTTLE_NOPROGRESS 4 usec_timeout=500000 usect_delayed=328000 reason=VMSCAN_THROTTLE_NOPROGRESS 4 usec_timeout=500000 usect_delayed=380000 reason=VMSCAN_THROTTLE_NOPROGRESS 4 usec_timeout=500000 usect_delayed=392000 reason=VMSCAN_THROTTLE_NOPROGRESS 4 usec_timeout=500000 usect_delayed=432000 reason=VMSCAN_THROTTLE_NOPROGRESS 5 usec_timeout=500000 usect_delayed=204000 reason=VMSCAN_THROTTLE_NOPROGRESS 5 
usec_timeout=500000 usect_delayed=220000 reason=VMSCAN_THROTTLE_NOPROGRESS 5 usec_timeout=500000 usect_delayed=412000 reason=VMSCAN_THROTTLE_NOPROGRESS 5 usec_timeout=500000 usect_delayed=436000 reason=VMSCAN_THROTTLE_NOPROGRESS 6 usec_timeout=500000 usect_delayed=488000 reason=VMSCAN_THROTTLE_NOPROGRESS 7 usec_timeout=500000 usect_delayed=212000 reason=VMSCAN_THROTTLE_NOPROGRESS 7 usec_timeout=500000 usect_delayed=300000 reason=VMSCAN_THROTTLE_NOPROGRESS 7 usec_timeout=500000 usect_delayed=316000 reason=VMSCAN_THROTTLE_NOPROGRESS 7 usec_timeout=500000 usect_delayed=472000 reason=VMSCAN_THROTTLE_NOPROGRESS 8 usec_timeout=500000 usect_delayed=248000 reason=VMSCAN_THROTTLE_NOPROGRESS 8 usec_timeout=500000 usect_delayed=356000 reason=VMSCAN_THROTTLE_NOPROGRESS 8 usec_timeout=500000 usect_delayed=456000 reason=VMSCAN_THROTTLE_NOPROGRESS 9 usec_timeout=500000 usect_delayed=124000 reason=VMSCAN_THROTTLE_NOPROGRESS 9 usec_timeout=500000 usect_delayed=376000 reason=VMSCAN_THROTTLE_NOPROGRESS 9 usec_timeout=500000 usect_delayed=484000 reason=VMSCAN_THROTTLE_NOPROGRESS 10 usec_timeout=500000 usect_delayed=172000 reason=VMSCAN_THROTTLE_NOPROGRESS 10 usec_timeout=500000 usect_delayed=420000 reason=VMSCAN_THROTTLE_NOPROGRESS 10 usec_timeout=500000 usect_delayed=452000 reason=VMSCAN_THROTTLE_NOPROGRESS 11 usec_timeout=500000 usect_delayed=256000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=112000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=116000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=144000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=152000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=264000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=384000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=424000 reason=VMSCAN_THROTTLE_NOPROGRESS 12 usec_timeout=500000 usect_delayed=492000 reason=VMSCAN_THROTTLE_NOPROGRESS 13 usec_timeout=500000 usect_delayed=184000 reason=VMSCAN_THROTTLE_NOPROGRESS 13 usec_timeout=500000 usect_delayed=444000 reason=VMSCAN_THROTTLE_NOPROGRESS 14 usec_timeout=500000 usect_delayed=308000 reason=VMSCAN_THROTTLE_NOPROGRESS 14 usec_timeout=500000 usect_delayed=440000 reason=VMSCAN_THROTTLE_NOPROGRESS 14 usec_timeout=500000 usect_delayed=476000 reason=VMSCAN_THROTTLE_NOPROGRESS 16 usec_timeout=500000 usect_delayed=140000 reason=VMSCAN_THROTTLE_NOPROGRESS 17 usec_timeout=500000 usect_delayed=232000 reason=VMSCAN_THROTTLE_NOPROGRESS 17 usec_timeout=500000 usect_delayed=240000 reason=VMSCAN_THROTTLE_NOPROGRESS 17 usec_timeout=500000 usect_delayed=280000 reason=VMSCAN_THROTTLE_NOPROGRESS 18 usec_timeout=500000 usect_delayed=404000 reason=VMSCAN_THROTTLE_NOPROGRESS 20 usec_timeout=500000 usect_delayed=148000 reason=VMSCAN_THROTTLE_NOPROGRESS 20 usec_timeout=500000 usect_delayed=216000 reason=VMSCAN_THROTTLE_NOPROGRESS 20 usec_timeout=500000 usect_delayed=468000 reason=VMSCAN_THROTTLE_NOPROGRESS 21 usec_timeout=500000 usect_delayed=448000 reason=VMSCAN_THROTTLE_NOPROGRESS 23 usec_timeout=500000 usect_delayed=168000 reason=VMSCAN_THROTTLE_NOPROGRESS 23 usec_timeout=500000 usect_delayed=296000 reason=VMSCAN_THROTTLE_NOPROGRESS 25 usec_timeout=500000 usect_delayed=132000 reason=VMSCAN_THROTTLE_NOPROGRESS 25 usec_timeout=500000 usect_delayed=352000 reason=VMSCAN_THROTTLE_NOPROGRESS 26 usec_timeout=500000 usect_delayed=180000 reason=VMSCAN_THROTTLE_NOPROGRESS 27 usec_timeout=500000 usect_delayed=284000 
reason=VMSCAN_THROTTLE_NOPROGRESS 28 usec_timeout=500000 usect_delayed=164000 reason=VMSCAN_THROTTLE_NOPROGRESS 29 usec_timeout=500000 usect_delayed=136000 reason=VMSCAN_THROTTLE_NOPROGRESS 30 usec_timeout=500000 usect_delayed=200000 reason=VMSCAN_THROTTLE_NOPROGRESS 30 usec_timeout=500000 usect_delayed=400000 reason=VMSCAN_THROTTLE_NOPROGRESS 31 usec_timeout=500000 usect_delayed=196000 reason=VMSCAN_THROTTLE_NOPROGRESS 32 usec_timeout=500000 usect_delayed=156000 reason=VMSCAN_THROTTLE_NOPROGRESS 33 usec_timeout=500000 usect_delayed=224000 reason=VMSCAN_THROTTLE_NOPROGRESS 35 usec_timeout=500000 usect_delayed=128000 reason=VMSCAN_THROTTLE_NOPROGRESS 35 usec_timeout=500000 usect_delayed=176000 reason=VMSCAN_THROTTLE_NOPROGRESS 36 usec_timeout=500000 usect_delayed=368000 reason=VMSCAN_THROTTLE_NOPROGRESS 36 usec_timeout=500000 usect_delayed=496000 reason=VMSCAN_THROTTLE_NOPROGRESS 37 usec_timeout=500000 usect_delayed=312000 reason=VMSCAN_THROTTLE_NOPROGRESS 38 usec_timeout=500000 usect_delayed=304000 reason=VMSCAN_THROTTLE_NOPROGRESS 40 usec_timeout=500000 usect_delayed=288000 reason=VMSCAN_THROTTLE_NOPROGRESS 43 usec_timeout=500000 usect_delayed=408000 reason=VMSCAN_THROTTLE_NOPROGRESS 55 usec_timeout=500000 usect_delayed=416000 reason=VMSCAN_THROTTLE_NOPROGRESS 56 usec_timeout=500000 usect_delayed=76000 reason=VMSCAN_THROTTLE_NOPROGRESS 58 usec_timeout=500000 usect_delayed=120000 reason=VMSCAN_THROTTLE_NOPROGRESS 59 usec_timeout=500000 usect_delayed=208000 reason=VMSCAN_THROTTLE_NOPROGRESS 61 usec_timeout=500000 usect_delayed=68000 reason=VMSCAN_THROTTLE_NOPROGRESS 71 usec_timeout=500000 usect_delayed=192000 reason=VMSCAN_THROTTLE_NOPROGRESS 71 usec_timeout=500000 usect_delayed=480000 reason=VMSCAN_THROTTLE_NOPROGRESS 79 usec_timeout=500000 usect_delayed=60000 reason=VMSCAN_THROTTLE_NOPROGRESS 82 usec_timeout=500000 usect_delayed=320000 reason=VMSCAN_THROTTLE_NOPROGRESS 82 usec_timeout=500000 usect_delayed=92000 reason=VMSCAN_THROTTLE_NOPROGRESS 85 usec_timeout=500000 usect_delayed=64000 reason=VMSCAN_THROTTLE_NOPROGRESS 85 usec_timeout=500000 usect_delayed=80000 reason=VMSCAN_THROTTLE_NOPROGRESS 88 usec_timeout=500000 usect_delayed=84000 reason=VMSCAN_THROTTLE_NOPROGRESS 90 usec_timeout=500000 usect_delayed=160000 reason=VMSCAN_THROTTLE_NOPROGRESS 90 usec_timeout=500000 usect_delayed=292000 reason=VMSCAN_THROTTLE_NOPROGRESS 94 usec_timeout=500000 usect_delayed=56000 reason=VMSCAN_THROTTLE_NOPROGRESS 118 usec_timeout=500000 usect_delayed=88000 reason=VMSCAN_THROTTLE_NOPROGRESS 119 usec_timeout=500000 usect_delayed=72000 reason=VMSCAN_THROTTLE_NOPROGRESS 126 usec_timeout=500000 usect_delayed=108000 reason=VMSCAN_THROTTLE_NOPROGRESS 146 usec_timeout=500000 usect_delayed=52000 reason=VMSCAN_THROTTLE_NOPROGRESS 148 usec_timeout=500000 usect_delayed=36000 reason=VMSCAN_THROTTLE_NOPROGRESS 148 usec_timeout=500000 usect_delayed=48000 reason=VMSCAN_THROTTLE_NOPROGRESS 159 usec_timeout=500000 usect_delayed=28000 reason=VMSCAN_THROTTLE_NOPROGRESS 178 usec_timeout=500000 usect_delayed=44000 reason=VMSCAN_THROTTLE_NOPROGRESS 183 usec_timeout=500000 usect_delayed=40000 reason=VMSCAN_THROTTLE_NOPROGRESS 237 usec_timeout=500000 usect_delayed=100000 reason=VMSCAN_THROTTLE_NOPROGRESS 266 usec_timeout=500000 usect_delayed=32000 reason=VMSCAN_THROTTLE_NOPROGRESS 313 usec_timeout=500000 usect_delayed=24000 reason=VMSCAN_THROTTLE_NOPROGRESS 347 usec_timeout=500000 usect_delayed=96000 reason=VMSCAN_THROTTLE_NOPROGRESS 470 usec_timeout=500000 usect_delayed=20000 reason=VMSCAN_THROTTLE_NOPROGRESS 559 
usec_timeout=500000 usect_delayed=16000 reason=VMSCAN_THROTTLE_NOPROGRESS 964 usec_timeout=500000 usect_delayed=12000 reason=VMSCAN_THROTTLE_NOPROGRESS 2001 usec_timeout=500000 usect_delayed=104000 reason=VMSCAN_THROTTLE_NOPROGRESS 2447 usec_timeout=500000 usect_delayed=8000 reason=VMSCAN_THROTTLE_NOPROGRESS 7888 usec_timeout=500000 usect_delayed=4000 reason=VMSCAN_THROTTLE_NOPROGRESS 22727 usec_timeout=500000 usect_delayed=0 reason=VMSCAN_THROTTLE_NOPROGRESS 51305 usec_timeout=500000 usect_delayed=500000 reason=VMSCAN_THROTTLE_NOPROGRESS The full timeout is often hit but a large number also do not stall at all. The remainder slept a little allowing other reclaim tasks to make progress. While this timeout could be further increased, it could also negatively impact worst-case behaviour when there is no prioritisation of what task should make progress. For VMSCAN_THROTTLE_WRITEBACK, the breakdown was 1 usec_timeout=100000 usect_delayed=44000 reason=VMSCAN_THROTTLE_WRITEBACK 2 usec_timeout=100000 usect_delayed=76000 reason=VMSCAN_THROTTLE_WRITEBACK 3 usec_timeout=100000 usect_delayed=80000 reason=VMSCAN_THROTTLE_WRITEBACK 5 usec_timeout=100000 usect_delayed=48000 reason=VMSCAN_THROTTLE_WRITEBACK 5 usec_timeout=100000 usect_delayed=84000 reason=VMSCAN_THROTTLE_WRITEBACK 6 usec_timeout=100000 usect_delayed=72000 reason=VMSCAN_THROTTLE_WRITEBACK 7 usec_timeout=100000 usect_delayed=88000 reason=VMSCAN_THROTTLE_WRITEBACK 11 usec_timeout=100000 usect_delayed=56000 reason=VMSCAN_THROTTLE_WRITEBACK 12 usec_timeout=100000 usect_delayed=64000 reason=VMSCAN_THROTTLE_WRITEBACK 16 usec_timeout=100000 usect_delayed=92000 reason=VMSCAN_THROTTLE_WRITEBACK 24 usec_timeout=100000 usect_delayed=68000 reason=VMSCAN_THROTTLE_WRITEBACK 28 usec_timeout=100000 usect_delayed=32000 reason=VMSCAN_THROTTLE_WRITEBACK 30 usec_timeout=100000 usect_delayed=60000 reason=VMSCAN_THROTTLE_WRITEBACK 30 usec_timeout=100000 usect_delayed=96000 reason=VMSCAN_THROTTLE_WRITEBACK 32 usec_timeout=100000 usect_delayed=52000 reason=VMSCAN_THROTTLE_WRITEBACK 42 usec_timeout=100000 usect_delayed=40000 reason=VMSCAN_THROTTLE_WRITEBACK 77 usec_timeout=100000 usect_delayed=28000 reason=VMSCAN_THROTTLE_WRITEBACK 99 usec_timeout=100000 usect_delayed=36000 reason=VMSCAN_THROTTLE_WRITEBACK 137 usec_timeout=100000 usect_delayed=24000 reason=VMSCAN_THROTTLE_WRITEBACK 190 usec_timeout=100000 usect_delayed=20000 reason=VMSCAN_THROTTLE_WRITEBACK 339 usec_timeout=100000 usect_delayed=16000 reason=VMSCAN_THROTTLE_WRITEBACK 518 usec_timeout=100000 usect_delayed=12000 reason=VMSCAN_THROTTLE_WRITEBACK 852 usec_timeout=100000 usect_delayed=8000 reason=VMSCAN_THROTTLE_WRITEBACK 3359 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK 7147 usec_timeout=100000 usect_delayed=0 reason=VMSCAN_THROTTLE_WRITEBACK 83962 usec_timeout=100000 usect_delayed=100000 reason=VMSCAN_THROTTLE_WRITEBACK The majority hit the timeout in direct reclaim context although a sizable number did not stall at all. This is very different to kswapd where only a tiny percentage of stalls due to writeback reached the timeout. Bottom line, the throttling appears to work and the wakeup events may limit worst case stalls. There might be some grounds for adjusting timeouts but it's likely futile as the worst-case scenarios depend on the workload, memory size and the speed of the storage. A better approach to improve the series further would be to prioritise tasks based on their rate of allocation with the caveat that it may be very expensive to track. 
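
To make the throttling idea above concrete before the per-patch description below, here is a simplified userspace sketch of the sleep-until-timeout-or-progress pattern. It is an illustration only, not the mm/vmscan.c implementation; throttle(), note_written() and nr_written are stand-ins for the kernel's per-node wait queues and vmstat accounting:

  /* Sketch: block until a progress counter reaches a target or a
   * timeout expires; the "writeback completion" side wakes waiters. */
  #include <pthread.h>
  #include <stdio.h>
  #include <time.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t reclaim_wait = PTHREAD_COND_INITIALIZER;
  static long nr_written;         /* pages written back so far */

  /* Sleep until nr_written reaches target or timeout_ms elapses. */
  static void throttle(long target, long timeout_ms)
  {
          struct timespec deadline;

          clock_gettime(CLOCK_REALTIME, &deadline);
          deadline.tv_sec += timeout_ms / 1000;
          deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
          if (deadline.tv_nsec >= 1000000000L) {
                  deadline.tv_sec++;
                  deadline.tv_nsec -= 1000000000L;
          }

          pthread_mutex_lock(&lock);
          while (nr_written < target) {
                  /* Non-zero return (ETIMEDOUT): full stall, give up. */
                  if (pthread_cond_timedwait(&reclaim_wait, &lock, &deadline))
                          break;
          }
          pthread_mutex_unlock(&lock);
  }

  /* Writeback-completion side: account progress and wake throttled tasks. */
  static void note_written(long nr)
  {
          pthread_mutex_lock(&lock);
          nr_written += nr;
          pthread_cond_broadcast(&reclaim_wait);
          pthread_mutex_unlock(&lock);
  }

  int main(void)
  {
          note_written(32);       /* simulate writeback completing */
          throttle(16, 100);      /* returns immediately: target already met */
          printf("nr_written=%ld\n", nr_written);
          return 0;
  }

The kernel version in this series uses per-node reclaim_wait wait queues and an NR_THROTTLED_WRITTEN counter (see the mmzone.h hunk below) rather than a mutex and condition variable, but the control flow is the same: sleep with a timeout, wake early when enough writeback completes.
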
This patch (of 5): Page reclaim throttles on wait_iff_congested under the following conditions: - kswapd is encountering pages under writeback and marked for immediate reclaim implying that pages are cycling through the LRU faster than pages can be cleaned. - Direct reclaim will stall if all dirty pages are backed by congested inodes. wait_iff_congested is almost completely broken with few exceptions. This patch adds a new node-based workqueue and tracks the number of throttled tasks and pages written back since throttling started. If enough pages belonging to the node are written back then the throttled tasks will wake early. If not, the throttled tasks sleeps until the timeout expires. [neilb@suse.de: Uninterruptible sleep and simpler wakeups] [hdanton@sina.com: Avoid race when reclaim starts] [vbabka@suse.cz: vmstat irq-safe api, clarifications] Link: https://lore.kernel.org/linux-mm/45d8b7a6-8548-65f5-cccf-9f451d4ae3d4@kernel.dk/ [1] Link: https://lkml.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20211022144651.19914-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: NeilBrown Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Matthew Wilcox Cc: Michal Hocko Cc: Dave Chinner Cc: Rik van Riel Cc: Johannes Weiner Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 1 - include/linux/mmzone.h | 13 +++++ include/trace/events/vmscan.h | 34 +++++++++++++ include/trace/events/writeback.h | 7 --- mm/backing-dev.c | 48 ------------------- mm/filemap.c | 1 + mm/internal.h | 11 +++++ mm/page_alloc.c | 5 ++ mm/vmscan.c | 82 +++++++++++++++++++++++++++----- mm/vmstat.c | 1 + 10 files changed, 135 insertions(+), 68 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ac7f231b8825..9fb1f0ae273c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -154,7 +154,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(int sync, long timeout); static inline bool mapping_can_writeback(struct address_space *mapping) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb36a29e3aae..5a5e19b90dab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -199,6 +199,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ @@ -272,6 +273,11 @@ enum lru_list { NR_LRU_LISTS }; +enum vmscan_throttle_state { + VMSCAN_THROTTLE_WRITEBACK, + NR_VMSCAN_THROTTLE, +}; + #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) @@ -841,6 +847,13 @@ typedef struct pglist_data { int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; + + /* workqueues for throttling reclaim for different reasons. 
*/ + wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; + + atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ + unsigned long nr_reclaim_start; /* nr pages written while throttled + * when throttling started. */ struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 88faf2400ec2..c317f9fe0d17 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -27,6 +27,14 @@ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" +#define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) + +#define show_throttle_flags(flags) \ + (flags) ? __print_flags(flags, "|", \ + {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"} \ + ) : "VMSCAN_THROTTLE_NONE" + + #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ @@ -454,6 +462,32 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, TP_ARGS(nr_reclaimed) ); +TRACE_EVENT(mm_vmscan_throttled, + + TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason), + + TP_ARGS(nid, usec_timeout, usec_delayed, reason), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, usec_timeout) + __field(int, usec_delayed) + __field(int, reason) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->usec_timeout = usec_timeout; + __entry->usec_delayed = usec_delayed; + __entry->reason = 1U << reason; + ), + + TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s", + __entry->nid, + __entry->usec_timeout, + __entry->usec_delayed, + show_throttle_flags(__entry->reason)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 840d1ba84cf5..3bc759b81897 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -763,13 +763,6 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_ARGS(usec_timeout, usec_delayed) ); -DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, - - TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), - - TP_ARGS(usec_timeout, usec_delayed) -); - DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5ccb25089808..3d2983752e24 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1038,51 +1038,3 @@ long congestion_wait(int sync, long timeout) return ret; } EXPORT_SYMBOL(congestion_wait); - -/** - * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes - * @sync: SYNC or ASYNC IO - * @timeout: timeout in jiffies - * - * In the event of a congested backing_dev (any backing_dev) this waits - * for up to @timeout jiffies for either a BDI to exit congestion of the - * given @sync queue or a write to complete. - * - * The return value is 0 if the sleep is for the full timeout. Otherwise, - * it is the number of jiffies that were still remaining when the function - * returned. return_value == timeout implies the function did not sleep. 
- */ -long wait_iff_congested(int sync, long timeout) -{ - long ret; - unsigned long start = jiffies; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[sync]; - - /* - * If there is no congestion, yield if necessary instead - * of sleeping on the congestion queue - */ - if (atomic_read(&nr_wb_congested[sync]) == 0) { - cond_resched(); - - /* In case we scheduled, work out time remaining */ - ret = timeout - (jiffies - start); - if (ret < 0) - ret = 0; - - goto out; - } - - /* Sleep until uncongested or a write happens */ - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - -out: - trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), - jiffies_to_usecs(jiffies - start)); - - return ret; -} -EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/filemap.c b/mm/filemap.c index 6f2c4bd6a571..b6140debc2da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1612,6 +1612,7 @@ void end_page_writeback(struct page *page) smp_mb__after_atomic(); wake_up_page(page, PG_writeback); + acct_reclaim_writeback(page); put_page(page); } EXPORT_SYMBOL(end_page_writeback); diff --git a/mm/internal.h b/mm/internal.h index 6c3e1a9f8c5a..a59b5626f968 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -34,6 +34,17 @@ void page_writeback_init(void); +void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page, + int nr_throttled); +static inline void acct_reclaim_writeback(struct page *page) +{ + pg_data_t *pgdat = page_pgdat(page); + int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); + + if (nr_throttled) + __acct_reclaim_writeback(pgdat, page, nr_throttled); +} + vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09a0f1c5d5d2..0f1f1f353211 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7408,6 +7408,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { + int i; + pgdat_resize_init(pgdat); pgdat_init_split_queue(pgdat); @@ -7416,6 +7418,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); + for (i = 0; i < NR_VMSCAN_THROTTLE; i++) + init_waitqueue_head(&pgdat->reclaim_wait[i]); + pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9483dd6af550..09beeb55546c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1006,6 +1006,64 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +static void +reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, + long timeout) +{ + wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; + long ret; + DEFINE_WAIT(wait); + + /* + * Do not throttle IO workers, kthreads other than kswapd or + * workqueues. They may be required for reclaim to make + * forward progress (e.g. journalling workqueues or kthreads). 
+ */ + if (!current_is_kswapd() && + current->flags & (PF_IO_WORKER|PF_KTHREAD)) + return; + + if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { + WRITE_ONCE(pgdat->nr_reclaim_start, + node_page_state(pgdat, NR_THROTTLED_WRITTEN)); + } + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = schedule_timeout(timeout); + finish_wait(wqh, &wait); + atomic_dec(&pgdat->nr_writeback_throttled); + + trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), + jiffies_to_usecs(timeout - ret), + reason); +} + +/* + * Account for pages written if tasks are throttled waiting on dirty + * pages to clean. If enough pages have been cleaned since throttling + * started then wakeup the throttled tasks. + */ +void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page, + int nr_throttled) +{ + unsigned long nr_written; + + inc_node_page_state(page, NR_THROTTLED_WRITTEN); + + /* + * This is an inaccurate read as the per-cpu deltas may not + * be synchronised. However, given that the system is + * writeback throttled, it is not worth taking the penalty + * of getting an accurate count. At worst, the throttle + * timeout guarantees forward progress. + */ + nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - + READ_ONCE(pgdat->nr_reclaim_start); + + if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) + wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); +} + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -1411,9 +1469,8 @@ retry: /* * The number of dirty pages determines if a node is marked - * reclaim_congested which affects wait_iff_congested. kswapd - * will stall and start writing pages if the tail of the LRU - * is all dirty unqueued pages. + * reclaim_congested. kswapd will stall and start writing + * pages if the tail of the LRU is all dirty unqueued pages. */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) @@ -3179,19 +3236,19 @@ again: * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU - * faster than they are written so also forcibly stall. + * faster than they are written so forcibly stall + * until some pages complete writeback. */ if (sc->nr.immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10); } /* - * Tag a node/memcg as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. + * Tag a node/memcg as congested if all the dirty pages were marked + * for writeback and immediate reclaim (counted in nr.congested). * * Legacy memcg will stall in page writeback so avoid forcibly - * stalling in wait_iff_congested(). + * stalling in reclaim_throttle(). */ if ((current_is_kswapd() || (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && @@ -3199,15 +3256,15 @@ again: set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); /* - * Stall direct reclaim for IO completions if underlying BDIs - * and node is congested. Allow kswapd to continue until it + * Stall direct reclaim for IO completions if the lruvec is + * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. 
*/ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) - wait_iff_congested(BLK_RW_ASYNC, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc)) @@ -4285,6 +4342,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { bool ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index ae87c90e0b4e..0f4643e5324d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1225,6 +1225,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_throttled_written", "nr_kernel_misc_reclaimable", "nr_foll_pin_acquired", "nr_foll_pin_released", From d818fca1cac31b1fc9301bda83e195a46fb4ebaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:29 -0700 Subject: [PATCH 400/512] mm/vmscan: throttle reclaim and compaction when too may pages are isolated Page reclaim throttles on congestion if too many parallel reclaim instances have isolated too many pages. This makes no sense, excessive parallelisation has nothing to do with writeback or congestion. This patch creates an additional workqueue to sleep on when too many pages are isolated. The throttled tasks are woken when the number of isolated pages is reduced or a timeout occurs. There may be some false positive wakeups for GFP_NOIO/GFP_NOFS callers but the tasks will throttle again if necessary. [shy828301@gmail.com: Wake up from compaction context] [vbabka@suse.cz: Account number of throttled tasks only for writeback] Link: https://lkml.kernel.org/r/20211022144651.19914-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + include/trace/events/vmscan.h | 4 +++- mm/compaction.c | 10 ++++++++-- mm/internal.h | 11 +++++++++++ mm/vmscan.c | 22 ++++++++++++++++------ 5 files changed, 39 insertions(+), 9 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a5e19b90dab..312c1ea9aafa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -275,6 +275,7 @@ enum lru_list { enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, + VMSCAN_THROTTLE_ISOLATED, NR_VMSCAN_THROTTLE, }; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c317f9fe0d17..d4905bd9e9c4 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -28,10 +28,12 @@ ) : "RECLAIM_WB_NONE" #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) +#define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) #define show_throttle_flags(flags) \ (flags) ? 
__print_flags(flags, "|", \ - {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"} \ + {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ + {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"} \ ) : "VMSCAN_THROTTLE_NONE" diff --git a/mm/compaction.c b/mm/compaction.c index bfc93da1c2c7..7359093d8ac0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -761,6 +761,8 @@ isolate_freepages_range(struct compact_control *cc, /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(pg_data_t *pgdat) { + bool too_many; + unsigned long active, inactive, isolated; inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + @@ -770,7 +772,11 @@ static bool too_many_isolated(pg_data_t *pgdat) isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + node_page_state(pgdat, NR_ISOLATED_ANON); - return isolated > (inactive + active) / 2; + too_many = isolated > (inactive + active) / 2; + if (!too_many) + wake_throttle_isolated(pgdat); + + return too_many; } /** @@ -822,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (cc->mode == MIGRATE_ASYNC) return -EAGAIN; - congestion_wait(BLK_RW_ASYNC, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10); if (fatal_signal_pending(current)) return -EINTR; diff --git a/mm/internal.h b/mm/internal.h index a59b5626f968..7dfe74f827bf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -45,6 +45,15 @@ static inline void acct_reclaim_writeback(struct page *page) __acct_reclaim_writeback(pgdat, page, nr_throttled); } +static inline void wake_throttle_isolated(pg_data_t *pgdat) +{ + wait_queue_head_t *wqh; + + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; + if (waitqueue_active(wqh)) + wake_up(wqh); +} + vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -121,6 +130,8 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, + long timeout); /* * in mm/rmap.c: diff --git a/mm/vmscan.c b/mm/vmscan.c index 09beeb55546c..7bfd62f81e16 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1006,12 +1006,12 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -static void -reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, long timeout) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; long ret; + bool acct_writeback = (reason == VMSCAN_THROTTLE_WRITEBACK); DEFINE_WAIT(wait); /* @@ -1023,7 +1023,8 @@ reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, current->flags & (PF_IO_WORKER|PF_KTHREAD)) return; - if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { + if (acct_writeback && + atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { WRITE_ONCE(pgdat->nr_reclaim_start, node_page_state(pgdat, NR_THROTTLED_WRITTEN)); } @@ -1031,7 +1032,9 @@ reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = schedule_timeout(timeout); finish_wait(wqh, &wait); - atomic_dec(&pgdat->nr_writeback_throttled); + + if (acct_writeback) + atomic_dec(&pgdat->nr_writeback_throttled); trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), jiffies_to_usecs(timeout - ret), @@ -2175,6 +2178,7 @@ static int 
too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; + bool too_many; if (current_is_kswapd()) return 0; @@ -2198,7 +2202,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; - return isolated > inactive; + too_many = isolated > inactive; + + /* Wake up tasks throttled due to too_many_isolated. */ + if (!too_many) + wake_throttle_isolated(pgdat); + + return too_many; } /* @@ -2307,8 +2317,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; /* wait a bit for the reclaimer. */ - msleep(100); stalled = true; + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) From 69392a403f49e6e33f9dfb1d6edb87c8006f83c2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:32 -0700 Subject: [PATCH 401/512] mm/vmscan: throttle reclaim when no progress is being made Memcg reclaim throttles on congestion if no reclaim progress is made. This makes little sense, it might be due to writeback or a host of other factors. For !memcg reclaim, it's messy. Direct reclaim primarily is throttled in the page allocator if it is failing to make progress. Kswapd throttles if too many pages are under writeback and marked for immediate reclaim. This patch explicitly throttles if reclaim is failing to make progress. [vbabka@suse.cz: Remove redundant code] Link: https://lkml.kernel.org/r/20211022144651.19914-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + include/trace/events/vmscan.h | 4 +++- mm/memcontrol.c | 10 +--------- mm/vmscan.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 312c1ea9aafa..58e744b78c2c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,7 @@ enum lru_list { enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, + VMSCAN_THROTTLE_NOPROGRESS, NR_VMSCAN_THROTTLE, }; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d4905bd9e9c4..f25a6149d3ba 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -29,11 +29,13 @@ #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) #define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) +#define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS) #define show_throttle_flags(flags) \ (flags) ? 
__print_flags(flags, "|", \ {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ - {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"} \ + {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \ + {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"} \ ) : "VMSCAN_THROTTLE_NONE" diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf0321d7a784..965b3cf7046b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3487,19 +3487,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { - int progress; - if (signal_pending(current)) return -EINTR; - progress = try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, true); - if (!progress) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) nr_retries--; - /* maybe some writeback is necessary */ - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - } return 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7bfd62f81e16..7d3fe5938e3b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3322,6 +3322,33 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } +static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) +{ + /* If reclaim is making progress, wake any throttled tasks. */ + if (sc->nr_reclaimed) { + wait_queue_head_t *wqh; + + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; + if (waitqueue_active(wqh)) + wake_up(wqh); + + return; + } + + /* + * Do not throttle kswapd on NOPROGRESS as it will throttle on + * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under + * writeback and marked for immediate reclaim at the tail of + * the LRU. + */ + if (current_is_kswapd()) + return; + + /* Throttle if making no progress at high prioities. */ + if (sc->priority < DEF_PRIORITY - 2) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10); +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -3406,6 +3433,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); + consider_reclaim_throttle(zone->zone_pgdat, sc); } /* From 8d58802fc9de1b416601d90da794a3feaad1898d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:35 -0700 Subject: [PATCH 402/512] mm/writeback: throttle based on page writeback instead of congestion do_writepages throttles on congestion if the writepages() fails due to a lack of memory but congestion_wait() is partially broken as the congestion state is not updated for all BDIs. This patch stalls waiting for a number of pages to complete writeback that located on the local node. The main weakness is that there is no correlation between the location of the inode's pages and locality but that is still better than congestion_wait. Link: https://lkml.kernel.org/r/20211022144651.19914-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . 
Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4812a17b288c..f34f54fcd5b4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2366,8 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) ret = generic_writepages(mapping, wbc); if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) break; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + + /* + * Lacking an allocation context or the locality or writeback + * state of any of the inode's pages, throttle based on + * writeback activity on the local node. It's as good a + * guess as any. + */ + reclaim_throttle(NODE_DATA(numa_node_id()), + VMSCAN_THROTTLE_WRITEBACK, HZ/50); } /* * Usually few pages are written by now from those we've just submitted From 132b0d21d21f14f74fbe44dd5b8b1848215fff09 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:38 -0700 Subject: [PATCH 403/512] mm/page_alloc: remove the throttling logic from the page allocator The page allocator stalls based on the number of pages that are waiting for writeback to start but this should now be redundant. shrink_inactive_list() will wake flusher threads if the LRU tail are unqueued dirty pages so the flusher should be active. If it fails to make progress due to pages under writeback not being completed quickly then it should stall on VMSCAN_THROTTLE_WRITEBACK. Link: https://lkml.kernel.org/r/20211022144651.19914-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f1f1f353211..0f74a66bed19 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4791,30 +4791,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { - /* - * If we didn't make any progress and have a lot of - * dirty + writeback pages then we should wait for - * an IO to complete to slow down the reclaim and - * prevent from pre mature OOM - */ - if (!did_some_progress) { - unsigned long write_pending; - - write_pending = zone_page_state_snapshot(zone, - NR_ZONE_WRITE_PENDING); - - if (2 * write_pending > reclaimable) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - return true; - } - } - ret = true; - goto out; + break; } } -out: /* * Memory allocation/reclaim might be called from a WQ context and the * current implementation of the WQ concurrency control doesn't From c3f4a9a2b082c5392fbff17c6d8551154add5fdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:42 -0700 Subject: [PATCH 404/512] mm/vmscan: centralise timeout values for reclaim_throttle Neil Brown raised concerns about callers of reclaim_throttle specifying a timeout value. The original timeout values to congestion_wait() were probably pulled out of thin air or copy&pasted from somewhere else. 
This patch centralises the timeout values and selects a timeout based on the reason for reclaim throttling. These figures are also pulled out of the same thin air but better values may be derived. Running a workload that is throttling for inappropriate periods and tracing mm_vmscan_throttled can be used to pick a more appropriate value. Excessive throttling would pick a lower timeout whereas excessive CPU usage in reclaim context would select a larger timeout. Ideally a large value would always be used and the wakeups would occur before a timeout but that requires careful testing. Link: https://lkml.kernel.org/r/20211022144651.19914-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- mm/internal.h | 3 +-- mm/page-writeback.c | 2 +- mm/vmscan.c | 50 +++++++++++++++++++++++++++------------ 4 files changed, 40 insertions(+), 17 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 7359093d8ac0..151b04c4dab3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -828,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (cc->mode == MIGRATE_ASYNC) return -EAGAIN; - reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); if (fatal_signal_pending(current)) return -EINTR; diff --git a/mm/internal.h b/mm/internal.h index 7dfe74f827bf..f3de3a2f3e30 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,8 +130,7 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, - long timeout); +extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f34f54fcd5b4..4b01a6872f9e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2374,7 +2374,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) * guess as any. */ reclaim_throttle(NODE_DATA(numa_node_id()), - VMSCAN_THROTTLE_WRITEBACK, HZ/50); + VMSCAN_THROTTLE_WRITEBACK); } /* * Usually few pages are written by now from those we've just submitted diff --git a/mm/vmscan.c b/mm/vmscan.c index 7d3fe5938e3b..599e5616b123 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1006,12 +1006,10 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, - long timeout) +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; - long ret; - bool acct_writeback = (reason == VMSCAN_THROTTLE_WRITEBACK); + long timeout, ret; DEFINE_WAIT(wait); /* @@ -1023,17 +1021,43 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, current->flags & (PF_IO_WORKER|PF_KTHREAD)) return; - if (acct_writeback && - atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { - WRITE_ONCE(pgdat->nr_reclaim_start, - node_page_state(pgdat, NR_THROTTLED_WRITTEN)); + /* + * These figures are pulled out of thin air. 
+ * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many + * parallel reclaimers which is a short-lived event so the timeout is + * short. Failing to make progress or waiting on writeback are + * potentially long-lived events so use a longer timeout. This is shaky + * logic as a failure to make progress could be due to anything from + * writeback to a slow device to excessive references pages at the tail + * of the inactive LRU. + */ + switch(reason) { + case VMSCAN_THROTTLE_WRITEBACK: + timeout = HZ/10; + + if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { + WRITE_ONCE(pgdat->nr_reclaim_start, + node_page_state(pgdat, NR_THROTTLED_WRITTEN)); + } + + break; + case VMSCAN_THROTTLE_NOPROGRESS: + timeout = HZ/10; + break; + case VMSCAN_THROTTLE_ISOLATED: + timeout = HZ/50; + break; + default: + WARN_ON_ONCE(1); + timeout = HZ; + break; } prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = schedule_timeout(timeout); finish_wait(wqh, &wait); - if (acct_writeback) + if (reason == VMSCAN_THROTTLE_WRITEBACK) atomic_dec(&pgdat->nr_writeback_throttled); trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), @@ -2318,7 +2342,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, /* wait a bit for the reclaimer. */ stalled = true; - reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) @@ -3250,7 +3274,7 @@ again: * until some pages complete writeback. */ if (sc->nr.immediate) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } /* @@ -3274,7 +3298,7 @@ again: if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc)) @@ -3346,7 +3370,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) /* Throttle if making no progress at high prioities. 
*/ if (sc->priority < DEF_PRIORITY - 2) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } /* From a19594ca4a8bfb5980de7bcc6ae6cbce8ff9680e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:45 -0700 Subject: [PATCH 405/512] mm/vmscan: increase the timeout if page reclaim is not making progress Tracing of the stutterp workload showed the following delays 1 usect_delayed=124000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=128000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=176000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=536000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=544000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=556000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=624000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=716000 reason=VMSCAN_THROTTLE_NOPROGRESS 1 usect_delayed=772000 reason=VMSCAN_THROTTLE_NOPROGRESS 2 usect_delayed=512000 reason=VMSCAN_THROTTLE_NOPROGRESS 16 usect_delayed=120000 reason=VMSCAN_THROTTLE_NOPROGRESS 53 usect_delayed=116000 reason=VMSCAN_THROTTLE_NOPROGRESS 116 usect_delayed=112000 reason=VMSCAN_THROTTLE_NOPROGRESS 5907 usect_delayed=108000 reason=VMSCAN_THROTTLE_NOPROGRESS 71741 usect_delayed=104000 reason=VMSCAN_THROTTLE_NOPROGRESS All the throttling hit the full timeout and then there was wakeup delays meaning that the wakeups are premature as no other reclaimer such as kswapd has made progress. This patch increases the maximum timeout. Link: https://lkml.kernel.org/r/20211022144651.19914-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 599e5616b123..9c125f793bf5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1042,7 +1042,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) break; case VMSCAN_THROTTLE_NOPROGRESS: - timeout = HZ/10; + timeout = HZ/2; break; case VMSCAN_THROTTLE_ISOLATED: timeout = HZ/50; From 66ce520bb7c22848bfdf3180d7e760a066dbcfbe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:49 -0700 Subject: [PATCH 406/512] mm/vmscan: delay waking of tasks throttled on NOPROGRESS Tracing indicates that tasks throttled on NOPROGRESS are woken prematurely resulting in occasional massive spikes in direct reclaim activity. This patch wakes tasks throttled on NOPROGRESS if reclaim efficiency is at least 12%. Link: https://lkml.kernel.org/r/20211022144651.19914-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . 
Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c125f793bf5..41f5f6007c30 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3348,8 +3348,11 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { - /* If reclaim is making progress, wake any throttled tasks. */ - if (sc->nr_reclaimed) { + /* + * If reclaim is making progress greater than 12% efficiency then + * wake all the NOPROGRESS throttled tasks. + */ + if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; From 7e6ec49c18988f1b8dab0677271dafde5f8d9a43 Mon Sep 17 00:00:00 2001 From: Yuanzheng Song Date: Fri, 5 Nov 2021 13:42:52 -0700 Subject: [PATCH 407/512] mm/vmpressure: fix data-race with memcg->socket_pressure When reading memcg->socket_pressure in mem_cgroup_under_socket_pressure() and writing memcg->socket_pressure in vmpressure() at the same time, the following data-race occurs: BUG: KCSAN: data-race in __sk_mem_reduce_allocated / vmpressure write to 0xffff8881286f4938 of 8 bytes by task 24550 on cpu 3: vmpressure+0x218/0x230 mm/vmpressure.c:307 shrink_node_memcgs+0x2b9/0x410 mm/vmscan.c:2658 shrink_node+0x9d2/0x11d0 mm/vmscan.c:2769 shrink_zones+0x29f/0x470 mm/vmscan.c:2972 do_try_to_free_pages+0x193/0x6e0 mm/vmscan.c:3027 try_to_free_mem_cgroup_pages+0x1c0/0x3f0 mm/vmscan.c:3345 reclaim_high mm/memcontrol.c:2440 [inline] mem_cgroup_handle_over_high+0x18b/0x4d0 mm/memcontrol.c:2624 tracehook_notify_resume include/linux/tracehook.h:197 [inline] exit_to_user_mode_loop kernel/entry/common.c:164 [inline] exit_to_user_mode_prepare+0x110/0x170 kernel/entry/common.c:191 syscall_exit_to_user_mode+0x16/0x30 kernel/entry/common.c:266 ret_from_fork+0x15/0x30 arch/x86/entry/entry_64.S:289 read to 0xffff8881286f4938 of 8 bytes by interrupt on cpu 1: mem_cgroup_under_socket_pressure include/linux/memcontrol.h:1483 [inline] sk_under_memory_pressure include/net/sock.h:1314 [inline] __sk_mem_reduce_allocated+0x1d2/0x270 net/core/sock.c:2696 __sk_mem_reclaim+0x44/0x50 net/core/sock.c:2711 sk_mem_reclaim include/net/sock.h:1490 [inline] ...... net_rx_action+0x17a/0x480 net/core/dev.c:6864 __do_softirq+0x12c/0x2af kernel/softirq.c:298 run_ksoftirqd+0x13/0x20 kernel/softirq.c:653 smpboot_thread_fn+0x33f/0x510 kernel/smpboot.c:165 kthread+0x1fc/0x220 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 Fix it by using READ_ONCE() and WRITE_ONCE() to read and write memcg->socket_pressure. 
Link: https://lkml.kernel.org/r/20211025082843.671690-1-songyuanzheng@huawei.com Signed-off-by: Yuanzheng Song Reviewed-by: Muchun Song Cc: Shakeel Butt Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- mm/vmpressure.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f207f98bdb76..9d96238d9c21 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1606,7 +1606,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { - if (time_before(jiffies, memcg->socket_pressure)) + if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 76518e4166dc..b52644771cc4 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -308,7 +308,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, * asserted for a second in which subsequent * pressure events can occur. */ - memcg->socket_pressure = jiffies + HZ; + WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); } } } From f7df2b1cf03af680354bd4300f48f7ea11316ce8 Mon Sep 17 00:00:00 2001 From: Zhenliang Wei Date: Fri, 5 Nov 2021 13:42:55 -0700 Subject: [PATCH 408/512] tools/vm/page_owner_sort.c: count and sort by mem When viewing page owner information, we may be more concerned about the total memory rather than the times of stack appears. Therefore, the following adjustments are made: 1. Added the statistics on the total number of pages. 2. Added the optional parameter "-m" to configure the program to sort by memory (total pages). The general output of page_owner is as follows: Page allocated via order XXX, ... PFN XXX ... // Detailed stack Page allocated via order XXX, ... PFN XXX ... // Detailed stack The original page_owner_sort ignores PFN rows, puts the remaining rows in buf, counts the times of buf, and finally sorts them according to the times. General output: XXX times: Page allocated via order XXX, ... // Detailed stack Now, we use regexp to extract the page order value from the buf, and count the total pages for the buf. General output: XXX times, XXX pages: Page allocated via order XXX, ... // Detailed stack By default, it is still sorted by the times of buf; If you want to sort by the pages nums of buf, use the new -m parameter. Link: https://lkml.kernel.org/r/1631678242-41033-1-git-send-email-weizhenliang@huawei.com Signed-off-by: Zhenliang Wei Cc: Tang Bin Cc: Zhang Shengju Cc: Zhenliang Wei Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_owner.rst | 23 +++++++- tools/vm/page_owner_sort.c | 96 +++++++++++++++++++++++++++++---- 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 2175465c9bf2..9837fc8147dd 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -85,5 +85,26 @@ Usage cat /sys/kernel/debug/page_owner > page_owner_full.txt ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + The general output of ``page_owner_full.txt`` is as follows: + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + Page allocated via order XXX, ... 
+ PFN XXX ... + // Detailed stack + + The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows + in buf, uses regexp to extract the page order value, counts the times + and pages of buf, and finally sorts them according to the times. + See the result about who allocated each page - in the ``sorted_page_owner.txt``. + in the ``sorted_page_owner.txt``. General output: + + XXX times, XXX pages: + Page allocated via order XXX, ... + // Detailed stack + + By default, ``page_owner_sort`` is sorted according to the times of buf. + If you want to sort by the pages nums of buf, use the ``-m`` parameter. diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 0e75f22c9475..9ebb84a9c731 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -5,6 +5,8 @@ * Example use: * cat /sys/kernel/debug/page_owner > page_owner_full.txt * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + * Or sort by total memory: + * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * * See Documentation/vm/page_owner.rst */ @@ -16,14 +18,18 @@ #include #include #include +#include +#include struct block_list { char *txt; int len; int num; + int page_num; }; - +static int sort_by_memory; +static regex_t order_pattern; static struct block_list *list; static int list_size; static int max_size; @@ -59,12 +65,50 @@ static int compare_num(const void *p1, const void *p2) return l2->num - l1->num; } +static int compare_page_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l2->page_num - l1->page_num; +} + +static int get_page_num(char *buf) +{ + int err, val_len, order_val; + char order_str[4] = {0}; + char *endptr; + regmatch_t pmatch[2]; + + err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL); + if (err != 0 || pmatch[1].rm_so == -1) { + printf("no order pattern in %s\n", buf); + return 0; + } + val_len = pmatch[1].rm_eo - pmatch[1].rm_so; + if (val_len > 2) /* max_order should not exceed 2 digits */ + goto wrong_order; + + memcpy(order_str, buf + pmatch[1].rm_so, val_len); + + errno = 0; + order_val = strtol(order_str, &endptr, 10); + if (errno != 0 || endptr == order_str || *endptr != '\0') + goto wrong_order; + + return 1 << order_val; + +wrong_order: + printf("wrong order in follow buf:\n%s\n", buf); + return 0; +} + static void add_list(char *buf, int len) { if (list_size != 0 && len == list[list_size-1].len && memcmp(buf, list[list_size-1].txt, len) == 0) { list[list_size-1].num++; + list[list_size-1].page_num += get_page_num(buf); return; } if (list_size == max_size) { @@ -74,6 +118,7 @@ static void add_list(char *buf, int len) list[list_size].txt = malloc(len+1); list[list_size].len = len; list[list_size].num = 1; + list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; list_size++; @@ -85,6 +130,13 @@ static void add_list(char *buf, int len) #define BUF_SIZE (128 * 1024) +static void usage(void) +{ + printf("Usage: ./page_owner_sort [-m] \n" + "-m Sort by total memory. 
If this option is unset, sort by times\n" + ); +} + int main(int argc, char **argv) { FILE *fin, *fout; @@ -92,18 +144,36 @@ int main(int argc, char **argv) int ret, i, count; struct block_list *list2; struct stat st; + int err; + int opt; - if (argc < 3) { - printf("Usage: ./program \n"); + while ((opt = getopt(argc, argv, "m")) != -1) + switch (opt) { + case 'm': + sort_by_memory = 1; + break; + default: + usage(); + exit(1); + } + + if (optind >= (argc - 1)) { + usage(); + exit(1); + } + + fin = fopen(argv[optind], "r"); + fout = fopen(argv[optind + 1], "w"); + if (!fin || !fout) { + usage(); perror("open: "); exit(1); } - fin = fopen(argv[1], "r"); - fout = fopen(argv[2], "w"); - if (!fin || !fout) { - printf("Usage: ./program \n"); - perror("open: "); + err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE); + if (err != 0 || order_pattern.re_nsub != 1) { + printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n", + argv[0], err); exit(1); } @@ -145,13 +215,19 @@ int main(int argc, char **argv) list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; + list2[count-1].page_num += list[i].page_num; } } - qsort(list2, count, sizeof(list[0]), compare_num); + if (sort_by_memory) + qsort(list2, count, sizeof(list[0]), compare_page_num); + else + qsort(list2, count, sizeof(list[0]), compare_num); for (i = 0; i < count; i++) - fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt); + fprintf(fout, "%d times, %d pages:\n%s\n", + list2[i].num, list2[i].page_num, list2[i].txt); + regfree(&order_pattern); return 0; } From a62f5ecbfb70645e1afa281202d989c68f050e0c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 5 Nov 2021 13:42:58 -0700 Subject: [PATCH 409/512] tools/vm/page-types.c: make walk_file() aware of address range option Patch series "tools/vm/page-types.c: a few improvements". This patchset adds some improvements on tools/vm/page-types.c. Patch 1/3 makes -a option (specify address range) work with -f (file cache mode). Patch 2/3 and 3/3 are to fix minor formatting issues of this tool. These would make life a little easier for the users of this tool. Please see individual patches for more details about specific issues. This patch (of 3): -a|--addr option is used to limit the range of address to be scanned for page status. It works now for physical address space (dafult mode) or for virtual address space (with -p option), but not for file address space (with -f option). So make walk_file() aware of -a option. 
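With walk_file() now honouring the address ranges, an invocation along the lines of ./page-types -f somefile.bin -a 0x10+0x20 -l (file name made up here; see the tool's usage output for the exact -a range syntax) restricts the file-cache walk to the requested page offsets instead of always scanning from offset zero to the end of the file.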
Link: https://lkml.kernel.org/r/20211004061325.1525902-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20211004061325.1525902-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Cc: Konstantin Khlebnikov Cc: Christian Hansen Cc: Changbin Du Cc: Bin Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index f62f10c988db..b14376af1f16 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -967,22 +967,19 @@ static struct sigaction sigbus_action = { .sa_flags = SA_SIGINFO, }; -static void walk_file(const char *name, const struct stat *st) +static void walk_file_range(const char *name, int fd, + unsigned long off, unsigned long end) { uint8_t vec[PAGEMAP_BATCH]; uint64_t buf[PAGEMAP_BATCH], flags; uint64_t cgroup = 0; uint64_t mapcnt = 0; unsigned long nr_pages, pfn, i; - off_t off, end = st->st_size; - int fd; ssize_t len; void *ptr; int first = 1; - fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); - - for (off = 0; off < end; off += len) { + for (; off < end; off += len) { nr_pages = (end - off + page_size - 1) / page_size; if (nr_pages > PAGEMAP_BATCH) nr_pages = PAGEMAP_BATCH; @@ -1043,6 +1040,21 @@ got_sigbus: flags, cgroup, mapcnt, buf[i]); } } +} + +static void walk_file(const char *name, const struct stat *st) +{ + int i; + int fd; + + fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); + + if (!nr_addr_ranges) + add_addr_range(0, st->st_size / page_size); + + for (i = 0; i < nr_addr_ranges; i++) + walk_file_range(name, fd, opt_offset[i] * page_size, + (opt_offset[i] + opt_size[i]) * page_size); close(fd); } From b76901db7b3d28a1ca6f8690642187c56cb50018 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 5 Nov 2021 13:43:01 -0700 Subject: [PATCH 410/512] tools/vm/page-types.c: move show_file() to summary output Currently file info from show_file() is printed out within page list like below, but this is inconvenient a little to utilize the page list from other scripts (maybe needs additional filtering). 
$ ./page-types -f page-types.c -l foffset offset len flags page-types.c Inode: 15108680 Size: 30953 (8 pages) Modify: Sat Oct 2 23:11:20 2021 (2399 seconds ago) Access: Sat Oct 2 23:11:28 2021 (2391 seconds ago) 0 d9f59e 1 ___U_lA____________________________________ 1 1031eb5 1 __RU_l_____________________________________ 2 13bf717 1 __RU_l_____________________________________ 3 13ac333 1 ___U_lA____________________________________ 4 d9f59f 1 __RU_l_____________________________________ 5 183fd49 1 ___U_lA____________________________________ 6 13cbf69 1 ___U_lA____________________________________ 7 d9ef05 1 ___U_lA____________________________________ flags page-count MB symbolic-flags long-symbolic-flags 0x000000000000002c 3 0 __RU_l_____________________________________ referenced,uptodate,lru 0x0000000000000068 5 0 ___U_lA____________________________________ uptodate,lru,active total 8 0 With this patch file info is printed out in summary part like below: $ ./page-types -f page-types.c -l foffset offset len flags 0 d9f59e 1 ___U_lA_____________________________________ 1 1031eb5 1 __RU_l______________________________________ 2 13bf717 1 __RU_l______________________________________ 3 13ac333 1 ___U_lA_____________________________________ 4 d9f59f 1 __RU_l______________________________________ 5 183fd49 1 ___U_lA_____________________________________ 6 13cbf69 1 ___U_lA_____________________________________ page-types.c Inode: 15108680 Size: 30953 (8 pages) Modify: Sat Oct 2 23:11:20 2021 (2435 seconds ago) Access: Sat Oct 2 23:11:28 2021 (2427 seconds ago) flags page-count MB symbolic-flags long-symbolic-flags 0x000000000000002c 3 0 __RU_l______________________________________ referenced,uptodate,lru 0x0000000000000068 4 0 ___U_lA_____________________________________ uptodate,lru,active total 7 0 Link: https://lkml.kernel.org/r/20211004061325.1525902-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Cc: Bin Wang Cc: Changbin Du Cc: Christian Hansen Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index b14376af1f16..fdb1891faf90 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -1034,7 +1034,6 @@ got_sigbus: if (first && opt_list) { first = 0; flush_page_range(); - show_file(name, st); } add_page(off / page_size + i, pfn, flags, cgroup, mapcnt, buf[i]); @@ -1074,10 +1073,10 @@ int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f) return 0; } +struct stat st; + static void walk_page_cache(void) { - struct stat st; - kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); sigaction(SIGBUS, &sigbus_action, NULL); @@ -1374,6 +1373,11 @@ int main(int argc, char *argv[]) if (opt_list) printf("\n\n"); + if (opt_file) { + show_file(opt_file, &st); + printf("\n"); + } + show_summary(); if (opt_list_mapcnt) From 41d4613b378cc10c087d8af621ace891d96d0907 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 5 Nov 2021 13:43:04 -0700 Subject: [PATCH 411/512] tools/vm/page-types.c: print file offset in hexadecimal In page list mode (with -l and -L option), virtual address and physical address are printed in hexadecimal, but file offset is not, which is confusing, so let's align it. 
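As a concrete illustration with a made-up value, the same file page offset printed both ways:

    unsigned long voffset = 26;	/* illustrative value only */

    printf("%lu\t", voffset);	/* old: prints "26" in decimal */
    printf("%lx\t", voffset);	/* new: prints "1a", consistent with the
    				 * hexadecimal virtual/physical columns */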
Link: https://lkml.kernel.org/r/20211004061325.1525902-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Cc: Bin Wang Cc: Changbin Du Cc: Christian Hansen Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index fdb1891faf90..b1ed76d9a979 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -390,7 +390,7 @@ static void show_page_range(unsigned long voffset, unsigned long offset, if (opt_pid) printf("%lx\t", voff); if (opt_file) - printf("%lu\t", voff); + printf("%lx\t", voff); if (opt_list_cgroup) printf("@%llu\t", (unsigned long long)cgroup0); if (opt_list_mapcnt) @@ -418,7 +418,7 @@ static void show_page(unsigned long voffset, unsigned long offset, if (opt_pid) printf("%lx\t", voffset); if (opt_file) - printf("%lu\t", voffset); + printf("%lx\t", voffset); if (opt_list_cgroup) printf("@%llu\t", (unsigned long long)cgroup); if (opt_list_mapcnt) From 5787ea5bed7607cbee26da492e574f94975e4d76 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:07 -0700 Subject: [PATCH 412/512] arch_numa: simplify numa_distance allocation Patch series "memblock: cleanup memblock_free interface", v2. This is the fix for memblock freeing APIs mismatch [1]. The first patch is a cleanup of numa_distance allocation in arch_numa I've spotted during the conversion. The second patch is a fix for Xen memory freeing on some of the error paths. [1] https://lore.kernel.org/all/CAHk-=wj9k4LZTz+svCxLYs5Y1=+yKrbAUArH1+ghyG3OLd8VVg@mail.gmail.com This patch (of 6): Memory allocation of numa_distance uses memblock_phys_alloc_range() without actual range limits, converts the returned physical address to virtual and then only uses the virtual address for further initialization. Simplify this by replacing memblock_phys_alloc_range() with memblock_alloc(). Link: https://lkml.kernel.org/r/20210930185031.18648-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210930185031.18648-2-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Christophe Leroy Cc: Juergen Gross Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/arch_numa.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 28222688ace7..a6491673dc1f 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -337,15 +337,13 @@ void __init numa_free_distance(void) static int __init numa_alloc_distance(void) { size_t size; - u64 phys; int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); - if (WARN_ON(!phys)) + numa_distance = memblock_alloc(size, PAGE_SIZE); + if (WARN_ON(!numa_distance)) return -ENOMEM; - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; /* fill with the default distances */ From c486514dd40980b2dbb0e15fabddfe1324ed0197 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:10 -0700 Subject: [PATCH 413/512] xen/x86: free_p2m_page: use memblock_free_ptr() to free a virtual pointer free_p2m_page() wrongly passes a virtual pointer to memblock_free() that treats it as a physical address. Call memblock_free_ptr() instead that gets a virtual address to free the memory. 
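To make the mismatch concrete, a short sketch of the two calling conventions as they exist at this point in the series (the allocation itself is only for illustration): memblock_free() interprets its first argument as a physical address, so a pointer from the linear map must be converted with __pa() first, whereas memblock_free_ptr() takes the virtual pointer directly.

    void *p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);	/* returns a virtual pointer */

    /* Buggy pattern being fixed: virtual pointer passed as a physical address */
    memblock_free((unsigned long)p, PAGE_SIZE);

    /* Correct alternatives */
    memblock_free(__pa(p), PAGE_SIZE);	/* convert to a physical address first */
    memblock_free_ptr(p, PAGE_SIZE);	/* or hand over the virtual pointer */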
Link: https://lkml.kernel.org/r/20210930185031.18648-3-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Juergen Gross Cc: Christophe Leroy Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5e6e236977c7..141bb9dbd2fb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free((unsigned long)p, PAGE_SIZE); + memblock_free_ptr(p, PAGE_SIZE); return; } From fa27717110ae51b9b9013ced0b5143888257bb79 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:13 -0700 Subject: [PATCH 414/512] memblock: drop memblock_free_early_nid() and memblock_free_early() memblock_free_early_nid() is unused and memblock_free_early() is an alias for memblock_free(). Replace calls to memblock_free_early() with calls to memblock_free() and remove memblock_free_early() and memblock_free_early_nid(). Link: https://lkml.kernel.org/r/20210930185031.18648-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Christophe Leroy Cc: Juergen Gross Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 +-- arch/s390/kernel/smp.c | 2 +- drivers/base/arch_numa.c | 2 +- drivers/s390/char/sclp_early.c | 2 +- include/linux/memblock.h | 12 ------------ kernel/dma/swiotlb.c | 2 +- lib/cpumask.c | 2 +- mm/percpu.c | 8 ++++---- mm/sparse.c | 2 +- 10 files changed, 12 insertions(+), 25 deletions(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 19347dc6bbf8..21a5a7ac0037 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(__pa(ptr), size); } void __init setup_per_cpu_areas(void) diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 87f001b4c4e4..f12229ce7301 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -56,8 +56,7 @@ void __init svm_swiotlb_init(void) return; - memblock_free_early(__pa(vstart), - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free(__pa(vstart), PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1a04e5bdf655..066efd6d9345 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -880,7 +880,7 @@ void __init smp_detect_cpus(void) /* Add CPUs present at boot */ __smp_rescan_cpus(info, true); - memblock_free_early((unsigned long)info, sizeof(*info)); + memblock_free((unsigned long)info, sizeof(*info)); } /* diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index a6491673dc1f..ade8934764f6 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -166,7 +166,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(__pa(ptr), size); } #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index f3d5c7f4c13d..f01d942e1c1d 100644 --- 
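The caller conversion is mechanical; for illustration only, it amounts to a semantic patch along these lines:

	@@
	expression base;
	expression size;
	@@
	- memblock_free_early(base, size);
	+ memblock_free(base, size);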
a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - memblock_free_early((unsigned long)sccb, length); + memblock_free((unsigned long)sccb, length); return rc; } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 34de69b3b8ba..fc8183be340c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -441,18 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } -static inline void memblock_free_early(phys_addr_t base, - phys_addr_t size) -{ - memblock_free(base, size); -} - -static inline void memblock_free_early_nid(phys_addr_t base, - phys_addr_t size, int nid) -{ - memblock_free(base, size); -} - static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) { __memblock_free_late(base, size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 87c40517e822..430d2f78d540 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -247,7 +247,7 @@ swiotlb_init(int verbose) return; fail_free_mem: - memblock_free_early(__pa(tlb), bytes); + memblock_free(__pa(tlb), bytes); fail: pr_warn("Cannot allocate buffer"); } diff --git a/lib/cpumask.c b/lib/cpumask.c index c3c76b833384..045779446a18 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - memblock_free_early(__pa(mask), cpumask_size()); + memblock_free(__pa(mask), cpumask_size()); } #endif diff --git a/mm/percpu.c b/mm/percpu.c index e0a986818903..f58318cb04c0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - memblock_free_early(__pa(ai), ai->__ai_size); + memblock_free(__pa(ai), ai->__ai_size); } /** @@ -3134,7 +3134,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - memblock_free_early(__pa(areas), areas_size); + memblock_free(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -3256,7 +3256,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - memblock_free_early(__pa(pages), pages_size); + memblock_free(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -3286,7 +3286,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(__pa(ptr), size); } void __init setup_per_cpu_areas(void) diff --git a/mm/sparse.c b/mm/sparse.c index 120bc8ea5293..55fea0c2f927 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata; static inline void __meminit sparse_buffer_free(unsigned long size) { WARN_ON(!sparsemap_buf || size == 0); - memblock_free_early(__pa(sparsemap_buf), size); + memblock_free(__pa(sparsemap_buf), size); } static void __init sparse_buffer_init(unsigned long size, int nid) From 621d973901cf9fa6c6e31b31bdd36c5c5f3c9c9e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:16 -0700 Subject: [PATCH 415/512] memblock: stop aliasing __memblock_free_late with memblock_free_late memblock_free_late() is a NOP wrapper for __memblock_free_late(), there is no point to keep this indirection. 
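For reference, the wrapper being removed (from include/linux/memblock.h, as in the hunk below) is nothing more than:

	static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
	{
		__memblock_free_late(base, size);
	}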
Drop the wrapper and rename __memblock_free_late() to memblock_free_late(). Link: https://lkml.kernel.org/r/20210930185031.18648-5-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Christophe Leroy Cc: Juergen Gross Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 7 +------ mm/memblock.c | 8 ++++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fc8183be340c..e25f964fdd60 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,7 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __memblock_free_late(phys_addr_t base, phys_addr_t size); +void memblock_free_late(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, @@ -441,11 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } -static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) -{ - __memblock_free_late(base, size); -} - /* * Set the allocation direction to bottom-up or top-down. */ diff --git a/mm/memblock.c b/mm/memblock.c index 5096500b2647..849060013d3c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -366,14 +366,14 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - __memblock_free_late(addr, size); + memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); - __memblock_free_late(addr, size); + memblock_free_late(addr, size); } memblock_memory = NULL; @@ -1589,7 +1589,7 @@ void * __init memblock_alloc_try_nid( } /** - * __memblock_free_late - free pages directly to buddy allocator + * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * @@ -1597,7 +1597,7 @@ void * __init memblock_alloc_try_nid( * down, but we are still initializing the system. Pages are released directly * to the buddy allocator. */ -void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +void __init memblock_free_late(phys_addr_t base, phys_addr_t size) { phys_addr_t cursor, end; From 3ecc68349bbab6bff1d12cbc7951ca6019b2faf6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:19 -0700 Subject: [PATCH 416/512] memblock: rename memblock_free to memblock_phys_free Since memblock_free() operates on a physical range, make its name reflect it and rename it to memblock_phys_free(), so it will be a logical counterpart to memblock_phys_alloc(). 
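After the rename, the physical-range allocation and freeing entry points pair up by name; roughly (prototypes abbreviated, and memblock_phys_alloc() is only the simplest of several allocation variants):

	phys_addr_t memblock_phys_alloc(phys_addr_t size, phys_addr_t align);
	int memblock_phys_free(phys_addr_t base, phys_addr_t size);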
The callers are updated with the below semantic patch: @@ expression addr; expression size; @@ - memblock_free(addr, size); + memblock_phys_free(addr, size); Link: https://lkml.kernel.org/r/20210930185031.18648-6-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Christophe Leroy Cc: Juergen Gross Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/core_irongate.c | 3 ++- arch/arc/mm/init.c | 2 +- arch/arm/mach-hisi/platmcpm.c | 2 +- arch/arm/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 4 ++-- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip30/ip30-setup.c | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 4 ++-- arch/powerpc/kernel/paca.c | 8 ++++---- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 ++- arch/riscv/kernel/setup.c | 5 +++-- arch/s390/kernel/setup.c | 8 ++++---- arch/s390/kernel/smp.c | 4 ++-- arch/s390/kernel/uv.c | 2 +- arch/s390/mm/kasan_init.c | 2 +- arch/sh/boards/mach-ap325rxa/setup.c | 2 +- arch/sh/boards/mach-ecovec24/setup.c | 4 ++-- arch/sh/boards/mach-kfr2r09/setup.c | 2 +- arch/sh/boards/mach-migor/setup.c | 2 +- arch/sh/boards/mach-se/7724/setup.c | 4 ++-- arch/sparc/kernel/smp_64.c | 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/mm/init.c | 2 +- arch/x86/xen/mmu_pv.c | 6 +++--- arch/x86/xen/setup.c | 6 +++--- drivers/base/arch_numa.c | 2 +- drivers/firmware/efi/memmap.c | 2 +- drivers/of/kexec.c | 3 +-- drivers/of/of_reserved_mem.c | 5 +++-- drivers/s390/char/sclp_early.c | 2 +- drivers/usb/early/xhci-dbc.c | 10 +++++----- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 2 +- init/initramfs.c | 2 +- kernel/dma/swiotlb.c | 2 +- lib/cpumask.c | 2 +- mm/cma.c | 2 +- mm/memblock.c | 8 ++++---- mm/memory_hotplug.c | 2 +- mm/percpu.c | 8 ++++---- mm/sparse.c | 2 +- 45 files changed, 79 insertions(+), 76 deletions(-) diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index 72af1e72d833..ee26dcc49418 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -233,7 +233,8 @@ albacore_init_arch(void) unsigned long size; size = initrd_end - initrd_start; - memblock_free(__pa(initrd_start), PAGE_ALIGN(size)); + memblock_phys_free(__pa(initrd_start), + PAGE_ALIGN(size)); if (!move_initrd(pci_mem)) printk("irongate_init_arch: initrd too big " "(%ldK)\ndisabling initrd\n", diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 699ecf119641..59408f6a02d4 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -173,7 +173,7 @@ static void __init highmem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; - memblock_free(high_mem_start, high_mem_sz); + memblock_phys_free(high_mem_start, high_mem_sz); for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++) free_highmem_page(pfn_to_page(tmp)); #endif diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c index 96a484095194..258586e31333 100644 --- a/arch/arm/mach-hisi/platmcpm.c +++ b/arch/arm/mach-hisi/platmcpm.c @@ -339,7 +339,7 @@ err_fabric: err_sysctrl: iounmap(relocation); err_reloc: - memblock_free(hip04_boot_method[0], hip04_boot_method[1]); + memblock_phys_free(hip04_boot_method[0], hip04_boot_method[1]); err: return ret; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 6162a070a410..6d0cb0f7bc54 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -158,7 +158,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, 
phys_addr_t align) panic("Failed to steal %pa bytes at %pS\n", &size, (void *)_RET_IP_); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); return phys; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index cfd9deb347c3..f68c2d953617 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -738,8 +738,8 @@ void __init paging_init(void) cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); init_mm.pgd = swapper_pg_dir; - memblock_free(__pa_symbol(init_pg_dir), - __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); + memblock_phys_free(__pa_symbol(init_pg_dir), + __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); memblock_allow_resize(); } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 21a5a7ac0037..3be1c29084fa 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } void __init setup_per_cpu_areas(void) diff --git a/arch/mips/sgi-ip30/ip30-setup.c b/arch/mips/sgi-ip30/ip30-setup.c index 44b1607e964d..75a34684e704 100644 --- a/arch/mips/sgi-ip30/ip30-setup.c +++ b/arch/mips/sgi-ip30/ip30-setup.c @@ -69,10 +69,10 @@ static void __init ip30_mem_init(void) total_mem += size; if (addr >= IP30_REAL_MEMORY_START) - memblock_free(addr, size); + memblock_phys_free(addr, size); else if ((addr + size) > IP30_REAL_MEMORY_START) - memblock_free(IP30_REAL_MEMORY_START, - size - IP30_MAX_PROM_MEMORY); + memblock_phys_free(IP30_REAL_MEMORY_START, + size - IP30_MAX_PROM_MEMORY); } pr_info("Detected %luMB of physical memory.\n", MEM_SHIFT(total_mem)); } diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 358aee7c2d79..42839d6bd486 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char cpufeatures_setup_finished(); - memblock_free(__pa(dt_cpu_features), - sizeof(struct dt_cpu_feature)*nr_dt_cpu_features); + memblock_phys_free(__pa(dt_cpu_features), + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); return 0; } diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 9bd30cac852b..4208b4044d12 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -322,8 +322,8 @@ void __init free_unused_pacas(void) new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; if (new_ptrs_size < paca_ptrs_size) - memblock_free(__pa(paca_ptrs) + new_ptrs_size, - paca_ptrs_size - new_ptrs_size); + memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size, + paca_ptrs_size - new_ptrs_size); paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = new_ptrs_size; @@ -331,8 +331,8 @@ void __init free_unused_pacas(void) #ifdef CONFIG_PPC_BOOK3S_64 if (early_radix_enabled()) { /* Ugly fixup, see new_slb_shadow() */ - memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), - sizeof(struct slb_shadow)); + memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), + sizeof(struct slb_shadow)); paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL; } #endif diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index b1e43b69a559..5af8993a8e6d 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -825,7 +825,7 @@ static void __init smp_setup_pacas(void) set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]); } 
- memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32)); + memblock_phys_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32)); cpu_to_phys_id = NULL; } #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index eaa79a0996d1..75bc294ac40d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } static int pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3dd35c327d1c..b5a9d343b720 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2981,7 +2981,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (!phb->hose) { pr_err(" Can't allocate PCI controller for %pOF\n", np); - memblock_free(__pa(phb), sizeof(struct pnv_phb)); + memblock_phys_free(__pa(phb), sizeof(struct pnv_phb)); return; } diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index f12229ce7301..b7c017bb40f7 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -56,7 +56,8 @@ void __init svm_swiotlb_init(void) return; - memblock_free(__pa(vstart), PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_phys_free(__pa(vstart), + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index b9620e5f00ba..6ea7c53b82cd 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -230,13 +230,14 @@ static void __init init_resources(void) /* Clean-up any unused pre-allocated resources */ if (res_idx >= 0) - memblock_free(__pa(mem_res), (res_idx + 1) * sizeof(*mem_res)); + memblock_phys_free(__pa(mem_res), + (res_idx + 1) * sizeof(*mem_res)); return; error: /* Better an empty resource tree than an inconsistent one */ release_child_resources(&iomem_resource); - memblock_free(__pa(mem_res), mem_res_sz); + memblock_phys_free(__pa(mem_res), mem_res_sz); } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 67e5fff96ee0..7fc836e9e194 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -693,7 +693,7 @@ static void __init reserve_crashkernel(void) } if (register_memory_notifier(&kdump_mem_nb)) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } @@ -748,7 +748,7 @@ static void __init free_mem_detect_info(void) get_mem_detect_reserved(&start, &size); if (size) - memblock_free(start, size); + memblock_phys_free(start, size); } static const char * __init get_mem_info_source(void) @@ -793,7 +793,7 @@ static void __init check_initrd(void) if (initrd_data.start && initrd_data.size && !memblock_is_region_memory(initrd_data.start, initrd_data.size)) { pr_err("The initial RAM disk does not fit into the memory\n"); - memblock_free(initrd_data.start, initrd_data.size); + memblock_phys_free(initrd_data.start, initrd_data.size); initrd_start = initrd_end = 0; } #endif @@ -890,7 +890,7 @@ static void __init setup_randomness(void) if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); - memblock_free((unsigned long) vmms, PAGE_SIZE); + 
memblock_phys_free((unsigned long)vmms, PAGE_SIZE); } /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 066efd6d9345..78a8ea6fd582 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void) /* Get the CPU registers */ smp_save_cpu_regs(sa, addr, is_boot_cpu, page); } - memblock_free(page, PAGE_SIZE); + memblock_phys_free(page, PAGE_SIZE); diag_amode31_ops.diag308_reset(); pcpu_set_smt(0); } @@ -880,7 +880,7 @@ void __init smp_detect_cpus(void) /* Add CPUs present at boot */ __smp_rescan_cpus(info, true); - memblock_free((unsigned long)info, sizeof(*info)); + memblock_phys_free((unsigned long)info, sizeof(*info)); } /* diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 5a656c7b7a67..d57457b16fe5 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -64,7 +64,7 @@ void __init setup_uv(void) } if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) { - memblock_free(uv_stor_base, uv_info.uv_base_stor_len); + memblock_phys_free(uv_stor_base, uv_info.uv_base_stor_len); goto fail; } diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 3e4735168019..483b9dbe0970 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -399,5 +399,5 @@ void __init kasan_copy_shadow_mapping(void) void __init kasan_free_early_identity(void) { - memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); + memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); } diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index bac8a058ebd7..c77b5f00a66a 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -560,7 +560,7 @@ static void __init ap325rxa_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu_dma_membase = phys; diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index bab91a99124e..2b22ce792147 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -1502,7 +1502,7 @@ static void __init ecovec_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU0 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; @@ -1510,7 +1510,7 @@ static void __init ecovec_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU1 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu1_dma_membase = phys; } diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index eeb5ce341efd..20f4db778ed6 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -633,7 +633,7 @@ static void __init kfr2r09_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu_dma_membase = phys; diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index 6703a2122c0d..f60061283c48 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -633,7 +633,7 @@ static void __init migor_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu_dma_membase = phys; diff 
--git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index 8d6541ba0186..8bbf5a6aa423 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -966,7 +966,7 @@ static void __init ms7724se_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU0 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; @@ -974,7 +974,7 @@ static void __init ms7724se_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU1 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu1_dma_membase = phys; } diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 0224d8f19ed6..2507549538df 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1567,7 +1567,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 8e636ce02949..d1710ebb44f4 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -47,7 +47,7 @@ void __init mem_init(void) */ brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); - memblock_free(__pa(brk_end), uml_reserved - brk_end); + memblock_phys_free(__pa(brk_end), uml_reserved - brk_end); uml_reserved = brk_end; /* this will put all low memory onto the freelists */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 40ed44ead063..49b596db5631 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -322,7 +322,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else @@ -521,7 +521,7 @@ static void __init reserve_crashkernel(void) } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 23a14d82e783..1895986842b9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_free(addr, PMD_SIZE); + memblock_phys_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 3359c23573c5..676d8d292f8a 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1025,7 +1025,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) make_lowmem_page_readwrite(vaddr); - memblock_free(paddr, size); + memblock_phys_free(paddr, size); } static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) @@ -1151,7 +1151,7 @@ static void __init xen_pagetable_p2m_free(void) xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(addr), size); + memblock_phys_free(__pa(addr), size); } else { xen_cleanmfnmap(addr); } @@ -1955,7 +1955,7 @@ void __init 
xen_relocate_p2m(void) pfn_end = p2m_pfn_end; } - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); while (pfn < pfn_end) { if (pfn == p2m_pfn) { pfn = p2m_pfn_end; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8bfc10330107..f387fc7e5250 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn, break; } } - memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); + memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -719,7 +719,7 @@ static void __init xen_reserve_xen_mfnlist(void) return; xen_relocate_p2m(); - memblock_free(start, size); + memblock_phys_free(start, size); } /** @@ -885,7 +885,7 @@ char * __init xen_memory_setup(void) xen_phys_memcpy(new_area, start, size); pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", start, start + size, new_area, new_area + size); - memblock_free(start, size); + memblock_phys_free(start, size); boot_params.hdr.ramdisk_image = new_area; boot_params.ext_ramdisk_image = new_area >> 32; } diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index ade8934764f6..712edef03929 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -166,7 +166,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 2ff1883dc788..4df55a55da84 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -35,7 +35,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags) if (slab_is_available()) memblock_free_late(phys, size); else - memblock_free(phys, size); + memblock_phys_free(phys, size); } else if (flags & EFI_MEMMAP_SLAB) { struct page *p = pfn_to_page(PHYS_PFN(phys)); unsigned int order = get_order(size); diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 053e241f593c..b9bd1cff1793 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -171,8 +171,7 @@ int ima_free_kexec_buffer(void) if (ret) return ret; - return memblock_free(addr, size); - + return memblock_phys_free(addr, size); } /** diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 9da8835ba5a5..9c0fb962c22b 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -46,7 +46,7 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, if (nomap) { err = memblock_mark_nomap(base, size); if (err) - memblock_free(base, size); + memblock_phys_free(base, size); kmemleak_ignore_phys(base); } @@ -284,7 +284,8 @@ void __init fdt_init_reserved_mem(void) if (nomap) memblock_clear_nomap(rmem->base, rmem->size); else - memblock_free(rmem->base, rmem->size); + memblock_phys_free(rmem->base, + rmem->size); } } } diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index f01d942e1c1d..c0052655fc4f 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - memblock_free((unsigned long)sccb, length); + memblock_phys_free((unsigned long)sccb, length); return rc; } diff --git a/drivers/usb/early/xhci-dbc.c 
b/drivers/usb/early/xhci-dbc.c index be4ecbabdd58..933d77ad0a64 100644 --- a/drivers/usb/early/xhci-dbc.c +++ b/drivers/usb/early/xhci-dbc.c @@ -185,7 +185,7 @@ static void __init xdbc_free_ring(struct xdbc_ring *ring) if (!seg) return; - memblock_free(seg->dma, PAGE_SIZE); + memblock_phys_free(seg->dma, PAGE_SIZE); ring->segment = NULL; } @@ -665,10 +665,10 @@ int __init early_xdbc_setup_hardware(void) xdbc_free_ring(&xdbc.in_ring); if (xdbc.table_dma) - memblock_free(xdbc.table_dma, PAGE_SIZE); + memblock_phys_free(xdbc.table_dma, PAGE_SIZE); if (xdbc.out_dma) - memblock_free(xdbc.out_dma, PAGE_SIZE); + memblock_phys_free(xdbc.out_dma, PAGE_SIZE); xdbc.table_base = NULL; xdbc.out_buf = NULL; @@ -987,8 +987,8 @@ free_and_quit: xdbc_free_ring(&xdbc.evt_ring); xdbc_free_ring(&xdbc.out_ring); xdbc_free_ring(&xdbc.in_ring); - memblock_free(xdbc.table_dma, PAGE_SIZE); - memblock_free(xdbc.out_dma, PAGE_SIZE); + memblock_phys_free(xdbc.table_dma, PAGE_SIZE); + memblock_phys_free(xdbc.out_dma, PAGE_SIZE); writel(0, &xdbc.xdbc_reg->control); early_iounmap(xdbc.xhci_base, xdbc.xhci_length); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index e56a5faac395..4b671cc0a7ea 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -241,7 +241,7 @@ retry: */ rc = xen_swiotlb_fixup(start, nslabs); if (rc) { - memblock_free(__pa(start), PAGE_ALIGN(bytes)); + memblock_phys_free(__pa(start), PAGE_ALIGN(bytes)); if (nslabs > 1024 && repeat--) { /* Min is 2MB */ nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e25f964fdd60..d32d41709513 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -103,7 +103,7 @@ void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); -int memblock_free(phys_addr_t base, phys_addr_t size); +int memblock_phys_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int memblock_physmem_add(phys_addr_t base, phys_addr_t size); diff --git a/init/initramfs.c b/init/initramfs.c index a842c0544745..1a971f070dd4 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -607,7 +607,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); - memblock_free(__pa(aligned_start), aligned_end - aligned_start); + memblock_phys_free(__pa(aligned_start), aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 430d2f78d540..b9fa173e5e56 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -247,7 +247,7 @@ swiotlb_init(int verbose) return; fail_free_mem: - memblock_free(__pa(tlb), bytes); + memblock_phys_free(__pa(tlb), bytes); fail: pr_warn("Cannot allocate buffer"); } diff --git a/lib/cpumask.c b/lib/cpumask.c index 045779446a18..a90786b77c1c 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - memblock_free(__pa(mask), cpumask_size()); + memblock_phys_free(__pa(mask), cpumask_size()); } #endif diff --git a/mm/cma.c b/mm/cma.c index 11152c3fb23c..bc9ca8f3c487 100644 --- 
a/mm/cma.c +++ b/mm/cma.c @@ -378,7 +378,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, return 0; free_mem: - memblock_free(base, size); + memblock_phys_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; diff --git a/mm/memblock.c b/mm/memblock.c index 849060013d3c..52e34abc4abe 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -806,18 +806,18 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) void __init_memblock memblock_free_ptr(void *ptr, size_t size) { if (ptr) - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } /** - * memblock_free - free boot memory block + * memblock_phys_free - free boot memory block * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ -int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; @@ -1937,7 +1937,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) * memmap array. */ if (pg < pgend) - memblock_free(pg, pgend - pg); + memblock_phys_free(pg, pgend - pg); } /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fd0be32a281..feffaa9423fe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -2204,7 +2204,7 @@ static int __ref try_remove_memory(u64 start, u64 size) arch_remove_memory(start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { - memblock_free(start, size); + memblock_phys_free(start, size); memblock_remove(start, size); } diff --git a/mm/percpu.c b/mm/percpu.c index f58318cb04c0..d65ddf6f2a35 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - memblock_free(__pa(ai), ai->__ai_size); + memblock_phys_free(__pa(ai), ai->__ai_size); } /** @@ -3134,7 +3134,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - memblock_free(__pa(areas), areas_size); + memblock_phys_free(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -3256,7 +3256,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - memblock_free(__pa(pages), pages_size); + memblock_phys_free(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -3286,7 +3286,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } void __init setup_per_cpu_areas(void) diff --git a/mm/sparse.c b/mm/sparse.c index 55fea0c2f927..fc3ab8d3b6bc 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata; static inline void __meminit sparse_buffer_free(unsigned long size) { WARN_ON(!sparsemap_buf || size == 0); - memblock_free(__pa(sparsemap_buf), size); + memblock_phys_free(__pa(sparsemap_buf), size); } static void __init sparse_buffer_init(unsigned long size, int nid) From 4421cca0a3e4833b3bf0f20de98eb580ab8c7290 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:22 -0700 Subject: [PATCH 417/512] memblock: use memblock_free for freeing virtual pointers Rename 
memblock_free_ptr() to memblock_free() and use memblock_free() when freeing a virtual pointer so that memblock_free() will be a counterpart of memblock_alloc() The callers are updated with the below semantic patch and manual addition of (void *) casting to pointers that are represented by unsigned long variables. @@ identifier vaddr; expression size; @@ ( - memblock_phys_free(__pa(vaddr), size); + memblock_free(vaddr, size); | - memblock_free_ptr(vaddr, size); + memblock_free(vaddr, size); ) [sfr@canb.auug.org.au: fixup] Link: https://lkml.kernel.org/r/20211018192940.3d1d532f@canb.auug.org.au Link: https://lkml.kernel.org/r/20210930185031.18648-7-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Stephen Rothwell Cc: Christophe Leroy Cc: Juergen Gross Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/core_irongate.c | 3 +-- arch/mips/mm/init.c | 2 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 4 ++-- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 +-- arch/riscv/kernel/setup.c | 5 ++--- arch/sparc/kernel/smp_64.c | 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/numa.c | 2 +- arch/x86/mm/numa_emulation.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- arch/x86/xen/p2m.c | 2 +- drivers/base/arch_numa.c | 4 ++-- drivers/macintosh/smu.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 2 +- init/initramfs.c | 2 +- init/main.c | 4 ++-- kernel/dma/swiotlb.c | 2 +- kernel/printk/printk.c | 4 ++-- lib/bootconfig.c | 2 +- lib/cpumask.c | 2 +- mm/memblock.c | 6 +++--- mm/percpu.c | 8 ++++---- mm/sparse.c | 2 +- 29 files changed, 40 insertions(+), 43 deletions(-) diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index ee26dcc49418..6b8ed12936b6 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -233,8 +233,7 @@ albacore_init_arch(void) unsigned long size; size = initrd_end - initrd_start; - memblock_phys_free(__pa(initrd_start), - PAGE_ALIGN(size)); + memblock_free((void *)initrd_start, PAGE_ALIGN(size)); if (!move_initrd(pci_mem)) printk("irongate_init_arch: initrd too big " "(%ldK)\ndisabling initrd\n", diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 3be1c29084fa..325e1552cbea 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } void __init setup_per_cpu_areas(void) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 42839d6bd486..ba527fb52993 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char cpufeatures_setup_finished(); - memblock_phys_free(__pa(dt_cpu_features), - sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); + memblock_free(dt_cpu_features, + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); return 0; } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 5af8993a8e6d..6b1338db8779 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -825,7 +825,7 @@ static void __init smp_setup_pacas(void) 
set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]); } - memblock_phys_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32)); + memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32)); cpu_to_phys_id = NULL; } #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 75bc294ac40d..1777e992b20b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } static int pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b5a9d343b720..004cd6a96c8a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2981,7 +2981,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (!phb->hose) { pr_err(" Can't allocate PCI controller for %pOF\n", np); - memblock_phys_free(__pa(phb), sizeof(struct pnv_phb)); + memblock_free(phb, sizeof(struct pnv_phb)); return; } diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index b7c017bb40f7..6332365d2891 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -56,8 +56,7 @@ void __init svm_swiotlb_init(void) return; - memblock_phys_free(__pa(vstart), - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 6ea7c53b82cd..b42bfdc67482 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -230,14 +230,13 @@ static void __init init_resources(void) /* Clean-up any unused pre-allocated resources */ if (res_idx >= 0) - memblock_phys_free(__pa(mem_res), - (res_idx + 1) * sizeof(*mem_res)); + memblock_free(mem_res, (res_idx + 1) * sizeof(*mem_res)); return; error: /* Better an empty resource tree than an inconsistent one */ release_child_resources(&iomem_resource); - memblock_phys_free(__pa(mem_res), mem_res_sz); + memblock_free(mem_res, mem_res_sz); } diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 2507549538df..b98a7bbe6728 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1567,7 +1567,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index d1710ebb44f4..0039771eb01c 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -47,7 +47,7 @@ void __init mem_init(void) */ brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); - memblock_phys_free(__pa(brk_end), uml_reserved - brk_end); + memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; /* this will put all low memory onto the freelists */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5afd98559193..7b65275544b2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init 
pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_ptr(ptr, size); + memblock_free(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index ef885370719a..e7b9b464a82f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PMD_SIZE); + memblock_free(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PUD_SIZE); + memblock_free(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1e9b93b088db..c6b1213086d6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free_ptr(numa_distance, size); + memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index e801e30089c4..1a02b791d273 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - memblock_free_ptr(phys_dist, phys_size); + memblock_free(phys_dist, phys_size); return; no_emu: diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 676d8d292f8a..173de1e29bda 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1151,7 +1151,7 @@ static void __init xen_pagetable_p2m_free(void) xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_phys_free(__pa(addr), size); + memblock_free((void *)addr, size); } else { xen_cleanmfnmap(addr); } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141bb9dbd2fb..58db86f7b384 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free_ptr(p, PAGE_SIZE); + memblock_free(p, PAGE_SIZE); return; } diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 712edef03929..bc1876915457 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -166,7 +166,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK @@ -326,7 +326,7 @@ void __init numa_free_distance(void) size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - memblock_free_ptr(numa_distance, size); + memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; } diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index fe63d5ee201b..f62152111236 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -570,7 +570,7 
@@ fail_msg_node: fail_db_node: of_node_put(smu->db_node); fail_bootmem: - memblock_free_ptr(smu, sizeof(struct smu_device)); + memblock_free(smu, sizeof(struct smu_device)); smu = NULL; fail_np: of_node_put(np); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 4b671cc0a7ea..f083194e2634 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -241,7 +241,7 @@ retry: */ rc = xen_swiotlb_fixup(start, nslabs); if (rc) { - memblock_phys_free(__pa(start), PAGE_ALIGN(bytes)); + memblock_free(start, PAGE_ALIGN(bytes)); if (nslabs > 1024 && repeat--) { /* Min is 2MB */ nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d32d41709513..484650681bee 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -118,7 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); -void memblock_free_ptr(void *ptr, size_t size); +void memblock_free(void *ptr, size_t size); void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); diff --git a/init/initramfs.c b/init/initramfs.c index 1a971f070dd4..2f3d96dc3db6 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -607,7 +607,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); - memblock_phys_free(__pa(aligned_start), aligned_end - aligned_start); + memblock_free((void *)aligned_start, aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, diff --git a/init/main.c b/init/main.c index 767ee2672176..f0001af8ebb9 100644 --- a/init/main.c +++ b/init/main.c @@ -382,7 +382,7 @@ static char * __init xbc_make_cmdline(const char *key) ret = xbc_snprint_cmdline(new_cmdline, len + 1, root); if (ret < 0 || ret > len) { pr_err("Failed to print extra kernel cmdline.\n"); - memblock_free_ptr(new_cmdline, len + 1); + memblock_free(new_cmdline, len + 1); return NULL; } @@ -925,7 +925,7 @@ static void __init print_unknown_bootoptions(void) end += sprintf(end, " %s", *p); pr_notice("Unknown command line parameters:%s\n", unknown_options); - memblock_free_ptr(unknown_options, len); + memblock_free(unknown_options, len); } asmlinkage __visible void __init __no_sanitize_address start_kernel(void) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b9fa173e5e56..02656d7ccbfd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -247,7 +247,7 @@ swiotlb_init(int verbose) return; fail_free_mem: - memblock_phys_free(__pa(tlb), bytes); + memblock_free(tlb, bytes); fail: pr_warn("Cannot allocate buffer"); } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a8d0a58deebc..2cae1bfa6be7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free_ptr(new_descs, new_descs_size); + memblock_free(new_descs, new_descs_size); err_free_log_buf: - memblock_free_ptr(new_log_buf, new_log_buf_len); + memblock_free(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 5ae248b29373..547558d80e64 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -792,7 +792,7 @@ void __init xbc_destroy_all(void) xbc_data = NULL; xbc_data_size = 0; 
xbc_node_num = 0; - memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); + memblock_free(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); xbc_nodes = NULL; brace_index = 0; } diff --git a/lib/cpumask.c b/lib/cpumask.c index a90786b77c1c..a971a82d2f43 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - memblock_phys_free(__pa(mask), cpumask_size()); + memblock_free(mask, cpumask_size()); } #endif diff --git a/mm/memblock.c b/mm/memblock.c index 52e34abc4abe..fb0c7f48e627 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free_ptr(old_array, old_alloc_size); + memblock_free(old_array, old_alloc_size); /* * Reserve the new array if that comes from the memblock. Otherwise, we @@ -796,14 +796,14 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) } /** - * memblock_free_ptr - free boot memory allocation + * memblock_free - free boot memory allocation * @ptr: starting address of the boot memory allocation * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ -void __init_memblock memblock_free_ptr(void *ptr, size_t size) +void __init_memblock memblock_free(void *ptr, size_t size) { if (ptr) memblock_phys_free(__pa(ptr), size); diff --git a/mm/percpu.c b/mm/percpu.c index d65ddf6f2a35..f5b2c2ea5a54 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - memblock_phys_free(__pa(ai), ai->__ai_size); + memblock_free(ai, ai->__ai_size); } /** @@ -3134,7 +3134,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - memblock_phys_free(__pa(areas), areas_size); + memblock_free(areas, areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -3256,7 +3256,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - memblock_phys_free(__pa(pages), pages_size); + memblock_free(pages, pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -3286,7 +3286,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } void __init setup_per_cpu_areas(void) diff --git a/mm/sparse.c b/mm/sparse.c index fc3ab8d3b6bc..e5c84b0cf0c9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata; static inline void __meminit sparse_buffer_free(unsigned long size) { WARN_ON(!sparsemap_buf || size == 0); - memblock_phys_free(__pa(sparsemap_buf), size); + memblock_free(sparsemap_buf, size); } static void __init sparse_buffer_init(unsigned long size, int nid) From 3723929eb0f50e2101de739cdb66458a4f1f4b27 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 5 Nov 2021 13:43:25 -0700 Subject: [PATCH 418/512] mm: mark the OOM reaper thread as freezable The OOM reaper alters user address space which might theoretically alter the snapshot if reaping is allowed to happen after the freezer quiescent state. 
To this end, the reaper kthread uses wait_event_freezable() while waiting for any work so that it cannot run while the system freezes. However, the current implementation doesn't respect the freezer because all kernel threads are created with the PF_NOFREEZE flag, so they are automatically excluded from freezing operations. This means that the OOM reaper can race with system snapshotting if it has work to do while the system is being frozen. Fix this by adding a set_freezable() call which will clear the PF_NOFREEZE flag and thus make the OOM reaper visible to the freezer. Please note that the OOM reaper altering the snapshot this way is mostly a theoretical concern and has not been observed in practice. Link: https://lkml.kernel.org/r/20210921165758.6154-1-sultan@kerneltoast.com Link: https://lkml.kernel.org/r/20210918233920.9174-1-sultan@kerneltoast.com Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Sultan Alsawaf Acked-by: Michal Hocko Cc: David Rientjes Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bfa9e348c3a3..d365cc84a486 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -641,6 +641,8 @@ done: static int oom_reaper(void *unused) { + set_freezable(); + while (true) { struct task_struct *tsk = NULL; From b5389086ad7be0453c55e0069a89856d1fbdf605 Mon Sep 17 00:00:00 2001 From: Zhenguo Yao Date: Fri, 5 Nov 2021 13:43:28 -0700 Subject: [PATCH 419/512] hugetlbfs: extend the definition of hugepages parameter to support node allocation We can specify the number of hugepages to allocate at boot. But the hugepages is balanced in all nodes at present. In some scenarios, we only need hugepages in one node. For example: DPDK needs hugepages which are in the same node as NIC. If DPDK needs four hugepages of 1G size in node1 and system has 16 numa nodes we must reserve 64 hugepages on the kernel cmdline. But only four hugepages are used. The others should be free after boot. If the system memory is low(for example: 64G), it will be an impossible task. So extend the hugepages parameter to support specifying hugepages on a specific node. For example add following parameter: hugepagesz=1G hugepages=0:1,1:3 It will allocate 1 hugepage in node0 and 3 hugepages in node1. Link: https://lkml.kernel.org/r/20211005054729.86457-1-yaozhenguo1@gmail.com Signed-off-by: Zhenguo Yao Reviewed-by: Mike Kravetz Cc: Zhenguo Yao Cc: Dan Carpenter Cc: Nathan Chancellor Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 8 +- Documentation/admin-guide/mm/hugetlbpage.rst | 12 +- arch/powerpc/mm/hugetlbpage.c | 9 +- include/linux/hugetlb.h | 6 +- mm/hugetlb.c | 153 +++++++++++++++--- 5 files changed, 155 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e7f7904edf20..02eeda2a11ad 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1601,9 +1601,11 @@ the number of pages of hugepagesz to be allocated. If this is the first HugeTLB parameter on the command line, it specifies the number of pages to allocate for - the default huge page size. See also - Documentation/admin-guide/mm/hugetlbpage.rst. 
- Format: + the default huge page size. If using node format, the + number of pages to allocate per-node can be specified. + See also Documentation/admin-guide/mm/hugetlbpage.rst. + Format: or (node format) + :[,:] hugepagesz= [HW] The size of the HugeTLB pages. This is used in diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index bb90de3885d1..0166f9de3428 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -128,7 +128,9 @@ hugepages implicitly specifies the number of huge pages of default size to allocate. If the number of huge pages of default size is implicitly specified, it can not be overwritten by a hugepagesz,hugepages - parameter pair for the default size. + parameter pair for the default size. This parameter also has a + node format. The node format specifies the number of huge pages + to allocate on specific nodes. For example, on an architecture with 2M default huge page size:: @@ -138,6 +140,14 @@ hugepages indicating that the hugepages=512 parameter is ignored. If a hugepages parameter is preceded by an invalid hugepagesz parameter, it will be ignored. + + Node format example:: + + hugepagesz=2M hugepages=0:1,1:2 + + It will allocate 1 2M hugepage on node0 and 2 2M hugepages on node1. + If the node number is invalid, the parameter will be ignored. + default_hugepagesz Specify the default huge page size. This parameter can only be specified once on the command line. default_hugepagesz can diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9a75ba078e1b..82d8b368ca6d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) m->hstate = hstate; return 1; } + +bool __init hugetlb_node_alloc_supported(void) +{ + return false; +} #endif -int __init alloc_bootmem_huge_page(struct hstate *h) +int __init alloc_bootmem_huge_page(struct hstate *h, int nid) { #ifdef CONFIG_PPC_BOOK3S_64 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) return pseries_alloc_bootmem_huge_page(h); #endif - return __alloc_bootmem_huge_page(h); + return __alloc_bootmem_huge_page(h, nid); } #ifndef CONFIG_PPC_BOOK3S_64 diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c0a20781b28e..44c2ab0dfa59 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -615,6 +615,7 @@ struct hstate { unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; + unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; @@ -647,8 +648,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); /* arch callback */ -int __init __alloc_bootmem_huge_page(struct hstate *h); -int __init alloc_bootmem_huge_page(struct hstate *h); +int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); +int __init alloc_bootmem_huge_page(struct hstate *h, int nid); +bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool __init arch_hugetlb_valid_size(unsigned long size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5eab627a170b..24543dab9c4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -77,6 +77,7 @@ static struct hstate * 
__initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; +static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -2963,33 +2964,39 @@ out_subpool_put: return ERR_PTR(-ENOSPC); } -int alloc_bootmem_huge_page(struct hstate *h) +int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); -int __alloc_bootmem_huge_page(struct hstate *h) +int __alloc_bootmem_huge_page(struct hstate *h, int nid) { - struct huge_bootmem_page *m; + struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node; + if (nid >= nr_online_nodes) + return 0; + /* do node specific alloc */ + if (nid != NUMA_NO_NODE) { + m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!m) + return 0; + goto found; + } + /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { - void *addr; - - addr = memblock_alloc_try_nid_raw( + m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); - if (addr) { - /* - * Use the beginning of the huge page to store the - * huge_bootmem_page struct (until gather_bootmem - * puts them into the mem_map). - */ - m = addr; - goto found; - } + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + if (!m) + return 0; + goto found; } - return 0; found: - BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -3029,12 +3036,61 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } +static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) +{ + unsigned long i; + char buf[32]; + + for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { + if (hstate_is_gigantic(h)) { + if (!alloc_bootmem_huge_page(h, nid)) + break; + } else { + struct page *page; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + + page = alloc_fresh_huge_page(h, gfp_mask, nid, + &node_states[N_MEMORY], NULL); + if (!page) + break; + put_page(page); /* free it into the hugepage allocator */ + } + cond_resched(); + } + if (i == h->max_huge_pages_node[nid]) + return; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %u of page size %s failed node%d. 
Only allocated %lu hugepages.\n", + h->max_huge_pages_node[nid], buf, nid, i); + h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); + h->max_huge_pages_node[nid] = i; +} static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; nodemask_t *node_alloc_noretry; + bool node_specific_alloc = false; + /* skip gigantic hugepages allocation if hugetlb_cma enabled */ + if (hstate_is_gigantic(h) && hugetlb_cma_size) { + pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); + return; + } + + /* do node specific alloc */ + for (i = 0; i < nr_online_nodes; i++) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + if (node_specific_alloc) + return; + + /* below will do all node balanced alloc */ if (!hstate_is_gigantic(h)) { /* * Bit mask controlling how hard we retry per-node allocations. @@ -3055,11 +3111,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { - if (hugetlb_cma_size) { - pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); - goto free; - } - if (!alloc_bootmem_huge_page(h)) + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY], @@ -3075,7 +3127,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } -free: kfree(node_alloc_noretry); } @@ -3990,6 +4041,10 @@ static int __init hugetlb_init(void) } default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + for (i = 0; i < nr_online_nodes; i++) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; } } @@ -4050,6 +4105,10 @@ void __init hugetlb_add_hstate(unsigned int order) parsed_hstate = h; } +bool __init __weak hugetlb_node_alloc_supported(void) +{ + return true; +} /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz @@ -4061,6 +4120,10 @@ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; + int node = NUMA_NO_NODE; + int count; + unsigned long tmp; + char *p = s; if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); @@ -4084,8 +4147,40 @@ static int __init hugepages_setup(char *s) return 0; } - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; + while (*p) { + count = 0; + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + /* Parameter is node format */ + if (p[count] == ':') { + if (!hugetlb_node_alloc_supported()) { + pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); + return 0; + } + node = tmp; + p += count + 1; + if (node < 0 || node >= nr_online_nodes) + goto invalid; + /* Parse hugepages */ + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + if (!hugetlb_max_hstate) + default_hugepages_in_node[node] = tmp; + else + parsed_hstate->max_huge_pages_node[node] = tmp; + *mhp += tmp; + /* Go to parse next node*/ + if (p[count] == ',') + p += count + 1; + else + break; + } else { + if (p != s) + goto invalid; + *mhp = tmp; + break; + } + } /* * Global state is always initialized later in hugetlb_init. 
@@ -4098,6 +4193,10 @@ static int __init hugepages_setup(char *s) last_mhp = mhp; return 1; + +invalid: + pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); + return 0; } __setup("hugepages=", hugepages_setup); @@ -4159,6 +4258,7 @@ __setup("hugepagesz=", hugepagesz_setup); static int __init default_hugepagesz_setup(char *s) { unsigned long size; + int i; parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { @@ -4187,6 +4287,9 @@ static int __init default_hugepagesz_setup(char *s) */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; + for (i = 0; i < nr_online_nodes; i++) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; if (hstate_is_gigantic(&default_hstate)) hugetlb_hstate_alloc_pages(&default_hstate); default_hstate_max_huge_pages = 0; From 8eb42beac8d369f5443addd469879d152422a28d Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 5 Nov 2021 13:43:32 -0700 Subject: [PATCH 420/512] mm/migrate: de-duplicate migrate_reason strings In order to remove the need to manually keep three different files in synch, provide a common definition of the mapping between enum migrate_reason, and the associated strings for each enum item. 1. Use the tracing system's mapping of enums to strings, by redefining and reusing the MIGRATE_REASON and supporting macros, and using that to populate the string array in mm/debug.c. 2. Move enum migrate_reason to migrate_mode.h. This is not strictly necessary for this patch, but migrate mode and migrate reason go together, so this will slightly clarify things. Link: https://lkml.kernel.org/r/20210922041755.141817-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Weizhao Ouyang Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 19 +------------------ include/linux/migrate_mode.h | 13 +++++++++++++ mm/debug.c | 20 +++++++++++--------- 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index c8077e936691..3d154fe03c96 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,24 +19,7 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 -/* - * Keep sync with: - * - macro MIGRATE_REASON in include/trace/events/migrate.h - * - migrate_reason_names[MR_TYPES] in mm/debug.c - */ -enum migrate_reason { - MR_COMPACTION, - MR_MEMORY_FAILURE, - MR_MEMORY_HOTPLUG, - MR_SYSCALL, /* also applies to cpusets */ - MR_MEMPOLICY_MBIND, - MR_NUMA_MISPLACED, - MR_CONTIG_RANGE, - MR_LONGTERM_PIN, - MR_DEMOTION, - MR_TYPES -}; - +/* Defined in mm/debug.c: */ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index 883c99249033..f37cc03f9369 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -19,4 +19,17 @@ enum migrate_mode { MIGRATE_SYNC_NO_COPY, }; +enum migrate_reason { + MR_COMPACTION, + MR_MEMORY_FAILURE, + MR_MEMORY_HOTPLUG, + MR_SYSCALL, /* also applies to cpusets */ + MR_MEMPOLICY_MBIND, + MR_NUMA_MISPLACED, + MR_CONTIG_RANGE, + MR_LONGTERM_PIN, + MR_DEMOTION, + MR_TYPES +}; + #endif /* MIGRATE_MODE_H_INCLUDED */ diff --git a/mm/debug.c b/mm/debug.c index fae0f81ad831..4333b6784a20 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -16,17 +16,19 @@ #include #include "internal.h" +#include + +/* + * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can + * be used to populate 
migrate_reason_names[]. + */ +#undef EM +#undef EMe +#define EM(a, b) b, +#define EMe(a, b) b const char *migrate_reason_names[MR_TYPES] = { - "compaction", - "memory_failure", - "memory_hotplug", - "syscall_or_cpuset", - "mempolicy_mbind", - "numa_misplaced", - "contig_range", - "longterm_pin", - "demotion", + MIGRATE_REASON }; const struct trace_print_flags pageflag_names[] = { From 20f9ba4f995247bb79e243741b8fdddbd76dd923 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 5 Nov 2021 13:43:35 -0700 Subject: [PATCH 421/512] mm: migrate: make demotion knob depend on migration The memory demotion needs to call migrate_pages() to do the jobs. And it is controlled by a knob, however, the knob doesn't depend on CONFIG_MIGRATION. The knob could be truned on even though MIGRATION is disabled, this will not cause any crash since migrate_pages() would just return -ENOSYS. But it is definitely not optimal to go through demotion path then retry regular swap every time. And it doesn't make too much sense to have the knob visible to the users when !MIGRATION. Move the related code from mempolicy.[h|c] to migrate.[h|c]. Link: https://lkml.kernel.org/r/20211015005559.246709-1-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: "Huang, Ying" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 --- include/linux/migrate.h | 4 +++ mm/mempolicy.c | 61 --------------------------------------- mm/migrate.c | 61 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 65 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 097bbbb37493..3c7595e81150 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -183,8 +183,6 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); -extern bool numa_demotion_enabled; - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); @@ -300,8 +298,6 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) return NULL; } -#define numa_demotion_enabled false - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3d154fe03c96..2d8130e05dc0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); + +extern bool numa_demotion_enabled; #else static inline void putback_movable_pages(struct list_head *l) {} @@ -65,6 +67,8 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } + +#define numa_demotion_enabled false #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_COMPACTION diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ce722333d9f6..f1080e0a566a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3057,64 +3057,3 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } - -bool numa_demotion_enabled = false; - -#ifdef CONFIG_SYSFS -static ssize_t numa_demotion_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%s\n", 
- numa_demotion_enabled? "true" : "false"); -} - -static ssize_t numa_demotion_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - numa_demotion_enabled = true; - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - numa_demotion_enabled = false; - else - return -EINVAL; - - return count; -} - -static struct kobj_attribute numa_demotion_enabled_attr = - __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, - numa_demotion_enabled_store); - -static struct attribute *numa_attrs[] = { - &numa_demotion_enabled_attr.attr, - NULL, -}; - -static const struct attribute_group numa_attr_group = { - .attrs = numa_attrs, -}; - -static int __init numa_init_sysfs(void) -{ - int err; - struct kobject *numa_kobj; - - numa_kobj = kobject_create_and_add("numa", mm_kobj); - if (!numa_kobj) { - pr_err("failed to create numa kobject\n"); - return -ENOMEM; - } - err = sysfs_create_group(numa_kobj, &numa_attr_group); - if (err) { - pr_err("failed to register numa group\n"); - goto delete_obj; - } - return 0; - -delete_obj: - kobject_put(numa_kobj); - return err; -} -subsys_initcall(numa_init_sysfs); -#endif diff --git a/mm/migrate.c b/mm/migrate.c index 1852d787e6ab..51e310a94516 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -3306,3 +3306,64 @@ static int __init migrate_on_reclaim_init(void) } late_initcall(migrate_on_reclaim_init); #endif /* CONFIG_HOTPLUG_CPU */ + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled ? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + numa_demotion_enabled = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + numa_demotion_enabled = false; + else + return -EINVAL; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif From 39cad8878a058070c79f3a824436de2de095672a Mon Sep 17 00:00:00 2001 From: "George G. Davis" Date: Fri, 5 Nov 2021 13:43:38 -0700 Subject: [PATCH 422/512] selftests/vm/transhuge-stress: fix ram size thinko When executing transhuge-stress with an argument to specify the virtual memory size for testing, the ram size is reported as 0, e.g. 
transhuge-stress 384 thp-mmap: allocate 192 transhuge pages, using 384 MiB virtual memory and 0 MiB of ram thp-mmap: 0.184 s/loop, 0.957 ms/page, 2090.265 MiB/s 192 succeed, 0 failed This appears to be due to a thinko in commit 0085d61fe05e ("selftests/vm/transhuge-stress: stress test for memory compaction"), where, at a guess, the intent was to base "xyz MiB of ram" on `ram` size. Here are results after using `ram` size: thp-mmap: allocate 192 transhuge pages, using 384 MiB virtual memory and 14 MiB of ram Link: https://lkml.kernel.org/r/20210825135843.29052-1-george_davis@mentor.com Fixes: 0085d61fe05e ("selftests/vm/transhuge-stress: stress test for memory compaction") Signed-off-by: George G. Davis Cc: Konstantin Khlebnikov Cc: Eugeniu Rosca Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/transhuge-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c index fd7f1b4a96f9..5e4c036f6ad3 100644 --- a/tools/testing/selftests/vm/transhuge-stress.c +++ b/tools/testing/selftests/vm/transhuge-stress.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, - len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); + ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) From 55fc0d91746759c71bc165bba62a2db64ac98e35 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Fri, 5 Nov 2021 13:43:41 -0700 Subject: [PATCH 423/512] mm, thp: lock filemap when truncating page cache Patch series "fix two bugs for file THP". This patch (of 2): Transparent huge page has supported read-only non-shmem files. The file- backed THP is collapsed by khugepaged and truncated when written (for shared libraries). However, there is a race when multiple writers truncate the same page cache concurrently. In that case, subpage(s) of file THP can be revealed by find_get_entry in truncate_inode_pages_range, which will trigger PageTail BUG_ON in truncate_inode_page, as follows: page:000000009e420ff2 refcount:1 mapcount:0 mapping:0000000000000000 index:0x7ff pfn:0x50c3ff head:0000000075ff816d order:9 compound_mapcount:0 compound_pincount:0 flags: 0x37fffe0000010815(locked|uptodate|lru|arch_1|head) raw: 37fffe0000000000 fffffe0013108001 dead000000000122 dead000000000400 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 head: 37fffe0000010815 fffffe001066bd48 ffff000404183c20 0000000000000000 head: 0000000000000600 0000000000000000 00000001ffffffff ffff000c0345a000 page dumped because: VM_BUG_ON_PAGE(PageTail(page)) ------------[ cut here ]------------ kernel BUG at mm/truncate.c:213! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: xfs(E) libcrc32c(E) rfkill(E) ... CPU: 14 PID: 11394 Comm: check_madvise_d Kdump: ... 
Hardware name: ECS, BIOS 0.0.0 02/06/2015 pstate: 60400005 (nZCv daif +PAN -UAO -TCO BTYPE=--) Call trace: truncate_inode_page+0x64/0x70 truncate_inode_pages_range+0x550/0x7e4 truncate_pagecache+0x58/0x80 do_dentry_open+0x1e4/0x3c0 vfs_open+0x38/0x44 do_open+0x1f0/0x310 path_openat+0x114/0x1dc do_filp_open+0x84/0x134 do_sys_openat2+0xbc/0x164 __arm64_sys_openat+0x74/0xc0 el0_svc_common.constprop.0+0x88/0x220 do_el0_svc+0x30/0xa0 el0_svc+0x20/0x30 el0_sync_handler+0x1a4/0x1b0 el0_sync+0x180/0x1c0 Code: aa0103e0 900061e1 910ec021 9400d300 (d4210000) This patch mainly to lock filemap when one enter truncate_pagecache(), avoiding truncating the same page cache concurrently. Link: https://lkml.kernel.org/r/20211025092134.18562-1-rongwei.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/20211025092134.18562-2-rongwei.wang@linux.alibaba.com Fixes: eb6ecbed0aa2 ("mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs") Signed-off-by: Xu Yu Signed-off-by: Rongwei Wang Suggested-by: Matthew Wilcox (Oracle) Tested-by: Song Liu Cc: Collin Fijalkovich Cc: Hugh Dickins Cc: Mike Kravetz Cc: William Kucharski Cc: Yang Shi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index daa324606a41..9ec3cfca3b1a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -856,8 +856,11 @@ static int do_dentry_open(struct file *f, * of THPs into the page cache will fail. */ smp_mb(); - if (filemap_nr_thps(inode->i_mapping)) + if (filemap_nr_thps(inode->i_mapping)) { + filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, 0); + filemap_invalidate_unlock(inode->i_mapping); + } } return 0; From 8468e937df1f31411d1e127fa38db064af051fe5 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Fri, 5 Nov 2021 13:43:44 -0700 Subject: [PATCH 424/512] mm, thp: fix incorrect unmap behavior for private pages When truncating pagecache on file THP, the private pages of a process should not be unmapped mapping. This incorrect behavior on a dynamic shared libraries which will cause related processes to happen core dump. A simple test for a DSO (Prerequisite is the DSO mapped in file THP): int main(int argc, char *argv[]) { int fd; fd = open(argv[1], O_WRONLY); if (fd < 0) { perror("open"); } close(fd); return 0; } The test only to open a target DSO, and do nothing. But this operation will lead one or more process to happen core dump. This patch mainly to fix this bug. Link: https://lkml.kernel.org/r/20211025092134.18562-3-rongwei.wang@linux.alibaba.com Fixes: eb6ecbed0aa2 ("mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs") Signed-off-by: Rongwei Wang Tested-by: Xu Yu Cc: Matthew Wilcox (Oracle) Cc: Song Liu Cc: William Kucharski Cc: Hugh Dickins Cc: Yang Shi Cc: Mike Kravetz Cc: Collin Fijalkovich Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 9ec3cfca3b1a..e0df1536eb69 100644 --- a/fs/open.c +++ b/fs/open.c @@ -857,8 +857,17 @@ static int do_dentry_open(struct file *f, */ smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { + struct address_space *mapping = inode->i_mapping; + filemap_invalidate_lock(inode->i_mapping); - truncate_pagecache(inode, 0); + /* + * unmap_mapping_range just need to be called once + * here, because the private pages is not need to be + * unmapped mapping (e.g. data segment of dynamic + * shared libraries here). 
+ */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } From fb25a77dde78dbcaa70c828ea8c2f7cf182510ae Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Fri, 5 Nov 2021 13:43:47 -0700 Subject: [PATCH 425/512] mm/readahead.c: fix incorrect comments for get_init_ra_size In fact, formated values returned by get_init_ra_size are not that intuitive. This patch make the comments reflect its truth. Link: https://lkml.kernel.org/r/20211019104812.135602-1-linf@wangsu.com Signed-off-by: Lin Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/readahead.c b/mm/readahead.c index 41b75d76d36e..88a84f254ed6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -309,7 +309,7 @@ void force_page_cache_ra(struct readahead_control *ractl, * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra - * 1-8 page = 32k initial, > 8 page = 128k initial + * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { From 916caa127cb2239989b2ec45e4288db61de01ed9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:43:50 -0700 Subject: [PATCH 426/512] mm: nommu: kill arch_get_unmapped_area() When nommu, the arch_get_unmapped_area() will not be called, just kill it. Link: https://lkml.kernel.org/r/20210910061906.36299-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 02d2427b8f9e..8943dc0e2132 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1639,12 +1639,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - return -ENOMEM; -} - vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); From e3820ab252dd9fef6af875553c49b8d02339421d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 5 Nov 2021 13:43:53 -0700 Subject: [PATCH 427/512] selftest/vm: fix ksm selftest to run with different NUMA topologies Platforms can have non-contiguous NUMA nodes like below #numactl -H available: 2 nodes (0,8) ..... node distances: node 0 8 0: 10 40 8: 40 10 #numactl -H available: 1 nodes (1) .... node distances: node 1 1: 10 Hence update the test to not assume the presence of Node 0 and 1 and also use numa_num_configured_nodes() instead of numa_max_node for finding whether to skip the test. 
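For reference, the node-discovery idea used here (walk node ids and pick only nodes that actually have memory) can be sketched with libnuma as below; first_node_with_memory() is a hypothetical helper for illustration, while the patch itself adds equivalent get_first_mem_node()/get_next_mem_node() helpers to ksm_tests.c:

	#include <numa.h>

	/* Return the first node id that actually has memory, or -1 if none. */
	static int first_node_with_memory(void)
	{
		int node, max_node = numa_max_node();

		for (node = 0; node <= max_node; node++)
			if (numa_node_size(node, NULL) > 0)
				return node;
		return -1;
	}
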
Link: https://lkml.kernel.org/r/20210914141414.350759-1-aneesh.kumar@linux.ibm.com Fixes: 82e717ad3501 ("selftests: vm: add KSM merging across nodes test") Signed-off-by: Aneesh Kumar K.V Reviewed-by: Pasha Tatashin Cc: Zhansaya Bagdauletkyzy Cc: Pavel Tatashin Cc: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 29 +++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index b61dcdb44c5b..0e142d25b031 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -354,12 +354,34 @@ err_out: return KSFT_FAIL; } +static int get_next_mem_node(int node) +{ + + long node_size; + int mem_node = 0; + int i, max_node = numa_max_node(); + + for (i = node + 1; i <= max_node + node; i++) { + mem_node = i % (max_node + 1); + node_size = numa_node_size(mem_node, NULL); + if (node_size > 0) + break; + } + return mem_node; +} + +static int get_first_mem_node(void) +{ + return get_next_mem_node(numa_max_node()); +} + static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, size_t page_size) { void *numa1_map_ptr, *numa2_map_ptr; struct timespec start_time; int page_count = 2; + int first_node; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { perror("clock_gettime"); @@ -370,7 +392,7 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a perror("NUMA support not enabled"); return KSFT_SKIP; } - if (numa_max_node() < 1) { + if (numa_num_configured_nodes() <= 1) { printf("At least 2 NUMA nodes must be available\n"); return KSFT_SKIP; } @@ -378,8 +400,9 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a return KSFT_FAIL; /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ - numa1_map_ptr = numa_alloc_onnode(page_size, 0); - numa2_map_ptr = numa_alloc_onnode(page_size, 1); + first_node = get_first_mem_node(); + numa1_map_ptr = numa_alloc_onnode(page_size, first_node); + numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); if (!numa1_map_ptr || !numa2_map_ptr) { perror("numa_alloc_onnode"); return KSFT_FAIL; From 325254899684adf32b95ae59000dec4a6853e930 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Fri, 5 Nov 2021 13:43:56 -0700 Subject: [PATCH 428/512] selftests: vm: add KSM huge pages merging time test Add test case of KSM merging time using mostly huge pages Link: https://lkml.kernel.org/r/20211013044045.360251-1-pedrodemargomes@gmail.com Signed-off-by: Pedro Demarchi Gomes Cc: Zhansaya Bagdauletkyzy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 125 ++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 0e142d25b031..1436e1a9a3d3 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include #include "../kselftest.h" #include "../../../../include/vdso/time64.h" @@ -18,6 +22,15 @@ #define KSM_MERGE_ACROSS_NODES_DEFAULT true #define MB (1ul << 20) +#define PAGE_SHIFT 12 +#define HPAGE_SHIFT 21 + +#define PAGE_SIZE (1 << PAGE_SHIFT) +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) 
(((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + struct ksm_sysfs { unsigned long max_page_sharing; unsigned long merge_across_nodes; @@ -34,6 +47,7 @@ enum ksm_test_name { CHECK_KSM_ZERO_PAGE_MERGE, CHECK_KSM_NUMA_MERGE, KSM_MERGE_TIME, + KSM_MERGE_TIME_HUGE_PAGES, KSM_COW_TIME }; @@ -99,6 +113,9 @@ static void print_help(void) " -U (page unmerging)\n" " -P evaluate merging time and speed.\n" " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" " must be provided using -s option\n" " -C evaluate the time required to break COW of merged pages.\n\n"); @@ -439,6 +456,101 @@ err_out: return KSFT_FAIL; } +int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr, *map_ptr_orig; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + int pagemap_fd, n_normal_pages, n_huge_pages; + + map_size *= MB; + size_t len = map_size; + + len -= len % HPAGE_SIZE; + map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; + + if (map_ptr_orig == MAP_FAILED) + err(2, "initial mmap"); + + if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + n_normal_pages = 0; + n_huge_pages = 0; + for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { + if (allocate_transhuge(p, pagemap_fd) < 0) + n_normal_pages++; + else + n_huge_pages++; + } + printf("Number of normal pages: %d\n", n_normal_pages); + printf("Number of huge pages: %d\n", n_huge_pages); + + memset(map_ptr, '*', len); + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_FAIL; +} + static int 
ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) { void *map_ptr; @@ -564,7 +676,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -618,6 +730,9 @@ int main(int argc, char *argv[]) case 'P': test_name = KSM_MERGE_TIME; break; + case 'H': + test_name = KSM_MERGE_TIME_HUGE_PAGES; + break; case 'C': test_name = KSM_COW_TIME; break; @@ -670,6 +785,14 @@ int main(int argc, char *argv[]) ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; + case KSM_MERGE_TIME_HUGE_PAGES: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; case KSM_COW_TIME: ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); From af1c31acc85325aec2bcc5941fb7a02e19143503 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 5 Nov 2021 13:43:59 -0700 Subject: [PATCH 429/512] mm/vmstat: annotate data race for zone->free_area[order].nr_free KCSAN reports a data-race on v5.10 which also exists on mainline: BUG: KCSAN: data-race in extfrag_for_order+0x33/0x2d0 race at unknown origin, with read to 0xffff9ee9bfffab48 of 8 bytes by task 34 on cpu 1: extfrag_for_order+0x33/0x2d0 kcompactd+0x5f0/0xce0 kthread+0x1f9/0x220 ret_from_fork+0x22/0x30 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 34 Comm: kcompactd0 Not tainted 5.10.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Access to zone->free_area[order].nr_free in extfrag_for_order() and frag_show_print() is lockless. That's intentional and the stats are a rough estimate anyway. Annotate them with data_race(). [liushixin2@huawei.com: add comments] Link: https://lkml.kernel.org/r/20210918084655.2696522-1-liushixin2@huawei.com Link: https://lkml.kernel.org/r/20210908015606.3999871-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: "Paul E . McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 0f4643e5324d..0fde7470368d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone, for (order = 0; order < MAX_ORDER; order++) { unsigned long blocks; - /* Count number of free blocks */ - blocks = zone->free_area[order].nr_free; + /* + * Count number of free blocks. + * + * Access to nr_free is lockless as nr_free is used only for + * diagnostic purposes. Use data_race to avoid KCSAN warning. + */ + blocks = data_race(zone->free_area[order].nr_free); info->free_blocks_total += blocks; /* Count free base pages */ @@ -1446,7 +1451,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + /* + * Access to nr_free is lockless as nr_free is used only for + * printing purposes. Use data_race to avoid KCSAN warning. 
+ */ + seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free)); seq_putc(m, '\n'); } From a997058679fb2bc244270c02c864595fc41f51d3 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Fri, 5 Nov 2021 13:44:02 -0700 Subject: [PATCH 430/512] mm: vmstat.c: make extfrag_index show more pretty fragmentation_index may return -1000 and the corresponding formated value showed by seq_printf will take a negative signatrue, but other positive formated values don't take a positive signatrue, so the output becomes unaligned. before: Node 0, zone DMA -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 Node 0, zone DMA32 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 Node 0, zone Normal -1.000 -1.000 -1.000 -1.000 0.931 0.966 0.983 0.992 0.996 0.998 0.999 after this patch: Node 0, zone DMA -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 Node 0, zone DMA32 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 Node 0, zone Normal -1.000 -1.000 -1.000 -1.000 0.931 0.966 0.983 0.992 0.996 0.998 0.999 Link: https://lkml.kernel.org/r/20211019103241.134797-1-linf@wangsu.com Signed-off-by: Lin Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 0fde7470368d..d701c335628c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2191,7 +2191,7 @@ static void extfrag_show_print(struct seq_file *m, for (order = 0; order < MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); - seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); } seq_putc(m, '\n'); From 39b2e5cae43dd05462125ff9024a0e0cf431e958 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:05 -0700 Subject: [PATCH 431/512] selftests/vm: make MADV_POPULATE_(READ|WRITE) use in-tree headers The madv_populate selftest currently builds with a warning when the local installed headers (via the distribution) don't include MADV_POPULATE_READ and MADV_POPULATE_WRITE. The warning is correct, because the test cannot locate the necessary header. The reason is that the in-tree installed headers (usr/include) have a "linux" instead of a "sys" subdirectory. Including "linux/mman.h" instead of "sys/mman.h" doesn't work (e.g., mmap() and madvise() are not defined that way). The only thing that seems to work is including "linux/mman.h" in addition to "sys/mman.h". We can get rid of our availability check and simplify. Link: https://lkml.kernel.org/r/20211015165758.41374-1-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/madv_populate.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c index b959e4ebdad4..3ee0e8275600 100644 --- a/tools/testing/selftests/vm/madv_populate.c +++ b/tools/testing/selftests/vm/madv_populate.c @@ -14,12 +14,11 @@ #include #include #include +#include #include #include "../kselftest.h" -#if defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) - /* * For now, we're using 2 MiB of private anonymous memory for all tests. 
*/ @@ -328,15 +327,3 @@ int main(int argc, char **argv) err, ksft_test_num()); return ksft_exit_pass(); } - -#else /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */ - -#warning "missing MADV_POPULATE_READ or MADV_POPULATE_WRITE definition" - -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_exit_skip("MADV_POPULATE_READ or MADV_POPULATE_WRITE not defined\n"); -} - -#endif /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */ From ac62554ba7060c8824d7821c1342673f1e13c31d Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Fri, 5 Nov 2021 13:44:08 -0700 Subject: [PATCH 432/512] mm/memory_hotplug: add static qualifier for online_policy_to_str() online_policy_to_str is only used in memory_hotplug.c and should be defined as static. Link: https://lkml.kernel.org/r/20210913024534.26161-1-tangyizhou@huawei.com Signed-off-by: Tang Yizhou Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index feffaa9423fe..afaae370b8cd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -57,7 +57,7 @@ enum { ONLINE_POLICY_AUTO_MOVABLE, }; -const char *online_policy_to_str[] = { +static const char * const online_policy_to_str[] = { [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", }; From d83fe3c99d270df6ae40ce21842af0448797f3e0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:11 -0700 Subject: [PATCH 433/512] memory-hotplug.rst: fix two instances of "movablecore" that should be "movable_node" Patch series "memory-hotplug.rst: document the "auto-movable" online policy". Now that the memory-hotplug.rst overhaul is upstream, proper documentation for the "auto-movable" online policy, documenting all new toggles and options. Along, two fixes for the original overhaul. This patch (of 3): We really want to refer to the "movable_node" kernel command line parameter here. Link: https://lkml.kernel.org/r/20210930144117.23641-2-david@redhat.com Fixes: ac3332c44767 ("memory-hotplug.rst: complete admin-guide overhaul") Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Jonathan Corbet Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/memory-hotplug.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 03dfbc925252..27d748cb6ee0 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -166,7 +166,7 @@ Or alternatively:: % echo 1 > /sys/devices/system/memory/memoryXXX/online The kernel will select the target zone automatically, usually defaulting to -``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel +``ZONE_NORMAL`` unless ``movable_node`` has been specified on the kernel command line or if the memory block would intersect the ZONE_MOVABLE already. One can explicitly request to associate an offline memory block with @@ -393,7 +393,7 @@ command line parameters are relevant: ======================== ======================================================= ``memhp_default_state`` configure auto-onlining by essentially setting ``/sys/devices/system/memory/auto_online_blocks``. 
-``movablecore`` configure automatic zone selection of the kernel. When +``movable_node`` configure automatic zone selection in the kernel. When set, the kernel will default to ZONE_MOVABLE, unless other zones can be kept contiguous. ======================== ======================================================= From a8db400f997c6e9ff505fb965a91fdb8427d366f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:14 -0700 Subject: [PATCH 434/512] memory-hotplug.rst: fix wrong /sys/module/memory_hotplug/parameters/ path We accidentially added a superfluous "s". Link: https://lkml.kernel.org/r/20210930144117.23641-3-david@redhat.com Fixes: ac3332c44767 ("memory-hotplug.rst: complete admin-guide overhaul") Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Jonathan Corbet Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/memory-hotplug.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 27d748cb6ee0..ee00b70dedde 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -410,7 +410,7 @@ them with ``memory_hotplug.`` such as:: and they can be observed (and some even modified at runtime) via:: - /sys/modules/memory_hotplug/parameters/ + /sys/module/memory_hotplug/parameters/ The following module parameters are currently defined: From 9e122cc1bdc4e8c8e2a97060ab85f9ee657a01a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:17 -0700 Subject: [PATCH 435/512] memory-hotplug.rst: document the "auto-movable" online policy Commit e83a437faa62 ("mm/memory_hotplug: introduce "auto-movable" online policy") introduced a new memory online policy to automatically select a zone for memory blocks to be onlined. It added a way to set the active online policy and tunables for the auto-movable online policy. Follow-up commits tweaked the "auto-movable" policy to also consider memory device details when selecting zones for memory blocks to be onlined. Let's document the new toggles and how the two online policies we have work. [david@redhat.com: updates] Link: https://lkml.kernel.org/r/20211011082058.6076-4-david@redhat.com Link: https://lkml.kernel.org/r/20210930144117.23641-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Jonathan Corbet Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/mm/memory-hotplug.rst | 139 +++++++++++++++--- 1 file changed, 120 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index ee00b70dedde..0f56ecd8ac05 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -165,9 +165,8 @@ Or alternatively:: % echo 1 > /sys/devices/system/memory/memoryXXX/online -The kernel will select the target zone automatically, usually defaulting to -``ZONE_NORMAL`` unless ``movable_node`` has been specified on the kernel -command line or if the memory block would intersect the ZONE_MOVABLE already. +The kernel will select the target zone automatically, depending on the +configured ``online_policy``. 
One can explicitly request to associate an offline memory block with ZONE_MOVABLE by:: @@ -198,6 +197,9 @@ Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or % echo online > /sys/devices/system/memory/auto_online_blocks +Similarly to manual onlining, with ``online`` the kernel will select the +target zone automatically, depending on the configured ``online_policy``. + Modifying the auto-online behavior will only affect all subsequently added memory blocks only. @@ -393,11 +395,16 @@ command line parameters are relevant: ======================== ======================================================= ``memhp_default_state`` configure auto-onlining by essentially setting ``/sys/devices/system/memory/auto_online_blocks``. -``movable_node`` configure automatic zone selection in the kernel. When - set, the kernel will default to ZONE_MOVABLE, unless - other zones can be kept contiguous. +``movable_node`` configure automatic zone selection in the kernel when + using the ``contig-zones`` online policy. When + set, the kernel will default to ZONE_MOVABLE when + onlining a memory block, unless other zones can be kept + contiguous. ======================== ======================================================= +See Documentation/admin-guide/kernel-parameters.txt for a more generic +description of these command line parameters. + Module Parameters ------------------ @@ -414,20 +421,114 @@ and they can be observed (and some even modified at runtime) via:: The following module parameters are currently defined: -======================== ======================================================= -``memmap_on_memory`` read-write: Allocate memory for the memmap from the - added memory block itself. Even if enabled, actual - support depends on various other system properties and - should only be regarded as a hint whether the behavior - would be desired. +================================ =============================================== +``memmap_on_memory`` read-write: Allocate memory for the memmap from + the added memory block itself. Even if enabled, + actual support depends on various other system + properties and should only be regarded as a + hint whether the behavior would be desired. - While allocating the memmap from the memory block - itself makes memory hotplug less likely to fail and - keeps the memmap on the same NUMA node in any case, it - can fragment physical memory in a way that huge pages - in bigger granularity cannot be formed on hotplugged - memory. -======================== ======================================================= + While allocating the memmap from the memory + block itself makes memory hotplug less likely + to fail and keeps the memmap on the same NUMA + node in any case, it can fragment physical + memory in a way that huge pages in bigger + granularity cannot be formed on hotplugged + memory. +``online_policy`` read-write: Set the basic policy used for + automatic zone selection when onlining memory + blocks without specifying a target zone. + ``contig-zones`` has been the kernel default + before this parameter was added. After an + online policy was configured and memory was + online, the policy should not be changed + anymore. + + When set to ``contig-zones``, the kernel will + try keeping zones contiguous. If a memory block + intersects multiple zones or no zone, the + behavior depends on the ``movable_node`` kernel + command line parameter: default to ZONE_MOVABLE + if set, default to the applicable kernel zone + (usually ZONE_NORMAL) if not set. 
+ + When set to ``auto-movable``, the kernel will + try onlining memory blocks to ZONE_MOVABLE if + possible according to the configuration and + memory device details. With this policy, one + can avoid zone imbalances when eventually + hotplugging a lot of memory later and still + wanting to be able to hotunplug as much as + possible reliably, very desirable in + virtualized environments. This policy ignores + the ``movable_node`` kernel command line + parameter and isn't really applicable in + environments that require it (e.g., bare metal + with hotunpluggable nodes) where hotplugged + memory might be exposed via the + firmware-provided memory map early during boot + to the system instead of getting detected, + added and onlined later during boot (such as + done by virtio-mem or by some hypervisors + implementing emulated DIMMs). As one example, a + hotplugged DIMM will be onlined either + completely to ZONE_MOVABLE or completely to + ZONE_NORMAL, not a mixture. + As another example, as many memory blocks + belonging to a virtio-mem device will be + onlined to ZONE_MOVABLE as possible, + special-casing units of memory blocks that can + only get hotunplugged together. *This policy + does not protect from setups that are + problematic with ZONE_MOVABLE and does not + change the zone of memory blocks dynamically + after they were onlined.* +``auto_movable_ratio`` read-write: Set the maximum MOVABLE:KERNEL + memory ratio in % for the ``auto-movable`` + online policy. Whether the ratio applies only + for the system across all NUMA nodes or also + per NUMA nodes depends on the + ``auto_movable_numa_aware`` configuration. + + All accounting is based on present memory pages + in the zones combined with accounting per + memory device. Memory dedicated to the CMA + allocator is accounted as MOVABLE, although + residing on one of the kernel zones. The + possible ratio depends on the actual workload. + The kernel default is "301" %, for example, + allowing for hotplugging 24 GiB to a 8 GiB VM + and automatically onlining all hotplugged + memory to ZONE_MOVABLE in many setups. The + additional 1% deals with some pages being not + present, for example, because of some firmware + allocations. + + Note that ZONE_NORMAL memory provided by one + memory device does not allow for more + ZONE_MOVABLE memory for a different memory + device. As one example, onlining memory of a + hotplugged DIMM to ZONE_NORMAL will not allow + for another hotplugged DIMM to get onlined to + ZONE_MOVABLE automatically. In contrast, memory + hotplugged by a virtio-mem device that got + onlined to ZONE_NORMAL will allow for more + ZONE_MOVABLE memory within *the same* + virtio-mem device. +``auto_movable_numa_aware`` read-write: Configure whether the + ``auto_movable_ratio`` in the ``auto-movable`` + online policy also applies per NUMA + node in addition to the whole system across all + NUMA nodes. The kernel default is "Y". + + Disabling NUMA awareness can be helpful when + dealing with NUMA nodes that should be + completely hotunpluggable, onlining the memory + completely to ZONE_MOVABLE automatically if + possible. + + Parameter availability depends on CONFIG_NUMA. 
+================================ =============================================== ZONE_MOVABLE ============ From 71b6f2dda824d044242145db8dc10c2037242899 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:20 -0700 Subject: [PATCH 436/512] mm/memory_hotplug: remove CONFIG_X86_64_ACPI_NUMA dependency from CONFIG_MEMORY_HOTPLUG Patch series "mm/memory_hotplug: Kconfig and 32 bit cleanups". Some cleanups around CONFIG_MEMORY_HOTPLUG, including removing 32 bit leftovers of memory hotplug support. This patch (of 6): SPARSEMEM is the only possible memory model for x86-64, FLATMEM is not possible: config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA And X86_64_ACPI_NUMA (obviously) only supports x86-64: config X86_64_ACPI_NUMA def_bool y depends on X86_64 && NUMA && ACPI && PCI Let's just remove the CONFIG_X86_64_ACPI_NUMA dependency, as it does no longer make sense. Link: https://lkml.kernel.org/r/20210929143600.49379-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Jonathan Corbet Cc: Alex Shi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Shuah Khan Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 9f1e0098522c..b2bf73c90a38 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -123,7 +123,7 @@ config ARCH_ENABLE_MEMORY_HOTPLUG config MEMORY_HOTPLUG bool "Allow for memory hot-add" select MEMORY_ISOLATION - depends on SPARSEMEM || X86_64_ACPI_NUMA + depends on SPARSEMEM depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on 64BIT || BROKEN select NUMA_KEEP_MEMINFO if NUMA From 50f9481ed9fb8a2d2a06a155634c7f9eeff9fa61 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:24 -0700 Subject: [PATCH 437/512] mm/memory_hotplug: remove CONFIG_MEMORY_HOTPLUG_SPARSE CONFIG_MEMORY_HOTPLUG depends on CONFIG_SPARSEMEM, so there is no need for CONFIG_MEMORY_HOTPLUG_SPARSE anymore; adjust all instances to use CONFIG_MEMORY_HOTPLUG and remove CONFIG_MEMORY_HOTPLUG_SPARSE. Link: https://lkml.kernel.org/r/20210929143600.49379-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Shuah Khan [kselftest] Acked-by: Greg Kroah-Hartman Acked-by: Oscar Salvador Cc: Alex Shi Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jonathan Corbet Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/machdep.h | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/platforms/powernv/setup.c | 4 ++-- arch/powerpc/platforms/pseries/setup.c | 2 +- drivers/base/Makefile | 2 +- drivers/base/node.c | 9 ++++--- drivers/virtio/Kconfig | 2 +- include/linux/memory.h | 24 ++++++++----------- include/linux/node.h | 4 ++-- lib/Kconfig.debug | 2 +- mm/Kconfig | 4 ---- mm/memory_hotplug.c | 2 -- tools/testing/selftests/memory-hotplug/config | 1 - 13 files changed, 24 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 764f2732a821..d8a2ca007082 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -32,7 +32,7 @@ struct machdep_calls { void (*iommu_save)(void); void (*iommu_restore)(void); #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG unsigned long (*memory_block_size)(void); #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1777e992b20b..6052f5d5ded3 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -912,7 +912,7 @@ void __init setup_per_cpu_areas(void) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG unsigned long memory_block_size_bytes(void) { if (ppc_md.memory_block_size) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index a8db3f153063..ad56a54ac9c5 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -440,7 +440,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) } #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static unsigned long pnv_memory_block_size(void) { /* @@ -553,7 +553,7 @@ define_machine(powernv) { #ifdef CONFIG_KEXEC_CORE .kexec_cpu_down = pnv_kexec_cpu_down, #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG .memory_block_size = pnv_memory_block_size, #endif }; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f79126f16258..d29f6f1f7f37 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -1089,7 +1089,7 @@ define_machine(pseries) { .machine_kexec = pSeries_machine_kexec, .kexec_cpu_down = pseries_kexec_cpu_down, #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG .memory_block_size = pseries_memory_block_size, #endif }; diff --git a/drivers/base/Makefile b/drivers/base/Makefile index ef8e44a7d288..02f7f1358e86 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -13,7 +13,7 @@ obj-y += power/ obj-$(CONFIG_ISA_BUS_API) += isa.o obj-y += firmware_loader/ obj-$(CONFIG_NUMA) += node.o -obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_MODULES) += module.o endif diff --git a/drivers/base/node.c b/drivers/base/node.c index c56d34f8158f..b5a4ba18f9f9 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -629,7 +629,7 @@ static void node_device_release(struct device *dev) { struct node *node = to_node(dev); -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) /* * We schedule the work only when a memory section is * onlined/offlined on 
this node. When we come here, @@ -782,7 +782,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) return 0; } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static int __ref get_nid_for_pfn(unsigned long pfn) { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -958,10 +958,9 @@ static int node_memory_callback(struct notifier_block *self, return NOTIFY_OK; } #endif /* CONFIG_HUGETLBFS */ -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \ - !defined(CONFIG_HUGETLBFS) +#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) static inline int node_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index ce1b3f6ec325..3654def9915c 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -98,7 +98,7 @@ config VIRTIO_MEM default m depends on X86_64 depends on VIRTIO - depends on MEMORY_HOTPLUG_SPARSE + depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on CONTIG_ALLOC help diff --git a/include/linux/memory.h b/include/linux/memory.h index 053a530c7bdd..0328ec039c38 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -110,7 +110,7 @@ struct mem_section; #define SLAB_CALLBACK_PRI 1 #define IPC_CALLBACK_PRI 10 -#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) { return; @@ -126,7 +126,14 @@ static inline int memory_notify(unsigned long val, void *v) { return 0; } -#else +static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) +{ + return 0; +} +/* These aren't inline functions due to a GCC bug. */ +#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) +#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) +#else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, @@ -148,9 +155,6 @@ struct memory_group *memory_group_find_by_id(int mgid); typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, struct memory_group *excluded, void *arg); -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ - -#ifdef CONFIG_MEMORY_HOTPLUG #define hotplug_memory_notifier(fn, pri) ({ \ static __meminitdata struct notifier_block fn##_mem_nb =\ { .notifier_call = fn, .priority = pri };\ @@ -158,15 +162,7 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, }) #define register_hotmemory_notifier(nb) register_memory_notifier(nb) #define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) -#else -static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) -{ - return 0; -} -/* These aren't inline functions due to a GCC bug. */ -#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) -#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) -#endif +#endif /* CONFIG_MEMORY_HOTPLUG */ /* * Kernel text modification mutex, used for code patching. 
Users of this lock diff --git a/include/linux/node.h b/include/linux/node.h index 8e5a29897936..bb21fd631b16 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -85,7 +85,7 @@ struct node { struct device dev; struct list_head access_list; -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) struct work_struct node_work; #endif #ifdef CONFIG_HMEM_REPORTING @@ -98,7 +98,7 @@ struct memory_block; extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, enum meminit_context context); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2a9b6dcdac4f..669fee1d26b8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -877,7 +877,7 @@ config DEBUG_MEMORY_INIT config MEMORY_NOTIFIER_ERROR_INJECT tristate "Memory hotplug notifier error injection module" - depends on MEMORY_HOTPLUG_SPARSE && NOTIFIER_ERROR_INJECTION + depends on MEMORY_HOTPLUG && NOTIFIER_ERROR_INJECTION help This option provides the ability to inject artificial errors to memory hotplug notifier chain callbacks. It is controlled through diff --git a/mm/Kconfig b/mm/Kconfig index b2bf73c90a38..0148a9c4fa2a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -128,10 +128,6 @@ config MEMORY_HOTPLUG depends on 64BIT || BROKEN select NUMA_KEEP_MEMINFO if NUMA -config MEMORY_HOTPLUG_SPARSE - def_bool y - depends on SPARSEMEM && MEMORY_HOTPLUG - config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" depends on MEMORY_HOTPLUG diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index afaae370b8cd..fc07ce7b5842 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -220,7 +220,6 @@ static void release_memory_resource(struct resource *res) kfree(res); } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, const char *reason) { @@ -1163,7 +1162,6 @@ failed_addition: mem_hotplug_done(); return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static void reset_node_present_pages(pg_data_t *pgdat) { diff --git a/tools/testing/selftests/memory-hotplug/config b/tools/testing/selftests/memory-hotplug/config index a7e8cd5bb265..1eef042a31e1 100644 --- a/tools/testing/selftests/memory-hotplug/config +++ b/tools/testing/selftests/memory-hotplug/config @@ -1,5 +1,4 @@ CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y CONFIG_NOTIFIER_ERROR_INJECTION=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_MEMORY_HOTREMOVE=y From 7ec58a2b941ed88986694d037e38012738323171 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:28 -0700 Subject: [PATCH 438/512] mm/memory_hotplug: restrict CONFIG_MEMORY_HOTPLUG to 64 bit 32 bit support is broken in various ways: for example, we can online memory that should actually go to ZONE_HIGHMEM to ZONE_MOVABLE or in some cases even to one of the other kernel zones. We marked it BROKEN in commit b59d02ed0869 ("mm/memory_hotplug: disable the functionality for 32b") almost one year ago. According to that commit it might be broken at least since 2017. Further, there is hardly a sane use case nowadays. Let's just depend completely on 64bit, dropping the "BROKEN" dependency to make clear that we are not going to support it again. 
Next, we'll remove some HIGHMEM leftovers from memory hotplug code to clean up. Link: https://lkml.kernel.org/r/20210929143600.49379-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alex Shi Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jonathan Corbet Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 0148a9c4fa2a..ae1f151c2924 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -125,7 +125,7 @@ config MEMORY_HOTPLUG select MEMORY_ISOLATION depends on SPARSEMEM depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on 64BIT || BROKEN + depends on 64BIT select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_DEFAULT_ONLINE From 6b740c6c3aa371cd70ac07f8d071f2a8af28c51c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:31 -0700 Subject: [PATCH 439/512] mm/memory_hotplug: remove HIGHMEM leftovers We don't support CONFIG_MEMORY_HOTPLUG on 32 bit and consequently not HIGHMEM. Let's remove any leftover code -- including the unused "status_change_nid_high" field part of the memory notifier. Link: https://lkml.kernel.org/r/20210929143600.49379-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alex Shi Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jonathan Corbet Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/memory-hotplug.rst | 3 -- .../zh_CN/core-api/memory-hotplug.rst | 4 --- include/linux/memory.h | 1 - mm/memory_hotplug.c | 36 ++----------------- 4 files changed, 2 insertions(+), 42 deletions(-) diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst index de7467e48067..682259ee633a 100644 --- a/Documentation/core-api/memory-hotplug.rst +++ b/Documentation/core-api/memory-hotplug.rst @@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify:: unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; - int status_change_nid_high; int status_change_nid; } @@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify:: - nr_pages is # of pages of online/offline memory. - status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask is (will be) set/clear, if this is -1, then nodemask status is not changed. -- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask - is (will be) set/clear, if this is -1, then nodemask status is not changed. - status_change_nid is set node id when N_MEMORY of nodemask is (will be) set/clear. It means a new(memoryless) node gets new memory by online and a node loses all memory. If this is -1, then nodemask status is not changed. 
diff --git a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst index 161f4d2c18cc..9a204eb196f2 100644 --- a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst +++ b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst @@ -63,7 +63,6 @@ memory_notify结构体的指针:: unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; - int status_change_nid_high; int status_change_nid; } @@ -74,9 +73,6 @@ memory_notify结构体的指针:: - status_change_nid_normal是当nodemask的N_NORMAL_MEMORY被设置/清除时设置节 点id,如果是-1,则nodemask状态不改变。 -- status_change_nid_high是当nodemask的N_HIGH_MEMORY被设置/清除时设置的节点 - id,如果这个值为-1,那么nodemask状态不会改变。 - - status_change_nid是当nodemask的N_MEMORY被(将)设置/清除时设置的节点id。这 意味着一个新的(没上线的)节点通过联机获得新的内存,而一个节点失去了所有的内 存。如果这个值为-1,那么nodemask的状态就不会改变。 diff --git a/include/linux/memory.h b/include/linux/memory.h index 0328ec039c38..88eb587b5143 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,7 +96,6 @@ struct memory_notify { unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; - int status_change_nid_high; int status_change_nid; }; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc07ce7b5842..0aa7ca3dfbc9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -585,10 +584,6 @@ void generic_online_page(struct page *page, unsigned int order) debug_pagealloc_map_pages(page, 1 << order); __free_pages_core(page, order); totalram_pages_add(1UL << order); -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages_add(1UL << order); -#endif } EXPORT_SYMBOL_GPL(generic_online_page); @@ -625,16 +620,11 @@ static void node_states_check_changes_online(unsigned long nr_pages, arg->status_change_nid = NUMA_NO_NODE; arg->status_change_nid_normal = NUMA_NO_NODE; - arg->status_change_nid_high = NUMA_NO_NODE; if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; -#ifdef CONFIG_HIGHMEM - if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) - arg->status_change_nid_high = nid; -#endif } static void node_states_set_node(int node, struct memory_notify *arg) @@ -642,9 +632,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_set_state(node, N_NORMAL_MEMORY); - if (arg->status_change_nid_high >= 0) - node_set_state(node, N_HIGH_MEMORY); - if (arg->status_change_nid >= 0) node_set_state(node, N_MEMORY); } @@ -1801,7 +1788,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages, arg->status_change_nid = NUMA_NO_NODE; arg->status_change_nid_normal = NUMA_NO_NODE; - arg->status_change_nid_high = NUMA_NO_NODE; /* * Check whether node_states[N_NORMAL_MEMORY] will be changed. @@ -1816,24 +1802,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages, if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); -#ifdef CONFIG_HIGHMEM /* - * node_states[N_HIGH_MEMORY] contains nodes which - * have normal memory or high memory. - * Here we add the present_pages belonging to ZONE_HIGHMEM. - * If the zone is within the range of [0..ZONE_HIGHMEM), and - * we determine that the zones in that range become empty, - * we need to clear the node for N_HIGH_MEMORY. 
- */ - present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; - if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) - arg->status_change_nid_high = zone_to_nid(zone); -#endif - - /* - * We have accounted the pages from [0..ZONE_NORMAL), and - * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM - * as well. + * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM + * does not apply as we don't support 32bit. * Here we count the possible pages from ZONE_MOVABLE. * If after having accounted all the pages, we see that the nr_pages * to be offlined is over or equal to the accounted pages, @@ -1851,9 +1822,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if (arg->status_change_nid_high >= 0) - node_clear_state(node, N_HIGH_MEMORY); - if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } From 43e3aa2a3247c9ca348884866a9846d788ec5ed6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:35 -0700 Subject: [PATCH 440/512] mm/memory_hotplug: remove stale function declarations These functions no longer exist. Link: https://lkml.kernel.org/r/20210929143600.49379-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alex Shi Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jonathan Corbet Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e5a867c950b2..be48e003a518 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -98,9 +98,6 @@ static inline void zone_seqlock_init(struct zone *zone) { seqlock_init(&zone->span_seqlock); } -extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); -extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); -extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); extern void adjust_present_page_count(struct page *page, struct memory_group *group, long nr_pages); From 5c11f00b09c10340fcc363b332f6622d22d895ef Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:39 -0700 Subject: [PATCH 441/512] x86: remove memory hotplug support on X86_32 CONFIG_MEMORY_HOTPLUG was marked BROKEN over one year and we just restricted it to 64 bit. Let's remove the unused x86 32bit implementation and simplify the Kconfig. Link: https://lkml.kernel.org/r/20210929143600.49379-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alex Shi Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jonathan Corbet Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Shuah Khan Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 6 +++--- arch/x86/mm/init_32.c | 31 ------------------------------- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d9830e7e1060..b2fb68da6697 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,7 +62,7 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION - select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) + select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE @@ -1614,7 +1614,7 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_MEMORY_PROBE bool "Enable sysfs memory/probe interface" - depends on X86_64 && MEMORY_HOTPLUG + depends on MEMORY_HOTPLUG help This option enables a sysfs memory/probe interface for testing. See Documentation/admin-guide/mm/memory-hotplug.rst for more information. @@ -2394,7 +2394,7 @@ endmenu config ARCH_HAS_ADD_PAGES def_bool y - depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd90b8fe81e4..5cd7ea6d645c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -779,37 +779,6 @@ void __init mem_init(void) test_wp_bit(); } -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_params *params) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - int ret; - - /* - * The page tables were already mapped at boot so if the caller - * requests a different mapping type then we must change all the - * pages with __set_memory_prot(). - */ - if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) { - ret = __set_memory_prot(start, nr_pages, params->pgprot); - if (ret) - return ret; - } - - return __add_pages(nid, start_pfn, nr_pages, params); -} - -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - __remove_pages(start_pfn, nr_pages, altmap); -} -#endif - int kernel_set_to_readonly __read_mostly; static void mark_nxdata_nx(void) From 53d38316ab2017a7c0d733765b521700aa357ec9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:42 -0700 Subject: [PATCH 442/512] mm/memory_hotplug: handle memblock_add_node() failures in add_memory_resource() Patch series "mm/memory_hotplug: full support for add_memory_driver_managed() with CONFIG_ARCH_KEEP_MEMBLOCK", v2. Architectures that require CONFIG_ARCH_KEEP_MEMBLOCK=y, such as arm64, don't cleanly support add_memory_driver_managed() yet. Most prominently, kexec_file can still end up placing kexec images on such driver-managed memory, resulting in undesired behavior, for example, having kexec images located on memory not part of the firmware-provided memory map. Teaching kexec to not place images on driver-managed memory is especially relevant for virtio-mem. Details can be found in commit 7b7b27214bba ("mm/memory_hotplug: introduce add_memory_driver_managed()"). Extend memblock with a new flag and set it from memory hotplug code when applicable. 
This is required to fully support virtio-mem on arm64, making also kexec_file behave like on x86-64. This patch (of 2): If memblock_add_node() fails, we're most probably running out of memory. While this is unlikely to happen, it can happen and having memory added without a memblock can be problematic for architectures that use memblock to detect valid memory. Let's fail in a nice way instead of silently ignoring the error. Link: https://lkml.kernel.org/r/20211004093605.5830-1-david@redhat.com Link: https://lkml.kernel.org/r/20211004093605.5830-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Mike Rapoport Cc: Michal Hocko Cc: Oscar Salvador Cc: Jianyong Wu Cc: "Aneesh Kumar K . V" Cc: Vineet Gupta Cc: Geert Uytterhoeven Cc: Huacai Chen Cc: Jiaxun Yang Cc: Thomas Bogendoerfer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Eric Biederman Cc: Arnd Bergmann Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0aa7ca3dfbc9..9d254e88221e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1369,8 +1369,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) mem_hotplug_begin(); - if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + ret = memblock_add_node(start, size, nid); + if (ret) + goto error_mem_hotplug_end; + } ret = __try_online_node(nid, false); if (ret < 0) @@ -1443,6 +1446,7 @@ error: rollback_node_hotadd(nid); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); +error_mem_hotplug_end: mem_hotplug_done(); return ret; } From e14b41556d9ec20af60a9f672ac6f64dd450763c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:46 -0700 Subject: [PATCH 443/512] memblock: improve MEMBLOCK_HOTPLUG documentation The description of MEMBLOCK_HOTPLUG is currently short and consequently misleading: we're actually dealing with a memory region that might get hotunplugged later (i.e., the platform+firmware supports it), yet it is indicated in the firmware-provided memory map as system ram that will just get used by the system for any purpose when not taking special care. The firmware marked this memory region as a hot(un)plugged (e.g., hotplugged before reboot), implying that it might get hotunplugged again later. Whether we consider this information depends on the "movable_node" kernel commandline parameter: only with "movable_node" set, we'll try keeping this memory hotunpluggable, for example, by not serving early allocations from this memory region and by letting the buddy manage it using the ZONE_MOVABLE. Let's make this clearer by extending the documentation. Note: kexec *has to* indicate this memory to the second kernel. With "movable_node" set, we don't want to place kexec-images on this memory. Without "movable_node" set, we don't care and can place kexec-images on this memory. In both cases, after successful memory hotunplug, kexec has to be re-armed to update the memory map for the second kernel and to place the kexec-images somewhere else. Link: https://lkml.kernel.org/r/20211004093605.5830-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: "Aneesh Kumar K . 
V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Eric Biederman Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Huacai Chen Cc: Jianyong Wu Cc: Jiaxun Yang Cc: Michal Hocko Cc: Oscar Salvador Cc: Shahab Vahedi Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 484650681bee..1d2b3e902a84 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -28,7 +28,11 @@ extern unsigned long long max_possible_pfn; /** * enum memblock_flags - definition of memory region attributes * @MEMBLOCK_NONE: no special request - * @MEMBLOCK_HOTPLUG: hotpluggable region + * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory + * map during early boot as hot(un)pluggable system RAM (e.g., memory range + * that might get hotunplugged later). With "movable_node" set on the kernel + * commandline, try keeping this memory region hotunpluggable. Does not apply + * to memblocks added ("hotplugged") after early boot. * @MEMBLOCK_MIRROR: mirrored region * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as * reserved in the memory map; refer to memblock_mark_nomap() description From 952eea9b01e4bbb7011329f1b7240844e61e5128 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:49 -0700 Subject: [PATCH 444/512] memblock: allow to specify flags with memblock_add_node() We want to specify flags when hotplugging memory. Let's prepare to pass flags to memblock_add_node() by adjusting all existing users. Note that when hotplugging memory the system is already up and running and we might have concurrent memblock users: for example, while we're hotplugging memory, kexec_file code might search for suitable memory regions to place kexec images. It's important to add the memory directly to memblock via a single call with the right flags, instead of adding the memory first and apply flags later: otherwise, concurrent memblock users might temporarily stumble over memblocks with wrong flags, which will be important in a follow-up patch that introduces a new flag to properly handle add_memory_driver_managed(). Link: https://lkml.kernel.org/r/20211004093605.5830-4-david@redhat.com Acked-by: Geert Uytterhoeven Acked-by: Heiko Carstens Signed-off-by: David Hildenbrand Acked-by: Shahab Vahedi [arch/arc] Reviewed-by: Mike Rapoport Cc: "Aneesh Kumar K . 
V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Eric Biederman Cc: Huacai Chen Cc: Jianyong Wu Cc: Jiaxun Yang Cc: Michal Hocko Cc: Oscar Salvador Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/mm/init.c | 4 ++-- arch/ia64/mm/contig.c | 2 +- arch/ia64/mm/init.c | 2 +- arch/m68k/mm/mcfmmu.c | 3 ++- arch/m68k/mm/motorola.c | 6 ++++-- arch/mips/loongson64/init.c | 4 +++- arch/mips/sgi-ip27/ip27-memory.c | 3 ++- arch/s390/kernel/setup.c | 3 ++- include/linux/memblock.h | 3 ++- include/linux/mm.h | 2 +- mm/memblock.c | 9 +++++---- mm/memory_hotplug.c | 2 +- 12 files changed, 26 insertions(+), 17 deletions(-) diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 59408f6a02d4..ce4e939a7f07 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -59,13 +59,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) low_mem_sz = size; in_use = 1; - memblock_add_node(base, size, 0); + memblock_add_node(base, size, 0, MEMBLOCK_NONE); } else { #ifdef CONFIG_HIGHMEM high_mem_start = base; high_mem_sz = size; in_use = 1; - memblock_add_node(base, size, 1); + memblock_add_node(base, size, 1, MEMBLOCK_NONE); memblock_reserve(base, size); #endif } diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 42e025cfbd08..24901d809301 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -153,7 +153,7 @@ find_memory (void) efi_memmap_walk(find_max_min_low_pfn, NULL); max_pfn = max_low_pfn; - memblock_add_node(0, PFN_PHYS(max_low_pfn), 0); + memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE); find_initrd(); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 5c6da8d83c1a..5d165607bf35 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -378,7 +378,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid) #endif if (start < end) - memblock_add_node(__pa(start), end - start, nid); + memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE); return 0; } diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index eac9dde65193..6f1f25125294 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -174,7 +174,8 @@ void __init cf_bootmem_alloc(void) m68k_memory[0].addr = _rambase; m68k_memory[0].size = _ramend - _rambase; - memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0, + MEMBLOCK_NONE); /* compute total pages in system */ num_pages = PFN_DOWN(_ramend - _rambase); diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 9f3f77785aa7..2b05bb2bac00 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -410,7 +410,8 @@ void __init paging_init(void) min_addr = m68k_memory[0].addr; max_addr = min_addr + m68k_memory[0].size; - memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0, + MEMBLOCK_NONE); for (i = 1; i < m68k_num_memory;) { if (m68k_memory[i].addr < min_addr) { printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n", @@ -421,7 +422,8 @@ void __init paging_init(void) (m68k_num_memory - i) * sizeof(struct m68k_mem_info)); continue; } - memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i); + memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i, + MEMBLOCK_NONE); addr = m68k_memory[i].addr + m68k_memory[i].size; if (addr > max_addr) max_addr = addr; diff --git a/arch/mips/loongson64/init.c b/arch/mips/loongson64/init.c index 
76e0a9636a0e..4ac5ba80bbf6 100644 --- a/arch/mips/loongson64/init.c +++ b/arch/mips/loongson64/init.c @@ -77,7 +77,9 @@ void __init szmem(unsigned int node) (u32)node_id, mem_type, mem_start, mem_size); pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); - memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(node_psize), node); + memblock_add_node(PFN_PHYS(start_pfn), + PFN_PHYS(node_psize), node, + MEMBLOCK_NONE); break; case SYSTEM_RAM_RESERVED: pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n", diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 6173684b5aaa..adc2faeecf7c 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -341,7 +341,8 @@ static void __init szmem(void) continue; } memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)), - PFN_PHYS(slot_psize), node); + PFN_PHYS(slot_psize), node, + MEMBLOCK_NONE); } } } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 7fc836e9e194..8a378d426239 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -593,7 +593,8 @@ static void __init setup_resources(void) * part of the System RAM resource. */ if (crashk_res.end) { - memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), + 0, MEMBLOCK_NONE); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1d2b3e902a84..8b6560dc7a9e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -104,7 +104,8 @@ static inline void memblock_discard(void) {} #endif void memblock_allow_resize(void); -int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); +int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, + enum memblock_flags flags); int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_phys_free(phys_addr_t base, phys_addr_t size); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7e37c726b9db..15058d9cec99 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2425,7 +2425,7 @@ static inline unsigned long get_num_physpages(void) * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) + * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); diff --git a/mm/memblock.c b/mm/memblock.c index fb0c7f48e627..1b0cd8a52a13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -655,6 +655,7 @@ repeat: * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base, @base + @size) to the "memory" * type. See memblock_add_range() description for mode details @@ -663,14 +664,14 @@ repeat: * 0 on success, -errno on failure. 
*/ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, - int nid) + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__, - &base, &end, nid, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.memory, base, size, nid, 0); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } /** diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9d254e88221e..cedeaa4d6f0e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1370,7 +1370,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) mem_hotplug_begin(); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { - ret = memblock_add_node(start, size, nid); + ret = memblock_add_node(start, size, nid, MEMBLOCK_NONE); if (ret) goto error_mem_hotplug_end; } From f7892d8e288d4b090176f26d9bf7943dbbb639a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:53 -0700 Subject: [PATCH 445/512] memblock: add MEMBLOCK_DRIVER_MANAGED to mimic IORESOURCE_SYSRAM_DRIVER_MANAGED Let's add a flag that corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED, indicating that we're dealing with a memory region that is never indicated in the firmware-provided memory map, but always detected and added by a driver. Similar to MEMBLOCK_HOTPLUG, most infrastructure has to treat such memory regions like ordinary MEMBLOCK_NONE memory regions -- for example, when selecting memory regions to add to the vmcore for dumping in the crashkernel via for_each_mem_range(). However, especially kexec_file is not supposed to select such memblocks via for_each_free_mem_range() / for_each_free_mem_range_reverse() to place kexec images, similar to how we handle IORESOURCE_SYSRAM_DRIVER_MANAGED without CONFIG_ARCH_KEEP_MEMBLOCK. We'll make sure that memory hotplug code sets the flag where applicable (IORESOURCE_SYSRAM_DRIVER_MANAGED) next. This prepares architectures that need CONFIG_ARCH_KEEP_MEMBLOCK, such as arm64, for virtio-mem support. Note that kexec *must not* indicate this memory to the second kernel and *must not* place kexec-images on this memory. Let's add a comment to kexec_walk_memblock(), documenting how we handle MEMBLOCK_DRIVER_MANAGED now just like using IORESOURCE_SYSRAM_DRIVER_MANAGED in locate_mem_hole_callback() for kexec_walk_resources(). Also note that MEMBLOCK_HOTPLUG cannot be reused due to different semantics: MEMBLOCK_HOTPLUG: memory is indicated as "System RAM" in the firmware-provided memory map and added to the system early during boot; kexec *has to* indicate this memory to the second kernel and can place kexec-images on this memory. After memory hotunplug, kexec has to be re-armed. We mostly ignore this flag when "movable_node" is not set on the kernel command line, because then we're told to not care about hotunpluggability of such memory regions. MEMBLOCK_DRIVER_MANAGED: memory is not indicated as "System RAM" in the firmware-provided memory map; this memory is always detected and added to the system by a driver; memory might not actually be physically hotunpluggable. kexec *must not* indicate this memory to the second kernel and *must not* place kexec-images on this memory. Link: https://lkml.kernel.org/r/20211004093605.5830-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: "Aneesh Kumar K . 
V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Eric Biederman Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Huacai Chen Cc: Jianyong Wu Cc: Jiaxun Yang Cc: Michal Hocko Cc: Oscar Salvador Cc: Shahab Vahedi Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 ++++++++++++++-- kernel/kexec_file.c | 5 +++++ mm/memblock.c | 4 ++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8b6560dc7a9e..7df557b16c1e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -37,12 +37,17 @@ extern unsigned long long max_possible_pfn; * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as * reserved in the memory map; refer to memblock_mark_nomap() description * for further details + * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added + * via a driver, and never indicated in the firmware-provided memory map as + * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the + * kernel resource tree. */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ + MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ }; /** @@ -213,7 +218,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_HOTPLUG, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \ + p_start, p_end, NULL) /** * for_each_mem_range_rev - reverse iterate through memblock areas from @@ -224,7 +230,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range_rev(i, p_start, p_end) \ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_HOTPLUG, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\ + p_start, p_end, NULL) /** * for_each_reserved_mem_range - iterate over all reserved memblock areas @@ -254,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } +static inline bool memblock_is_driver_managed(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_DRIVER_MANAGED; +} + int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 33400ff051a8..8347fc158d2b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, if (kbuf->image->type == KEXEC_TYPE_CRASH) return func(&crashk_res, kbuf); + /* + * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See + * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in + * locate_mem_hole_callback(). 
+ */ if (kbuf->top_down) { for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE, &mstart, &mend, NULL) { diff --git a/mm/memblock.c b/mm/memblock.c index 1b0cd8a52a13..659bf0ffb086 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -982,6 +982,10 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) return true; + /* skip driver-managed memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) + return true; + return false; } From 32befe9e27859b87e70e0aba9b60bfb8000d9a66 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:56 -0700 Subject: [PATCH 446/512] mm/memory_hotplug: indicate MEMBLOCK_DRIVER_MANAGED with IORESOURCE_SYSRAM_DRIVER_MANAGED Let's communicate driver-managed regions to memblock, to properly teach kexec_file with CONFIG_ARCH_KEEP_MEMBLOCK to not place images on these memory regions. Link: https://lkml.kernel.org/r/20211004093605.5830-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Eric Biederman Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Huacai Chen Cc: Jianyong Wu Cc: Jiaxun Yang Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Shahab Vahedi Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cedeaa4d6f0e..852041f6be41 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1342,6 +1342,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + enum memblock_flags memblock_flags = MEMBLOCK_NONE; struct vmem_altmap mhp_altmap = {}; struct memory_group *group = NULL; u64 start, size; @@ -1370,7 +1371,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) mem_hotplug_begin(); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { - ret = memblock_add_node(start, size, nid, MEMBLOCK_NONE); + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) + memblock_flags = MEMBLOCK_DRIVER_MANAGED; + ret = memblock_add_node(start, size, nid, memblock_flags); if (ret) goto error_mem_hotplug_end; } From 3d88705c10677d1fc3f14786361aee649839aa7e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 5 Nov 2021 13:45:00 -0700 Subject: [PATCH 447/512] mm/rmap.c: avoid double faults migrating device private pages During migration special page table entries are installed for each page being migrated. These entries store the pfn and associated permissions of ptes mapping the page being migrated. Device-private pages use special swap pte entries to distinguish read-only vs. writeable pages which the migration code checks when creating migration entries. Normally this follows a fast path in migrate_vma_collect_pmd() which correctly copies the permissions of device-private pages over to migration entries when migrating pages back to the CPU. However the slow-path falls back to using try_to_migrate() which unconditionally creates read-only migration entries for device-private pages. This leads to unnecessary double faults on the CPU as the new pages are always mapped read-only even when they could be mapped writeable.
Fix this by correctly copying device-private permissions in try_to_migrate_one(). Link: https://lkml.kernel.org/r/20211018045247.3128058-1-apopple@nvidia.com Signed-off-by: Alistair Popple Reported-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Jerome Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 6aebd1747251..d65a74e140f9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1807,6 +1807,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (is_zone_device_page(page)) { + unsigned long pfn = page_to_pfn(page); swp_entry_t entry; pte_t swp_pte; @@ -1815,8 +1816,11 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - entry = make_readable_migration_entry( - page_to_pfn(page)); + entry = pte_to_swp_entry(pteval); + if (is_writable_device_private_entry(entry)) + entry = make_writable_migration_entry(pfn); + else + entry = make_readable_migration_entry(pfn); swp_pte = swp_entry_to_pte(entry); /* From afe8605ca45424629fdddfd85984b442c763dc47 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 5 Nov 2021 13:45:03 -0700 Subject: [PATCH 448/512] mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration() There is one possible race window between zs_pool_dec_isolated() and zs_unregister_migration() because wait_for_isolated_drain() checks the isolated count without holding class->lock and there is no order inside zs_pool_dec_isolated(). Thus the below race window could be possible: zs_pool_dec_isolated zs_unregister_migration check pool->destroying != 0 pool->destroying = true; smp_mb(); wait_for_isolated_drain() wait for pool->isolated_pages == 0 atomic_long_dec(&pool->isolated_pages); atomic_long_read(&pool->isolated_pages) == 0 Since we observe the pool->destroying (false) before atomic_long_dec() for pool->isolated_pages, waking pool->migration_wait up is missed. Fix this by ensure checking pool->destroying happens after the atomic_long_dec(&pool->isolated_pages). Link: https://lkml.kernel.org/r/20210708115027.7557-1-linmiaohe@huawei.com Fixes: 701d678599d0 ("mm/zsmalloc.c: fix race condition in zs_destroy_pool") Signed-off-by: Miaohe Lin Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Henry Burns Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 68e8831068f4..b897ce3b399a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1830,10 +1830,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool) VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); atomic_long_dec(&pool->isolated_pages); /* - * There's no possibility of racing, since wait_for_isolated_drain() - * checks the isolated count under &class->lock after enqueuing - * on migration_wait. + * Checking pool->destroying must happen after atomic_long_dec() + * for pool->isolated_pages above. Paired with the smp_mb() in + * zs_unregister_migration(). 
*/ + smp_mb__after_atomic(); if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) wake_up_all(&pool->migration_wait); } From d2c20e51e3966bc668ef1ef21fbe90704286c8d0 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 5 Nov 2021 13:45:06 -0700 Subject: [PATCH 449/512] mm/highmem: remove deprecated kmap_atomic kmap_atomic() is being deprecated in favor of kmap_local_page(). Replace the uses of kmap_atomic() within the highmem code. On profiling clear_huge_page() using ftrace an improvement of 62% was observed on the below setup. Setup:- Below data has been collected on Qualcomm's SM7250 SoC THP enabled (kernel v4.19.113) with only CPU-0(Cortex-A55) and CPU-7(Cortex-A76) switched on and set to max frequency, also DDR set to perf governor. FTRACE Data:- Base data:- Number of iterations: 48 Mean of allocation time: 349.5 us std deviation: 74.5 us v4 data:- Number of iterations: 48 Mean of allocation time: 131 us std deviation: 32.7 us The following simple userspace experiment to allocate 100MB(BUF_SZ) of pages and writing to it gave us a good insight, we observed an improvement of 42% in allocation and writing timings. ------------------------------------------------------------- Test code snippet ------------------------------------------------------------- clock_start(); buf = malloc(BUF_SZ); /* Allocate 100 MB of memory */ for(i=0; i < BUF_SZ_PAGES; i++) { *((int *)(buf + (i*PAGE_SIZE))) = 1; } clock_end(); ------------------------------------------------------------- Malloc test timings for 100MB anon allocation:- Base data:- Number of iterations: 100 Mean of allocation time: 31831 us std deviation: 4286 us v4 data:- Number of iterations: 100 Mean of allocation time: 18193 us std deviation: 4915 us [willy@infradead.org: fix zero_user_segments()] Link: https://lkml.kernel.org/r/YYVhHCJcm2DM2G9u@casper.infradead.org Link: https://lkml.kernel.org/r/20210204073255.20769-2-prathu.baronia@oneplus.com Signed-off-by: Ira Weiny Signed-off-by: Prathu Baronia Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 28 ++++++++++++++-------------- mm/highmem.c | 6 +++--- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379..31ebee36f26c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -143,9 +143,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif @@ -177,9 +177,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, static inline void clear_highpage(struct page *page) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); clear_page(kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE @@ -202,7 +202,7 @@ static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); @@ -213,7 +213,7 @@ static inline void zero_user_segments(struct page *page, if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); - 
kunmap_atomic(kaddr); + kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } @@ -238,11 +238,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from, { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif @@ -253,11 +253,11 @@ static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_page(vto, vfrom); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif diff --git a/mm/highmem.c b/mm/highmem.c index 4212ad0e4a19..eb3b8c288de4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -383,7 +383,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); if (end1 > start1) { - kaddr = kmap_atomic(page + i); + kaddr = kmap_local_page(page + i); memset(kaddr + start1, 0, this_end - start1); } end1 -= this_end; @@ -398,7 +398,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, if (end2 > start2) { if (!kaddr) - kaddr = kmap_atomic(page + i); + kaddr = kmap_local_page(page + i); memset(kaddr + start2, 0, this_end - start2); } end2 -= this_end; @@ -406,7 +406,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, } if (kaddr) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); flush_dcache_page(page + i); } From 4aabdc14c4d2a01c2968bdc8919f81d9f9b4f790 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Fri, 5 Nov 2021 13:45:09 -0700 Subject: [PATCH 450/512] zram_drv: allow reclaim on bio_alloc The read_from_bdev_async is not called on atomic context. So GFP_NOIO is available rather than GFP_ATOMIC. If there were reclaimable pages with GFP_NOIO, we can avoid allocation failure and page fault failure. Link: https://lkml.kernel.org/r/20210908005241.28062-1-jaewon31.kim@samsung.com Signed-off-by: Jaewon Kim Reported-by: Yong-Taek Lee Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fcaf2750f68f..081e77d595d7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -587,7 +587,7 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, { struct bio *bio; - bio = bio_alloc(GFP_ATOMIC, 1); + bio = bio_alloc(GFP_NOIO, 1); if (!bio) return -ENOMEM; From a88e03cf3d190cf46bc4063a9b7efe87590de5f4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Nov 2021 13:45:12 -0700 Subject: [PATCH 451/512] zram: off by one in read_block_state() snprintf() returns the number of bytes it would have printed if there were space. But it does not count the NUL terminator. So that means that if "count == copied" then this has already overflowed by one character. This bug likely isn't super harmful in real life. 
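To make the off-by-one concrete: snprintf() returns the length it *would* have produced, excluding the terminating NUL, so a return value equal to the remaining buffer space already means the last character was dropped. The following is a minimal userspace sketch illustrating that behavior (not part of this patch; the kernel's snprintf() follows the same return-value convention):

-------------------------------------------------------------
Illustration (userspace sketch, not part of this patch)
-------------------------------------------------------------
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[8];
	int n;

	/* 7 characters plus the NUL fit exactly: nothing is dropped */
	n = snprintf(buf, sizeof(buf), "0123456");
	printf("n=%d len=%zu\n", n, strlen(buf));	/* n=7, len=7 */

	/* 8 characters do not fit: the last one is dropped for the NUL */
	n = snprintf(buf, sizeof(buf), "01234567");
	printf("n=%d len=%zu\n", n, strlen(buf));	/* n=8, len=7 */

	/*
	 * A "size < n" check treats the second case (n == size) as a
	 * clean fit even though the output was truncated, which is why
	 * the fix below compares with "<=" instead of "<".
	 */
	return 0;
}
-------------------------------------------------------------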
Link: https://lkml.kernel.org/r/20210916130404.GA25094@kili Fixes: c0265342bff4 ("zram: introduce zram memory tracking") Signed-off-by: Dan Carpenter Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 081e77d595d7..f61910c65f0f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -910,7 +910,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf, zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.'); - if (count < copied) { + if (count <= copied) { zram_slot_unlock(zram, index); break; } From 755804d16965888bb069a4cd39005b97ba9e11fd Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Fri, 5 Nov 2021 13:45:15 -0700 Subject: [PATCH 452/512] zram: introduce an aged idle interface This change introduces an aged idle interface to the existing idle sysfs file for zram. When CONFIG_ZRAM_MEMORY_TRACKING is enabled the idle file now also accepts an integer argument. This integer is the age (in seconds) of pages to mark as idle. The idle file still supports 'all' as it always has. This new approach allows for much more control over which pages get marked as idle. [bgeffon@google.com: use IS_ENABLED and cleanup comment] Link: https://lkml.kernel.org/r/20210924161128.1508015-1-bgeffon@google.com [bgeffon@google.com: Sergey's cleanup suggestions] Link: https://lkml.kernel.org/r/20210929143056.13067-1-bgeffon@google.com Link: https://lkml.kernel.org/r/20210923130115.1344361-1-bgeffon@google.com Signed-off-by: Brian Geffon Acked-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Suleiman Souhlal Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/blockdev/zram.rst | 8 +++ drivers/block/zram/zram_drv.c | 62 +++++++++++++++------ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 700329d25f57..3e11926a4df9 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -328,6 +328,14 @@ as idle:: From now on, any pages on zram are idle pages. The idle mark will be removed until someone requests access of the block. IOW, unless there is access request, those pages are still idle pages. +Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled pages can be +marked as idle based on how long (in seconds) it's been since they were +last accessed:: + + echo 86400 > /sys/block/zramX/idle + +In this example all pages which haven't been accessed in more than 86400 +seconds (one day) will be marked idle. Admin can request writeback of those idle pages at right timing via:: diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f61910c65f0f..0a9309a2ef54 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -291,22 +291,16 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } -static ssize_t idle_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +/* + * Mark all pages which are older than or equal to cutoff as IDLE. 
+ * Callers should hold the zram init lock in read mode + */ +static void mark_idle(struct zram *zram, ktime_t cutoff) { - struct zram *zram = dev_to_zram(dev); + int is_idle = 1; unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; int index; - if (!sysfs_streq(buf, "all")) - return -EINVAL; - - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } - for (index = 0; index < nr_pages; index++) { /* * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. @@ -314,14 +308,50 @@ static ssize_t idle_store(struct device *dev, */ zram_slot_lock(zram, index); if (zram_allocated(zram, index) && - !zram_test_flag(zram, index, ZRAM_UNDER_WB)) - zram_set_flag(zram, index, ZRAM_IDLE); + !zram_test_flag(zram, index, ZRAM_UNDER_WB)) { +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time); +#endif + if (is_idle) + zram_set_flag(zram, index, ZRAM_IDLE); + } zram_slot_unlock(zram, index); } +} +static ssize_t idle_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + ktime_t cutoff_time = 0; + ssize_t rv = -EINVAL; + + if (!sysfs_streq(buf, "all")) { + /* + * If it did not parse as 'all' try to treat it as an integer when + * we have memory tracking enabled. + */ + u64 age_sec; + + if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec)) + cutoff_time = ktime_sub(ktime_get_boottime(), + ns_to_ktime(age_sec * NSEC_PER_SEC)); + else + goto out; + } + + down_read(&zram->init_lock); + if (!init_done(zram)) + goto out_unlock; + + /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + mark_idle(zram, cutoff_time); + rv = len; + +out_unlock: up_read(&zram->init_lock); - - return len; +out: + return rv; } #ifdef CONFIG_ZRAM_WRITEBACK From 53944f171a89dff4e2a3d76f42e6eedb551bb861 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 5 Nov 2021 13:45:18 -0700 Subject: [PATCH 453/512] mm: remove HARDENED_USERCOPY_FALLBACK This has served its purpose and is no longer used. All usercopy violations appear to have been handled by now, any remaining instances (or new bugs) will cause copies to be rejected. This isn't a direct revert of commit 2d891fbc3bb6 ("usercopy: Allow strict enforcement of whitelists"); since usercopy_fallback is effectively 0, the fallback handling is removed too. This also removes the usercopy_fallback module parameter on slab_common. Link: https://github.com/KSPP/linux/issues/153 Link: https://lkml.kernel.org/r/20210921061149.1091163-1-steve@sk2.org Signed-off-by: Stephen Kitt Suggested-by: Kees Cook Acked-by: Kees Cook Reviewed-by: Joel Stanley [defconfig change] Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: James Morris Cc: "Serge E . 
Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/configs/skiroot_defconfig | 1 - include/linux/slab.h | 2 -- mm/slab.c | 13 ------------- mm/slab_common.c | 8 -------- mm/slub.c | 14 -------------- security/Kconfig | 14 -------------- 6 files changed, 52 deletions(-) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index b806a5d3a695..c3ba614c973d 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -275,7 +275,6 @@ CONFIG_NLS_UTF8=y CONFIG_ENCRYPTED_KEYS=y CONFIG_SECURITY=y CONFIG_HARDENED_USERCOPY=y -# CONFIG_HARDENED_USERCOPY_FALLBACK is not set CONFIG_HARDENED_USERCOPY_PAGESPAN=y CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_LOCKDOWN_LSM=y diff --git a/include/linux/slab.h b/include/linux/slab.h index 837cb16232ef..181045148b06 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -142,8 +142,6 @@ struct mem_cgroup; void __init kmem_cache_init(void); bool slab_is_available(void); -extern bool usercopy_fallback; - struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); diff --git a/mm/slab.c b/mm/slab.c index 64ec17a3bc2b..da132a9ae6f8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4204,19 +4204,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, n <= cachep->useroffset - offset + cachep->usersize) return; - /* - * If the copy is still within the allocated object, produce - * a warning instead of rejecting the copy. This is intended - * to be a temporary method to find any missing usercopy - * whitelists. - */ - if (usercopy_fallback && - offset <= cachep->object_size && - n <= cachep->object_size - offset) { - usercopy_warn("SLAB object", cachep->name, to_user, offset, n); - return; - } - usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ diff --git a/mm/slab_common.c b/mm/slab_common.c index ec2bb0beed75..e5d080a93009 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,14 +37,6 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; -#ifdef CONFIG_HARDENED_USERCOPY -bool usercopy_fallback __ro_after_init = - IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK); -module_param(usercopy_fallback, bool, 0400); -MODULE_PARM_DESC(usercopy_fallback, - "WARN instead of reject usercopy whitelist violations"); -#endif - static LIST_HEAD(slab_caches_to_rcu_destroy); static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); static DECLARE_WORK(slab_caches_to_rcu_destroy_work, diff --git a/mm/slub.c b/mm/slub.c index e9a51dcf8bf9..432145d7b4ec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4489,7 +4489,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, { struct kmem_cache *s; unsigned int offset; - size_t object_size; bool is_kfence = is_kfence_address(ptr); ptr = kasan_reset_tag(ptr); @@ -4522,19 +4521,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, n <= s->useroffset - offset + s->usersize) return; - /* - * If the copy is still within the allocated object, produce - * a warning instead of rejecting the copy. This is intended - * to be a temporary method to find any missing usercopy - * whitelists. 
- */ - object_size = slab_ksize(s); - if (usercopy_fallback && - offset <= object_size && n <= object_size - offset) { - usercopy_warn("SLUB object", s->name, to_user, offset, n); - return; - } - usercopy_abort("SLUB object", s->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ diff --git a/security/Kconfig b/security/Kconfig index 0ced7fd33e4d..d9698900c9b7 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -163,20 +163,6 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. -config HARDENED_USERCOPY_FALLBACK - bool "Allow usercopy whitelist violations to fallback to object size" - depends on HARDENED_USERCOPY - default y - help - This is a temporary option that allows missing usercopy whitelists - to be discovered via a WARN() to the kernel log, instead of - rejecting the copy, falling back to non-whitelisted hardened - usercopy that checks the slab allocation size instead of the - whitelist size. This option will be removed once it seems like - all missing usercopy whitelists have been identified and fixed. - Booting with "slab_common.usercopy_fallback=Y/N" can change - this setting. - config HARDENED_USERCOPY_PAGESPAN bool "Refuse to copy allocations that span multiple pages" depends on HARDENED_USERCOPY From a1554c002699cbc9ced2e9f44f9c1357181bead3 Mon Sep 17 00:00:00 2001 From: Mianhan Liu Date: Fri, 5 Nov 2021 13:45:21 -0700 Subject: [PATCH 454/512] include/linux/mm.h: move nr_free_buffer_pages from swap.h to mm.h nr_free_buffer_pages could be exposed through mm.h instead of swap.h. The advantage of this change is that it can reduce the obsolete includes. For example, net/ipv4/tcp.c wouldn't need swap.h any more since it has already included mm.h. Similarly, after checking all the other files, it comes that tcp.c, udp.c meter.c ,... follow the same rule, so these files can have swap.h removed too. Moreover, after preprocessing all the files that use nr_free_buffer_pages, it turns out that those files have already included mm.h.Thus, we can move nr_free_buffer_pages from swap.h to mm.h safely. This change will not affect the compilation of other files. Link: https://lkml.kernel.org/r/20210912133640.1624-1-liumh1@shanghaitech.edu.cn Signed-off-by: Mianhan Liu Cc: Jakub Kicinski CC: Ulf Hansson Cc: "David S . Miller" Cc: Simon Horman Cc: Pravin B Shelar Cc: Vlad Yasevich Cc: Marcelo Ricardo Leitner Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mmc/core/mmc_test.c | 1 - include/linux/mm.h | 2 ++ include/linux/swap.h | 1 - net/ipv4/tcp.c | 1 - net/ipv4/udp.c | 1 - net/netfilter/ipvs/ip_vs_ctl.c | 1 - net/openvswitch/meter.c | 1 - net/sctp/protocol.c | 1 - 8 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index 63524551a13a..e6a2fd2c6d5c 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -10,7 +10,6 @@ #include #include -#include /* For nr_free_buffer_pages() */ #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 15058d9cec99..b1720aa63727 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -875,6 +875,8 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void copy_huge_page(struct page *dst, struct page *src); +unsigned long nr_free_buffer_pages(void); + /* * Compound pages have a destructor function. 
Provide a * prototype for that function and accessor functions. diff --git a/include/linux/swap.h b/include/linux/swap.h index ba52f3a3478e..53f759a59996 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,7 +335,6 @@ void workingset_update_node(struct xa_node *node); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; -extern unsigned long nr_free_buffer_pages(void); /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f5c336f8b0c8..c5e07a2c6c52 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -260,7 +260,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2fffcf2b54f3..319dd7bbfe33 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -78,7 +78,6 @@ #include #include #include -#include #include #include #include diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 29ec3ef63edc..6ea4b882435e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 896b8f5bc885..04a060ac7fdf 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ec0f52567c16..35928fefae33 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include From f39f21b3ddc7fc0f87eb6dc75ddc81b5bbfb7672 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:25 -0700 Subject: [PATCH 455/512] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c filter_irq_stacks() has little to do with the stackdepot implementation, except that it is usually used by users (such as KASAN) of stackdepot to reduce the stack trace. However, filter_irq_stacks() itself is not useful without a stack trace as obtained by stack_trace_save() and friends. Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new users of filter_irq_stacks() do not have to start depending on STACKDEPOT only for filter_irq_stacks(). 
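As a usage sketch (an assumed typical caller, not code added by this patch), any
user of stack_trace_save() can now trim interrupt frames without selecting
STACKDEPOT:

	unsigned long entries[16];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	/*
	 * Drop everything past the first irq/softirq entry so interrupt
	 * context does not make otherwise-identical traces look unique.
	 */
	nr = filter_irq_stacks(entries, nr);

This is the pattern the KFENCE patches later in this series use when hashing
allocation stack traces.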
Link: https://lkml.kernel.org/r/20210923104803.2620285-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Jann Horn Cc: Aleksandr Nogikh Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 2 -- include/linux/stacktrace.h | 1 + kernel/stacktrace.c | 30 ++++++++++++++++++++++++++++++ lib/stackdepot.c | 24 ------------------------ 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index b2f7e7c6ba54..d29860966bc9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -25,8 +25,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); - #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); #else diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9edecb494e9e..bef158815e83 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, unsigned int size, unsigned int skipnr); unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); /* Internal interfaces. Do not use in generic code */ #ifdef CONFIG_ARCH_STACKWALK diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9f8117c7cfdd..9c625257023d 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -13,6 +13,7 @@ #include #include #include +#include /** * stack_trace_print - Print the entries in the stack trace @@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) #endif /* CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* !CONFIG_ARCH_STACKWALK */ + +static inline bool in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +/** + * filter_irq_stacks - Find first IRQ stack entry in trace + * @entries: Pointer to stack trace array + * @nr_entries: Number of entries in the storage array + * + * Return: Number of trace entries until IRQ stack starts. + */ +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries) +{ + unsigned int i; + + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { + /* Include the irqentry function into the stack. 
*/ + return i + 1; + } + } + return nr_entries; +} +EXPORT_SYMBOL_GPL(filter_irq_stacks); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index bda58597e375..09485dc5bd12 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -20,7 +20,6 @@ */ #include -#include #include #include #include @@ -371,26 +370,3 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, return __stack_depot_save(entries, nr_entries, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); - -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -unsigned int filter_irq_stacks(unsigned long *entries, - unsigned int nr_entries) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (in_irqentry_text(entries[i])) { - /* Include the irqentry function into the stack. */ - return i + 1; - } - } - return nr_entries; -} -EXPORT_SYMBOL_GPL(filter_irq_stacks); From 9a19aeb5665068c3e2727230588684aae2cab7ef Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:28 -0700 Subject: [PATCH 456/512] kfence: count unexpectedly skipped allocations Maintain a counter to count allocations that are skipped due to being incompatible (oversized, incompatible gfp flags) or no capacity. This is to compute the fraction of allocations that could not be serviced by KFENCE, which we expect to be rare. Link: https://lkml.kernel.org/r/20210923104803.2620285-2-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Acked-by: Alexander Potapenko Cc: Aleksandr Nogikh Cc: Jann Horn Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 7a97db8bc8e7..249d75b7e5ee 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -112,6 +112,8 @@ enum kfence_counter_id { KFENCE_COUNTER_FREES, KFENCE_COUNTER_ZOMBIES, KFENCE_COUNTER_BUGS, + KFENCE_COUNTER_SKIP_INCOMPAT, + KFENCE_COUNTER_SKIP_CAPACITY, KFENCE_COUNTER_COUNT, }; static atomic_long_t counters[KFENCE_COUNTER_COUNT]; @@ -121,6 +123,8 @@ static const char *const counter_names[] = { [KFENCE_COUNTER_FREES] = "total frees", [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", [KFENCE_COUNTER_BUGS] = "total bugs", + [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", + [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", }; static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); @@ -271,8 +275,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g list_del_init(&meta->list); } raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); - if (!meta) + if (!meta) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); return NULL; + } if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { /* @@ -740,8 +746,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) * Perform size check before switching kfence_allocation_gate, so that * we don't disable KFENCE without making an allocation. */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; + } /* * Skip allocations from non-default zones, including DMA. 
We cannot @@ -749,8 +757,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || - (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) + (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; + } /* * allocation_gate only needs to become non-zero, so it doesn't make From a9ab52bbcb52df49ec4b30e6741e120588989455 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:31 -0700 Subject: [PATCH 457/512] kfence: move saving stack trace of allocations into __kfence_alloc() Move the saving of the stack trace of allocations into __kfence_alloc(), so that the stack entries array can be used outside of kfence_guarded_alloc() and we avoid potentially unwinding the stack multiple times. Link: https://lkml.kernel.org/r/20210923104803.2620285-3-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Acked-by: Alexander Potapenko Cc: Aleksandr Nogikh Cc: Jann Horn Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 249d75b7e5ee..db01814f8ff0 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -187,19 +187,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. */ -static noinline void metadata_update_state(struct kfence_metadata *meta, - enum kfence_object_state next) +static noinline void +metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, + unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; lockdep_assert_held(&meta->lock); - /* - * Skip over 1 (this) functions; noinline ensures we do not accidentally - * skip over the caller by never inlining. - */ - track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + if (stack_entries) { + memcpy(track->stack_entries, stack_entries, + num_stack_entries * sizeof(stack_entries[0])); + } else { + /* + * Skip over 1 (this) functions; noinline ensures we do not + * accidentally skip over the caller by never inlining. + */ + num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + } + track->num_stack_entries = num_stack_entries; track->pid = task_pid_nr(current); track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ @@ -261,7 +268,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta, } } -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp) +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, + unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_metadata *meta = NULL; unsigned long flags; @@ -320,7 +328,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g addr = (void *)meta->addr; /* Update remaining metadata. */ - metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED); + metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); /* Pairs with READ_ONCE() in kfence_shutdown_cache(). 
*/ WRITE_ONCE(meta->cache, cache); meta->size = size; @@ -400,7 +408,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z memzero_explicit(addr, meta->size); /* Mark the object as freed. */ - metadata_update_state(meta, KFENCE_OBJECT_FREED); + metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); raw_spin_unlock_irqrestore(&meta->lock, flags); @@ -742,6 +750,9 @@ void kfence_shutdown_cache(struct kmem_cache *s) void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { + unsigned long stack_entries[KFENCE_STACK_DEPTH]; + size_t num_stack_entries; + /* * Perform size check before switching kfence_allocation_gate, so that * we don't disable KFENCE without making an allocation. @@ -786,7 +797,9 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) if (!READ_ONCE(kfence_enabled)) return NULL; - return kfence_guarded_alloc(s, size, flags); + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); + + return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries); } size_t kfence_ksize(const void *addr) From 08f6b10630f284755087f58aa393402e15b92977 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:34 -0700 Subject: [PATCH 458/512] kfence: limit currently covered allocations when pool nearly full One of KFENCE's main design principles is that with increasing uptime, allocation coverage increases sufficiently to detect previously undetected bugs. We have observed that frequent long-lived allocations of the same source (e.g. pagecache) tend to permanently fill up the KFENCE pool with increasing system uptime, thus breaking the above requirement. The workaround thus far had been increasing the sample interval and/or increasing the KFENCE pool size, but is no reliable solution. To ensure diverse coverage of allocations, limit currently covered allocations of the same source once pool utilization reaches 75% (configurable via `kfence.skip_covered_thresh`) or above. The effect is retaining reasonable allocation coverage when the pool is close to full. A side-effect is that this also limits frequent long-lived allocations of the same source filling up the pool permanently. Uniqueness of an allocation for coverage purposes is based on its (partial) allocation stack trace (the source). A Counting Bloom filter is used to check if an allocation is covered; if the allocation is currently covered, the allocation is skipped by KFENCE. Testing was done using: (a) a synthetic workload that performs frequent long-lived allocations (default config values; sample_interval=1; num_objects=63), and (b) normal desktop workloads on an otherwise idle machine where the problem was first reported after a few days of uptime (default config values). In both test cases the sampled allocation rate no longer drops to zero at any point. In the case of (b) we observe (after 2 days uptime) 15% unique allocations in the pool, 77% pool utilization, with 20% "skipped allocations (covered)". 
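As a rough sanity check of the filter parameters (assuming the default
CONFIG_KFENCE_NUM_OBJECTS=255, which is not restated in this patch):
ALLOC_COVERED_ORDER = ilog2(255) + 2 = 9, so the filter has SIZE = 512 counters
and HNUM = 2 hash functions, and the standard Bloom-filter estimate gives

	15% unique (~ 38 traces): (1 - e^(-2 *  38 / 512))^2 ~= 0.02
	85% unique (~217 traces): (1 - e^(-2 * 217 / 512))^2 ~= 0.33

which matches the 0.02-0.33 false-positive range quoted in the comment added by
the patch.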
[elver@google.com: simplify and just use hash_32(), use more random stack_hash_seed] Link: https://lkml.kernel.org/r/YU3MRGaCaJiYht5g@elver.google.com [elver@google.com: fix 32 bit] Link: https://lkml.kernel.org/r/20210923104803.2620285-4-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Acked-by: Alexander Potapenko Cc: Aleksandr Nogikh Cc: Jann Horn Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 109 ++++++++++++++++++++++++++++++++++++++++++++- mm/kfence/kfence.h | 2 + 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index db01814f8ff0..b61ef93d9f98 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -10,12 +10,15 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = { }; module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); +/* Pool usage% threshold when currently covered allocations are skipped. */ +static unsigned long kfence_skip_covered_thresh __read_mostly = 75; +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); + /* The pool of pages used for guard pages and objects. */ char *__kfence_pool __ro_after_init; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ @@ -105,6 +112,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); /* Gates the allocation, ensuring only one succeeds in a given period. */ atomic_t kfence_allocation_gate = ATOMIC_INIT(1); +/* + * A Counting Bloom filter of allocation coverage: limits currently covered + * allocations of the same source filling up the pool. + * + * Assuming a range of 15%-85% unique allocations in the pool at any point in + * time, the below parameters provide a probablity of 0.02-0.33 for false + * positive hits respectively: + * + * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM + */ +#define ALLOC_COVERED_HNUM 2 +#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) +#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) +#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) +static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; + +/* Stack depth used to determine uniqueness of an allocation. */ +#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) + +/* + * Randomness for stack hashes, making the same collisions across reboots and + * different machines less likely. + */ +static u32 stack_hash_seed __ro_after_init; + /* Statistics counters for debugfs. 
*/ enum kfence_counter_id { KFENCE_COUNTER_ALLOCATED, @@ -114,6 +147,7 @@ enum kfence_counter_id { KFENCE_COUNTER_BUGS, KFENCE_COUNTER_SKIP_INCOMPAT, KFENCE_COUNTER_SKIP_CAPACITY, + KFENCE_COUNTER_SKIP_COVERED, KFENCE_COUNTER_COUNT, }; static atomic_long_t counters[KFENCE_COUNTER_COUNT]; @@ -125,11 +159,57 @@ static const char *const counter_names[] = { [KFENCE_COUNTER_BUGS] = "total bugs", [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", + [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", }; static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); /* === Internals ============================================================ */ +static inline bool should_skip_covered(void) +{ + unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100; + + return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; +} + +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) +{ + num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); + num_entries = filter_irq_stacks(stack_entries, num_entries); + return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); +} + +/* + * Adds (or subtracts) count @val for allocation stack trace hash + * @alloc_stack_hash from Counting Bloom filter. + */ +static void alloc_covered_add(u32 alloc_stack_hash, int val) +{ + int i; + + for (i = 0; i < ALLOC_COVERED_HNUM; i++) { + atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); + alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); + } +} + +/* + * Returns true if the allocation stack trace hash @alloc_stack_hash is + * currently contained (non-zero count) in Counting Bloom filter. + */ +static bool alloc_covered_contains(u32 alloc_stack_hash) +{ + int i; + + for (i = 0; i < ALLOC_COVERED_HNUM; i++) { + if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) + return false; + alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); + } + + return true; +} + static bool kfence_protect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); @@ -269,7 +349,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta, } static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, - unsigned long *stack_entries, size_t num_stack_entries) + unsigned long *stack_entries, size_t num_stack_entries, + u32 alloc_stack_hash) { struct kfence_metadata *meta = NULL; unsigned long flags; @@ -332,6 +413,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ WRITE_ONCE(meta->cache, cache); meta->size = size; + meta->alloc_stack_hash = alloc_stack_hash; + for_each_canary(meta, set_canary_byte); /* Set required struct page fields. */ @@ -344,6 +427,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g raw_spin_unlock_irqrestore(&meta->lock, flags); + alloc_covered_add(alloc_stack_hash, 1); + /* Memory initialization. */ /* @@ -412,6 +497,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z raw_spin_unlock_irqrestore(&meta->lock, flags); + alloc_covered_add(meta->alloc_stack_hash, -1); + /* Protect to detect use-after-frees. 
*/ kfence_protect((unsigned long)addr); @@ -677,6 +764,7 @@ void __init kfence_init(void) if (!kfence_sample_interval) return; + stack_hash_seed = (u32)random_get_entropy(); if (!kfence_init_pool()) { pr_err("%s failed\n", __func__); return; @@ -752,6 +840,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { unsigned long stack_entries[KFENCE_STACK_DEPTH]; size_t num_stack_entries; + u32 alloc_stack_hash; /* * Perform size check before switching kfence_allocation_gate, so that @@ -799,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); - return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries); + /* + * Do expensive check for coverage of allocation in slow-path after + * allocation_gate has already become non-zero, even though it might + * mean not making any allocation within a given sample interval. + * + * This ensures reasonable allocation coverage when the pool is almost + * full, including avoiding long-lived allocations of the same source + * filling up the pool (e.g. pagecache allocations). + */ + alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); + if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); + return NULL; + } + + return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, + alloc_stack_hash); } size_t kfence_ksize(const void *addr) diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index c1f23c61e5f9..2a2d5de9d379 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -87,6 +87,8 @@ struct kfence_metadata { /* Allocation and free stack information. */ struct kfence_track alloc_track; struct kfence_track free_track; + /* For updating alloc_covered on frees. */ + u32 alloc_stack_hash; }; extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; From 5cc906b4b4a510b274113ddb3f88d60644553f79 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:37 -0700 Subject: [PATCH 459/512] kfence: add note to documentation about skipping covered allocations Add a note briefly mentioning the new policy about "skipping currently covered allocations if pool close to full." Since this has a notable impact on KFENCE's bug-detection ability on systems with large uptimes, it is worth pointing out the feature. Link: https://lkml.kernel.org/r/20210923104803.2620285-5-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Dmitry Vyukov Acked-by: Alexander Potapenko Cc: Aleksandr Nogikh Cc: Jann Horn Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kfence.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 0fbe3308bf37..d45f952986ae 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -269,6 +269,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused first, and the chances of detecting use-after-frees of recently freed objects is increased. +If pool utilization reaches 75% (default) or above, to reduce the risk of the +pool eventually being fully occupied by allocated objects yet ensure diverse +coverage of allocations, KFENCE limits currently covered allocations of the +same source from further filling up the pool. 
The "source" of an allocation is +based on its partial allocation stack trace. A side-effect is that this also +limits frequent long-lived allocations (e.g. pagecache) of the same source +filling up the pool permanently, which is the most common risk for the pool +becoming full and the sampled allocation rate dropping to zero. The threshold +at which to start limiting currently covered allocations can be configured via +the boot parameter ``kfence.skip_covered_thresh`` (pool usage%). + Interface --------- From f51733e2fc4d8f01d813d71fb036d6854a53e7e3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:40 -0700 Subject: [PATCH 460/512] kfence: test: use kunit_skip() to skip tests Use the new kunit_skip() to skip tests if requirements were not met. It makes it easier to see in KUnit's summary if there were skipped tests. Link: https://lkml.kernel.org/r/20210922182541.1372400-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: David Gow Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Aleksandr Nogikh Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/kfence_test.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index f1690cf54199..695030c1fff8 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -32,6 +32,11 @@ #define arch_kfence_test_address(addr) (addr) #endif +#define KFENCE_TEST_REQUIRES(test, cond) do { \ + if (!(cond)) \ + kunit_skip((test), "Test requires: " #cond); \ +} while (0) + /* Report as observed from console. */ static struct { spinlock_t lock; @@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test) }; int i; - if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)) - return; + KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)); /* Assume it hasn't been disabled on command line. */ setup_test_cache(test, size, 0, NULL); @@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test) char *buf1, *buf2; int i; - if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) { - kunit_warn(test, "skipping ... would take too long\n"); - return; - } + /* Skip if we think it'd take too long. */ + KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100); setup_test_cache(test, size, 0, NULL); buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); From 49332956227adb35ffa7e3282c13e787325ff301 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:43 -0700 Subject: [PATCH 461/512] kfence: shorten critical sections of alloc/free Initializing memory and setting/checking the canary bytes is relatively expensive, and doing so in the meta->lock critical sections extends the duration with preemption and interrupts disabled unnecessarily. Any reads to meta->addr and meta->size in kfence_guarded_alloc() and kfence_guarded_free() don't require locking meta->lock as long as the object is removed from the freelist: only kfence_guarded_alloc() sets meta->addr and meta->size after removing it from the freelist, which requires a preceding kfence_guarded_free() returning it to the list or the initial state. Therefore move reads to meta->addr and meta->size, including expensive memory initialization using them, out of meta->lock critical sections. 
Link: https://lkml.kernel.org/r/20210930153706.2105471-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index b61ef93d9f98..802905b1c89b 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -309,12 +309,19 @@ static inline bool set_canary_byte(u8 *addr) /* Check canary byte at @addr. */ static inline bool check_canary_byte(u8 *addr) { + struct kfence_metadata *meta; + unsigned long flags; + if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr), - KFENCE_ERROR_CORRUPTION); + + meta = addr_to_metadata((unsigned long)addr); + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); + raw_spin_unlock_irqrestore(&meta->lock, flags); + return false; } @@ -324,8 +331,6 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta, const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr; - lockdep_assert_held(&meta->lock); - /* * We'll iterate over each canary byte per-side until fn() returns * false. However, we'll still iterate over the canary bytes to the @@ -414,8 +419,9 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g WRITE_ONCE(meta->cache, cache); meta->size = size; meta->alloc_stack_hash = alloc_stack_hash; + raw_spin_unlock_irqrestore(&meta->lock, flags); - for_each_canary(meta, set_canary_byte); + alloc_covered_add(alloc_stack_hash, 1); /* Set required struct page fields. */ page = virt_to_page(meta->addr); @@ -425,11 +431,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g if (IS_ENABLED(CONFIG_SLAB)) page->s_mem = addr; - raw_spin_unlock_irqrestore(&meta->lock, flags); - - alloc_covered_add(alloc_stack_hash, 1); - /* Memory initialization. */ + for_each_canary(meta, set_canary_byte); /* * We check slab_want_init_on_alloc() ourselves, rather than letting @@ -454,6 +457,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z { struct kcsan_scoped_access assert_page_exclusive; unsigned long flags; + bool init; raw_spin_lock_irqsave(&meta->lock, flags); @@ -481,6 +485,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z meta->unprotected_page = 0; } + /* Mark the object as freed. */ + metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); + init = slab_want_init_on_free(meta->cache); + raw_spin_unlock_irqrestore(&meta->lock, flags); + + alloc_covered_add(meta->alloc_stack_hash, -1); + /* Check canary bytes for memory corruption. */ for_each_canary(meta, check_canary_byte); @@ -489,16 +500,9 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z * data is still there, and after a use-after-free is detected, we * unprotect the page, so the data is still accessible. */ - if (!zombie && unlikely(slab_want_init_on_free(meta->cache))) + if (!zombie && unlikely(init)) memzero_explicit(addr, meta->size); - /* Mark the object as freed. 
*/ - metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); - - raw_spin_unlock_irqrestore(&meta->lock, flags); - - alloc_covered_add(meta->alloc_stack_hash, -1); - /* Protect to detect use-after-frees. */ kfence_protect((unsigned long)addr); From 07e8481d3c38f461d7b79c1d5c9afe013b162b0c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:46 -0700 Subject: [PATCH 462/512] kfence: always use static branches to guard kfence_alloc() Regardless of KFENCE mode (CONFIG_KFENCE_STATIC_KEYS: either using static keys to gate allocations, or using a simple dynamic branch), always use a static branch to avoid the dynamic branch in kfence_alloc() if KFENCE was disabled at boot. For CONFIG_KFENCE_STATIC_KEYS=n, this now avoids the dynamic branch if KFENCE was disabled at boot. To simplify, also unifies the location where kfence_allocation_gate is read-checked to just be inline in kfence_alloc(). Link: https://lkml.kernel.org/r/20211019102524.2807208-1-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 21 +++++++++++---------- mm/kfence/core.c | 16 +++++++--------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 3fe6dd8a18c1..4b5e3679a72c 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -14,6 +14,9 @@ #ifdef CONFIG_KFENCE +#include +#include + /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an @@ -22,13 +25,8 @@ #define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE) extern char *__kfence_pool; -#ifdef CONFIG_KFENCE_STATIC_KEYS -#include DECLARE_STATIC_KEY_FALSE(kfence_allocation_key); -#else -#include extern atomic_t kfence_allocation_gate; -#endif /** * is_kfence_address() - check if an address belongs to KFENCE pool @@ -116,13 +114,16 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags); */ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { -#ifdef CONFIG_KFENCE_STATIC_KEYS - if (static_branch_unlikely(&kfence_allocation_key)) +#if defined(CONFIG_KFENCE_STATIC_KEYS) || CONFIG_KFENCE_SAMPLE_INTERVAL == 0 + if (!static_branch_unlikely(&kfence_allocation_key)) + return NULL; #else - if (unlikely(!atomic_read(&kfence_allocation_gate))) + if (!static_branch_likely(&kfence_allocation_key)) + return NULL; #endif - return __kfence_alloc(s, size, flags); - return NULL; + if (likely(atomic_read(&kfence_allocation_gate))) + return NULL; + return __kfence_alloc(s, size, flags); } /** diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 802905b1c89b..09945784df9e 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -104,10 +104,11 @@ struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ -#ifdef CONFIG_KFENCE_STATIC_KEYS -/* The static key to set up a KFENCE allocation. */ +/* + * The static key to set up a KFENCE allocation; or if static keys are not used + * to gate allocations, to avoid a load and compare if KFENCE is disabled. + */ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); -#endif /* Gates the allocation, ensuring only one succeeds in a given period. 
*/ atomic_t kfence_allocation_gate = ATOMIC_INIT(1); @@ -774,6 +775,8 @@ void __init kfence_init(void) return; } + if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) + static_branch_enable(&kfence_allocation_key); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, @@ -866,12 +869,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) return NULL; } - /* - * allocation_gate only needs to become non-zero, so it doesn't make - * sense to continue writing to it and pay the associated contention - * cost, in case we have a large number of concurrent allocations. - */ - if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) + if (atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* From 4f612ed3f748962cbef1316ff3d323e2b9055b6e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:49 -0700 Subject: [PATCH 463/512] kfence: default to dynamic branch instead of static keys mode We have observed that on very large machines with newer CPUs, the static key/branch switching delay is on the order of milliseconds. This is due to the required broadcast IPIs, which simply does not scale well to hundreds of CPUs (cores). If done too frequently, this can adversely affect tail latencies of various workloads. One workaround is to increase the sample interval to several seconds, while decreasing sampled allocation coverage, but the problem still exists and could still increase tail latencies. As already noted in the Kconfig help text, there are trade-offs: at lower sample intervals the dynamic branch results in better performance; however, at very large sample intervals, the static keys mode can result in better performance -- careful benchmarking is recommended. Our initial benchmarking showed that with large enough sample intervals and workloads stressing the allocator, the static keys mode was slightly better. Evaluating and observing the possible system-wide side-effects of the static-key-switching induced broadcast IPIs, however, was a blind spot (in particular on large machines with 100s of cores). Therefore, a major downside of the static keys mode is, unfortunately, that it is hard to predict performance on new system architectures and topologies, but also making conclusions about performance of new workloads based on a limited set of benchmarks. Most distributions will simply select the defaults, while targeting a large variety of different workloads and system architectures. As such, the better default is CONFIG_KFENCE_STATIC_KEYS=n, and re-enabling it is only recommended after careful evaluation. For reference, on x86-64 the condition in kfence_alloc() generates exactly 2 instructions in the kmem_cache_alloc() fast-path: | ... | cmpl $0x0,0x1a8021c(%rip) # ffffffff82d560d0 | je ffffffff812d6003 | ... which, given kfence_allocation_gate is infrequently modified, should be well predicted by most CPUs. 
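For users who do want the static-keys behaviour after careful evaluation, a
possible configuration is (sketch only; the 500 ms interval is just an example
value):

	# .config: the prompt is now hidden behind EXPERT
	CONFIG_EXPERT=y
	CONFIG_KFENCE_STATIC_KEYS=y

	# or keep the new default (dynamic branch) and tune the sample
	# interval on the kernel command line instead:
	kfence.sample_interval=500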
Link: https://lkml.kernel.org/r/20211019102524.2807208-2-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kfence.rst | 12 ++++++++---- lib/Kconfig.kfence | 26 +++++++++++++++----------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index d45f952986ae..ac6b89d1a8c3 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -231,10 +231,14 @@ Guarded allocations are set up based on the sample interval. After expiration of the sample interval, the next allocation through the main allocator (SLAB or SLUB) returns a guarded allocation from the KFENCE object pool (allocation sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and -the next allocation is set up after the expiration of the interval. To "gate" a -KFENCE allocation through the main allocator's fast-path without overhead, -KFENCE relies on static branches via the static keys infrastructure. The static -branch is toggled to redirect the allocation to KFENCE. +the next allocation is set up after the expiration of the interval. + +When using ``CONFIG_KFENCE_STATIC_KEYS=y``, KFENCE allocations are "gated" +through the main allocator's fast-path by relying on static branches via the +static keys infrastructure. The static branch is toggled to redirect the +allocation to KFENCE. Depending on sample interval, target workloads, and +system architecture, this may perform better than the simple dynamic branch. +Careful benchmarking is recommended. KFENCE objects each reside on a dedicated page, at either the left or right page boundaries selected at random. The pages to the left and right of the diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index e641add33947..912f252a41fc 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -25,17 +25,6 @@ menuconfig KFENCE if KFENCE -config KFENCE_STATIC_KEYS - bool "Use static keys to set up allocations" - default y - depends on JUMP_LABEL # To ensure performance, require jump labels - help - Use static keys (static branches) to set up KFENCE allocations. Using - static keys is normally recommended, because it avoids a dynamic - branch in the allocator's fast path. However, with very low sample - intervals, or on systems that do not support jump labels, a dynamic - branch may still be an acceptable performance trade-off. - config KFENCE_SAMPLE_INTERVAL int "Default sample interval in milliseconds" default 100 @@ -56,6 +45,21 @@ config KFENCE_NUM_OBJECTS pages are required; with one containing the object and two adjacent ones used as guard pages. +config KFENCE_STATIC_KEYS + bool "Use static keys to set up allocations" if EXPERT + depends on JUMP_LABEL + help + Use static keys (static branches) to set up KFENCE allocations. This + option is only recommended when using very large sample intervals, or + performance has carefully been evaluated with this option. + + Using static keys comes with trade-offs that need to be carefully + evaluated given target workloads and system architectures. Notably, + enabling and disabling static keys invoke IPI broadcasts, the latency + and impact of which is much harder to predict than a dynamic branch. + + Say N if you are unsure. 
+ config KFENCE_STRESS_TEST_FAULTS int "Stress testing of fault handling and error reporting" if EXPERT default 0 From f24b0626076783d56ef41c6459fedf70ab6dcbd0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 5 Nov 2021 13:45:52 -0700 Subject: [PATCH 464/512] mm/damon: grammar s/works/work/ Correct a singular versus plural grammar mistake in the help text for the DAMON_VADDR config symbol. Link: https://lkml.kernel.org/r/20210914073451.3883834-1-geert@linux-m68k.org Fixes: 3f49584b262cf8f4 ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: Geert Uytterhoeven Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 37024798a97c..ba8898c7eb8e 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -30,7 +30,7 @@ config DAMON_VADDR select PAGE_IDLE_FLAG help This builds the default data access monitoring primitives for DAMON - that works for virtual address spaces. + that work for virtual address spaces. config DAMON_VADDR_KUNIT_TEST bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS From ad782c48df326eb13cf5dec7aab571b44be3e415 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:45:55 -0700 Subject: [PATCH 465/512] Documentation/vm: move user guides to admin-guide/mm/ Most memory management user guide documents are in 'admin-guide/mm/', but two of those are in 'vm/'. This moves the two docs into 'admin-guide/mm' for easier documents finding. Link: https://lkml.kernel.org/r/20210917123958.3819-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/index.rst | 2 ++ .../{vm => admin-guide/mm}/swap_numa.rst | 0 .../{vm => admin-guide/mm}/zswap.rst | 0 Documentation/vm/index.rst | 26 ++++--------------- 4 files changed, 7 insertions(+), 21 deletions(-) rename Documentation/{vm => admin-guide/mm}/swap_numa.rst (100%) rename Documentation/{vm => admin-guide/mm}/zswap.rst (100%) diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index cbd19d5e625f..c21b5823f126 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -37,5 +37,7 @@ the Linux memory management. numaperf pagemap soft-dirty + swap_numa transhuge userfaultfd + zswap diff --git a/Documentation/vm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst similarity index 100% rename from Documentation/vm/swap_numa.rst rename to Documentation/admin-guide/mm/swap_numa.rst diff --git a/Documentation/vm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst similarity index 100% rename from Documentation/vm/zswap.rst rename to Documentation/admin-guide/mm/zswap.rst diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index b51f0d8992f8..6f5ffef4b716 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -3,27 +3,11 @@ Linux Memory Management Documentation ===================================== This is a collection of documents about the Linux memory management (mm) -subsystem. If you are looking for advice on simply allocating memory, -see the :ref:`memory_allocation`. - -User guides for MM features -=========================== - -The following documents provide guides for controlling and tuning -various features of the Linux memory management - -.. 
toctree:: - :maxdepth: 1 - - swap_numa - zswap - -Kernel developers MM documentation -================================== - -The below documents describe MM internals with different level of -details ranging from notes and mailing list responses to elaborate -descriptions of data structures and algorithms. +subsystem internals with different level of details ranging from notes and +mailing list responses for elaborating descriptions of data structures and +algorithms. If you are looking for advice on simply allocating memory, see the +:ref:`memory_allocation`. For controlling and tuning guides, see the +:doc:`admin guide <../admin-guide/mm/index>`. .. toctree:: :maxdepth: 1 From f9803a9918468d70d98d31c23cb7613d873e723f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:45:58 -0700 Subject: [PATCH 466/512] MAINTAINERS: update SeongJae's email address This updates SeongJae's email address in MAINTAINERS file to his preferred one. Link: https://lkml.kernel.org/r/20210917123958.3819-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3b79fd441dde..3cf33e113b9b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5161,7 +5161,7 @@ F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c DATA ACCESS MONITOR -M: SeongJae Park +M: SeongJae Park L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/damon/ From 876d0aac2e3af10fbaf1c7a814840c71e470dc5c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:01 -0700 Subject: [PATCH 467/512] docs/vm/damon: remove broken reference Building DAMON documents warns for a reference to nonexisting doc, as below: $ time make htmldocs [...] Documentation/vm/damon/index.rst:24: WARNING: toctree contains reference to nonexisting document 'vm/damon/plans' This fixes the warning by removing the wrong reference. Link: https://lkml.kernel.org/r/20210917123958.3819-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/damon/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst index a2858baf3bf1..48c0bbff98b2 100644 --- a/Documentation/vm/damon/index.rst +++ b/Documentation/vm/damon/index.rst @@ -27,4 +27,3 @@ workloads and systems. faq design api - plans From d2f272b35a84ace2ef04334a9822fd726a7f061b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:04 -0700 Subject: [PATCH 468/512] include/linux/damon.h: fix kernel-doc comments for 'damon_callback' A few Kernel-doc comments in 'damon.h' are broken. This fixes them. Link: https://lkml.kernel.org/r/20210917123958.3819-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d68b67b8d458..755d70804705 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -62,7 +62,7 @@ struct damon_target { struct damon_ctx; /** - * struct damon_primitive Monitoring primitives for given use cases. + * struct damon_primitive - Monitoring primitives for given use cases. * * @init: Initialize primitive-internal data structures. * @update: Update primitive-internal data structures. 
@@ -108,8 +108,8 @@ struct damon_primitive { void (*cleanup)(struct damon_ctx *context); }; -/* - * struct damon_callback Monitoring events notification callbacks. +/** + * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. * @after_sampling: Called after each sampling. From 704571f997424ecd64b10b37ca6097e65690240a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:06 -0700 Subject: [PATCH 469/512] mm/damon/core: print kdamond start log in debug mode only Logging of kdamond startup is using 'pr_info()' unnecessarily. This makes it to use 'pr_debug()' instead. Link: https://lkml.kernel.org/r/20210917123958.3819-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 30e9211f494a..874558a790a0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -653,7 +653,7 @@ static int kdamond_fn(void *data) unsigned long sz_limit = 0; mutex_lock(&ctx->kdamond_lock); - pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); + pr_debug("kdamond (%d) starts\n", ctx->kdamond->pid); mutex_unlock(&ctx->kdamond_lock); if (ctx->primitive.init) From 5f7fe2b9b827662cf349ab45406d6cbf0cc6251f Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:46:09 -0700 Subject: [PATCH 470/512] mm/damon: remove unnecessary do_exit() from kdamond Just return from the kthread function. Link: https://lkml.kernel.org/r/20210927232421.17694-1-changbin.du@gmail.com Signed-off-by: Changbin Du Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 874558a790a0..61a9e3b37bc9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -714,7 +714,7 @@ static int kdamond_fn(void *data) nr_running_ctxs--; mutex_unlock(&damon_lock); - do_exit(0); + return 0; } #include "core-test.h" From 42e4cef5fe48333e0db6e98b019edf5f2c2f11fd Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:46:12 -0700 Subject: [PATCH 471/512] mm/damon: needn't hold kdamond_lock to print pid of kdamond Just get the pid by 'current->pid'. Meanwhile, to be symmetrical make the 'starts' and 'finishes' logs both use debug level. 
Link: https://lkml.kernel.org/r/20210927232432.17750-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 61a9e3b37bc9..8171e7dddc30 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -652,9 +652,7 @@ static int kdamond_fn(void *data) unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - mutex_lock(&ctx->kdamond_lock); - pr_debug("kdamond (%d) starts\n", ctx->kdamond->pid); - mutex_unlock(&ctx->kdamond_lock); + pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->primitive.init) ctx->primitive.init(ctx); @@ -705,7 +703,7 @@ static int kdamond_fn(void *data) if (ctx->primitive.cleanup) ctx->primitive.cleanup(ctx); - pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); From 7ec1992b891e59dba0f04e0327980786e8f61b13 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:46:15 -0700 Subject: [PATCH 472/512] mm/damon/core: nullify pointer ctx->kdamond with a NULL Currently a plain integer is being used to nullify the pointer ctx->kdamond. Use NULL instead. Cleans up sparse warning: mm/damon/core.c:317:40: warning: Using plain integer as NULL pointer Link: https://lkml.kernel.org/r/20210925215908.181226-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8171e7dddc30..d993db50280c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -314,7 +314,7 @@ static int __damon_start(struct damon_ctx *ctx) nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { err = PTR_ERR(ctx->kdamond); - ctx->kdamond = 0; + ctx->kdamond = NULL; } } mutex_unlock(&ctx->kdamond_lock); From fda504fade7f124858d7022341dc46ff35b45274 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:18 -0700 Subject: [PATCH 473/512] mm/damon/core: account age of target regions Patch series "Implement Data Access Monitoring-based Memory Operation Schemes". Introduction ============ DAMON[1] can be used as a primitive for data access aware memory management optimizations. For that, users who want such optimizations should run DAMON, read the monitoring results, analyze it, plan a new memory management scheme, and apply the new scheme by themselves. Such efforts will be inevitable for some complicated optimizations. However, in many other cases, the users would simply want the system to apply a memory management action to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB keeping only rare accesses more than 2 minutes", or "Do not use THP for a memory region larger than 2 MiB rarely accessed for more than 1 seconds". To make the works easier and non-redundant, this patchset implements a new feature of DAMON, which is called Data Access Monitoring-based Operation Schemes (DAMOS). Using the feature, users can describe the normal schemes in a simple way and ask DAMON to execute those on its own. [1] https://damonitor.github.io Evaluations =========== DAMOS is accurate and useful for memory management optimizations. 
An experimental DAMON-based operation scheme for THP, 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of residential sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concepts. Please refer to the showcase web site's evaluation document[1] for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/v34/vm/damon/eval.html Long-term Support Trees ----------------------- For people who want to test DAMON but using LTS kernels, there are another couple of trees based on two latest LTS kernels respectively and containing the 'damon/master' backports. - For v5.4.y: https://git.kernel.org/sj/h/damon/for-v5.4.y - For v5.10.y: https://git.kernel.org/sj/h/damon/for-v5.10.y Sequence Of Patches =================== The 1st patch accounts age of each region. The 2nd patch implements the core of the DAMON-based operation schemes feature. The 3rd patch makes the default monitoring primitives for virtual address spaces to support the schemes. From this point, the kernel space users can use DAMOS. The 4th patch exports the feature to the user space via the debugfs interface. The 5th patch implements schemes statistics feature for easier tuning of the schemes and runtime access pattern analysis, and the 6th patch adds selftests for these changes. Finally, the 7th patch documents this new feature. This patch (of 7): DAMON can be used for data access pattern aware memory management optimizations. For that, users should run DAMON, read the monitoring results, analyze it, plan a new memory management scheme, and apply the new scheme by themselves. It would not be too hard, but still require some level of effort. For complicated cases, this effort is inevitable. That said, in many cases, users would simply want to apply an actions to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB but having a low access frequency more than 10 minutes", or "Use THP for a memory region larger than 2 MiB having a high access frequency for more than 2 seconds". For such optimizations, users will need to first account the age of each region themselves. To reduce such efforts, this implements a simple age account of each region in DAMON. For each aggregation step, DAMON compares the access frequency with that from last aggregation and reset the age of the region if the change is significant. Else, the age is incremented. Also, in case of the merge of regions, the region size-weighted average of the ages is set as the age of merged new region. 
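As a worked illustration of that size-weighted averaging (the numbers here are made up for the example, not taken from the patch): merging a 4 MiB region of age 3 with a 12 MiB region of age 7 produces a region of age (4 * 3 + 12 * 7) / (4 + 12) = 6, using the same integer arithmetic that damon_merge_two_regions() already applies to nr_accesses.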
Link: https://lkml.kernel.org/r/20211001125604.29660-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211001125604.29660-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rienjes Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ++++++++++ mm/damon/core.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 755d70804705..3e8215debbd4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -31,12 +31,22 @@ struct damon_addr_range { * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. * @list: List head for siblings. + * @age: Age of this region. + * + * @age is initially zero, increased for each aggregation interval, and reset + * to zero again if the access frequency is significantly changed. If two + * regions are merged into a new region, both @nr_accesses and @age of the new + * region are set as region size-weighted average of those of the two regions. */ struct damon_region { struct damon_addr_range ar; unsigned long sampling_addr; unsigned int nr_accesses; struct list_head list; + + unsigned int age; +/* private: Internal value for age calculation. */ + unsigned int last_nr_accesses; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index d993db50280c..3efbe80779db 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -45,6 +45,9 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->nr_accesses = 0; INIT_LIST_HEAD(®ion->list); + region->age = 0; + region->last_nr_accesses = 0; + return region; } @@ -444,6 +447,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(t, r, damon_nr_regions(t)); + r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } } @@ -461,6 +465,7 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); + l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; damon_destroy_region(r, t); } @@ -480,6 +485,11 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { + if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + r->age = 0; + else + r->age++; + if (prev && prev->ar.end == r->ar.start && diff_of(prev->nr_accesses, r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) @@ -527,6 +537,9 @@ static void damon_split_region_at(struct damon_ctx *ctx, r->ar.end = new->ar.start; + new->age = r->age; + new->last_nr_accesses = r->last_nr_accesses; + damon_insert_region(new, r, damon_next_region(r), t); } From 1f366e421c8f69583ed37b56d86e3747331869c3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:22 -0700 Subject: [PATCH 474/512] mm/damon/core: implement DAMON-based Operation Schemes (DAMOS) In many cases, users might use DAMON for simple data access aware memory management optimizations such as applying an operation scheme to a memory region of a specific size having a specific access frequency for a specific time. 
For example, "page out a memory region larger than 100 MiB but having a low access frequency more than 10 minutes", or "Use THP for a memory region larger than 2 MiB having a high access frequency for more than 2 seconds". Most simple form of the solution would be doing offline data access pattern profiling using DAMON and modifying the application source code or system configuration based on the profiling results. Or, developing a daemon constructed with two modules (one for access monitoring and the other for applying memory management actions via mlock(), madvise(), sysctl, etc) is imaginable. To avoid users spending their time for implementation of such simple data access monitoring-based operation schemes, this makes DAMON to handle such schemes directly. With this change, users can simply specify their desired schemes to DAMON. Then, DAMON will automatically apply the schemes to the user-specified target processes. Each of the schemes is composed with conditions for filtering of the target memory regions and desired memory management action for the target. Specifically, the format is:: The filtering conditions are size of memory region, number of accesses to the region monitored by DAMON, and the age of the region. The age of region is incremented periodically but reset when its addresses or access frequency has significantly changed or the action of a scheme was applied. For the action, current implementation supports a few of madvise()-like hints, ``WILLNEED``, ``COLD``, ``PAGEOUT``, ``HUGEPAGE``, and ``NOHUGEPAGE``. Because DAMON supports various address spaces and application of the actions to a monitoring target region is dependent to the type of the target address space, the application code should be implemented by each primitives and registered to the framework. Note that this only implements the framework part. Following commit will implement the action applications for virtual address spaces primitives. Link: https://lkml.kernel.org/r/20211001125604.29660-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 66 +++++++++++++++++++++++++ mm/damon/core.c | 109 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e8215debbd4..dbe18b0fb795 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -69,6 +69,48 @@ struct damon_target { struct list_head list; }; +/** + * enum damos_action - Represents an action of a Data Access Monitoring-based + * Operation Scheme. + * + * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. + * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. + * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. + * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + */ +enum damos_action { + DAMOS_WILLNEED, + DAMOS_COLD, + DAMOS_PAGEOUT, + DAMOS_HUGEPAGE, + DAMOS_NOHUGEPAGE, +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @min_sz_region: Minimum size of target regions. + * @max_sz_region: Maximum size of target regions. 
+ * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. + * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. + * @min_age_region: Minimum age of target regions. + * @max_age_region: Maximum age of target regions. + * @action: &damo_action to be applied to the target regions. + * @list: List head for siblings. + * + * Note that both the minimums and the maximums are inclusive. + */ +struct damos { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; + enum damos_action action; + struct list_head list; +}; + struct damon_ctx; /** @@ -79,6 +121,7 @@ struct damon_ctx; * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. * @@ -104,6 +147,9 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @apply_scheme is called from @kdamond when a region for user provided + * DAMON-based operation scheme is found. It should apply the scheme's action + * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -114,6 +160,8 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; @@ -192,6 +240,7 @@ struct damon_callback { * @min_nr_regions: The minimum number of adaptive monitoring regions. * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. + * @schemes: Head of schemes (&damos) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -213,6 +262,7 @@ struct damon_ctx { unsigned long min_nr_regions; unsigned long max_nr_regions; struct list_head adaptive_targets; + struct list_head schemes; }; #define damon_next_region(r) \ @@ -233,6 +283,12 @@ struct damon_ctx { #define damon_for_each_target_safe(t, next, ctx) \ list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) +#define damon_for_each_scheme(s, ctx) \ + list_for_each_entry(s, &(ctx)->schemes, list) + +#define damon_for_each_scheme_safe(s, next, ctx) \ + list_for_each_entry_safe(s, next, &(ctx)->schemes, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -242,6 +298,14 @@ inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action); +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); +void damon_destroy_scheme(struct damos *s); + struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); @@ -255,6 +319,8 @@ int damon_set_targets(struct damon_ctx *ctx, int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_schemes(struct damon_ctx *ctx, + struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3efbe80779db..0ed97b21cbb6 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -85,6 +85,50 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action) +{ + struct damos *scheme; + + scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); + if (!scheme) + return NULL; + scheme->min_sz_region = min_sz_region; + scheme->max_sz_region = max_sz_region; + scheme->min_nr_accesses = min_nr_accesses; + scheme->max_nr_accesses = max_nr_accesses; + scheme->min_age_region = min_age_region; + scheme->max_age_region = max_age_region; + scheme->action = action; + INIT_LIST_HEAD(&scheme->list); + + return scheme; +} + +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) +{ + list_add_tail(&s->list, &ctx->schemes); +} + +static void damon_del_scheme(struct damos *s) +{ + list_del(&s->list); +} + +static void damon_free_scheme(struct damos *s) +{ + kfree(s); +} + +void damon_destroy_scheme(struct damos *s) +{ + damon_del_scheme(s); + damon_free_scheme(s); +} + /* * Construct a damon_target struct * @@ -156,6 +200,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); + INIT_LIST_HEAD(&ctx->schemes); return ctx; } @@ -175,7 +220,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx) void damon_destroy_ctx(struct damon_ctx *ctx) { + struct damos *s, *next_s; + 
damon_destroy_targets(ctx); + + damon_for_each_scheme_safe(s, next_s, ctx) + damon_destroy_scheme(s); + kfree(ctx); } @@ -250,6 +301,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_set_schemes() - Set data access monitoring based operation schemes. + * @ctx: monitoring context + * @schemes: array of the schemes + * @nr_schemes: number of entries in @schemes + * + * This function should not be called while the kdamond of the context is + * running. + * + * Return: 0 if success, or negative error code otherwise. + */ +int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, + ssize_t nr_schemes) +{ + struct damos *s, *next; + ssize_t i; + + damon_for_each_scheme_safe(s, next, ctx) + damon_destroy_scheme(s); + for (i = 0; i < nr_schemes; i++) + damon_add_scheme(ctx, schemes[i]); + return 0; +} + /** * damon_nr_running_ctxs() - Return number of currently running contexts. */ @@ -453,6 +528,39 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +static void damon_do_apply_schemes(struct damon_ctx *c, + struct damon_target *t, + struct damon_region *r) +{ + struct damos *s; + unsigned long sz; + + damon_for_each_scheme(s, c) { + sz = r->ar.end - r->ar.start; + if (sz < s->min_sz_region || s->max_sz_region < sz) + continue; + if (r->nr_accesses < s->min_nr_accesses || + s->max_nr_accesses < r->nr_accesses) + continue; + if (r->age < s->min_age_region || s->max_age_region < r->age) + continue; + if (c->primitive.apply_scheme) + c->primitive.apply_scheme(c, t, r, s); + r->age = 0; + } +} + +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) + damon_do_apply_schemes(c, t, r); + } +} + #define sz_damon_region(r) (r->ar.end - r->ar.start) /* @@ -693,6 +801,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) set_kdamond_stop(ctx); + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) From 6dea8add4d2875b80843e4a4c8acd334a4db8c8f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:25 -0700 Subject: [PATCH 475/512] mm/damon/vaddr: support DAMON-based Operation Schemes This makes DAMON's default primitives for virtual address spaces to support DAMON-based Operation Schemes (DAMOS) by implementing actions application functions and registering it to the monitoring context. The implementation simply links 'madvise()' for related DAMOS actions. That is, 'madvise(MADV_WILLNEED)' is called for 'WILLNEED' DAMOS action and similar for other actions ('COLD', 'PAGEOUT', 'HUGEPAGE', 'NOHUGEPAGE'). So, the kernel space DAMON users can now use the DAMON-based optimizations with only small amount of code. 
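For example, a kernel-space user could now set up a "page out regions that stayed unaccessed for a while" scheme roughly as below. This is only an illustrative sketch: the function name example_damos_pageout, the pid 1234, and the size/age thresholds are arbitrary placeholders, and cleanup on the error path is omitted.

	#include <linux/damon.h>
	#include <linux/pid.h>

	static int example_damos_pageout(void)
	{
		struct damon_ctx *ctx = damon_new_ctx();
		/* page out regions of at least PAGE_SIZE that kept
		 * nr_accesses == 0 for at least 5 aggregation intervals */
		struct damos *scheme = damon_new_scheme(PAGE_SIZE, ULONG_MAX,
				0, 0, 5, UINT_MAX, DAMOS_PAGEOUT);
		/* hypothetical target: the process having pid 1234 */
		unsigned long target_id = (unsigned long)find_get_pid(1234);

		if (!ctx || !scheme || !target_id)
			return -EINVAL;

		damon_va_set_primitives(ctx);
		damon_set_targets(ctx, &target_id, 1);
		damon_set_schemes(ctx, &scheme, 1);
		return damon_start(&ctx, 1);
	}
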
Link: https://lkml.kernel.org/r/20211001125604.29660-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/vaddr.c | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index dbe18b0fb795..be6b6e81e8ee 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -337,6 +337,8 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); +int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 58c1fb2aafa9..3e1c74d36bab 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "damon-va: " fmt +#include #include #include #include @@ -658,6 +659,60 @@ bool damon_va_target_valid(void *target) return false; } +#ifndef CONFIG_ADVISE_SYSCALLS +static int damos_madvise(struct damon_target *target, struct damon_region *r, + int behavior) +{ + return -EINVAL; +} +#else +static int damos_madvise(struct damon_target *target, struct damon_region *r, + int behavior) +{ + struct mm_struct *mm; + int ret = -ENOMEM; + + mm = damon_get_mm(target); + if (!mm) + goto out; + + ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), + PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + mmput(mm); +out: + return ret; +} +#endif /* CONFIG_ADVISE_SYSCALLS */ + +int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + int madv_action; + + switch (scheme->action) { + case DAMOS_WILLNEED: + madv_action = MADV_WILLNEED; + break; + case DAMOS_COLD: + madv_action = MADV_COLD; + break; + case DAMOS_PAGEOUT: + madv_action = MADV_PAGEOUT; + break; + case DAMOS_HUGEPAGE: + madv_action = MADV_HUGEPAGE; + break; + case DAMOS_NOHUGEPAGE: + madv_action = MADV_NOHUGEPAGE; + break; + default: + pr_warn("Wrong action %d\n", scheme->action); + return -EINVAL; + } + + return damos_madvise(t, r, madv_action); +} + void damon_va_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = damon_va_init; @@ -667,6 +722,7 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.reset_aggregated = NULL; ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = damon_va_apply_scheme; } #include "vaddr-test.h" From af122dd8f3c0099349bc98ff69f0d90efd8b149f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:29 -0700 Subject: [PATCH 476/512] mm/damon/dbgfs: support DAMON-based Operation Schemes This makes 'damon-dbgfs' to support the data access monitoring oriented memory management schemes. Users can read and update the schemes using ``/damon/schemes`` file. 
The format is:: Link: https://lkml.kernel.org/r/20211001125604.29660-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index faee070977d8..78b7a04490c5 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -98,6 +98,159 @@ out: return ret; } +static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damos *s; + int written = 0; + int rc; + + damon_for_each_scheme(s, c) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %u %u %u %u %d\n", + s->min_sz_region, s->max_sz_region, + s->min_nr_accesses, s->max_nr_accesses, + s->min_age_region, s->max_age_region, + s->action); + if (!rc) + return -ENOMEM; + + written += rc; + } + return written; +} + +static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_schemes(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) +{ + ssize_t i; + + for (i = 0; i < nr_schemes; i++) + kfree(schemes[i]); + kfree(schemes); +} + +static bool damos_action_valid(int action) +{ + switch (action) { + case DAMOS_WILLNEED: + case DAMOS_COLD: + case DAMOS_PAGEOUT: + case DAMOS_HUGEPAGE: + case DAMOS_NOHUGEPAGE: + return true; + default: + return false; + } +} + +/* + * Converts a string into an array of struct damos pointers + * + * Returns an array of struct damos pointers that converted if the conversion + * success, or NULL otherwise. 
+ */ +static struct damos **str_to_schemes(const char *str, ssize_t len, + ssize_t *nr_schemes) +{ + struct damos *scheme, **schemes; + const int max_nr_schemes = 256; + int pos = 0, parsed, ret; + unsigned long min_sz, max_sz; + unsigned int min_nr_a, max_nr_a, min_age, max_age; + unsigned int action; + + schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), + GFP_KERNEL); + if (!schemes) + return NULL; + + *nr_schemes = 0; + while (pos < len && *nr_schemes < max_nr_schemes) { + ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u%n", + &min_sz, &max_sz, &min_nr_a, &max_nr_a, + &min_age, &max_age, &action, &parsed); + if (ret != 7) + break; + if (!damos_action_valid(action)) { + pr_err("wrong action %d\n", action); + goto fail; + } + + pos += parsed; + scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, + min_age, max_age, action); + if (!scheme) + goto fail; + + schemes[*nr_schemes] = scheme; + *nr_schemes += 1; + } + return schemes; +fail: + free_schemes_arr(schemes, *nr_schemes); + return NULL; +} + +static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + struct damos **schemes; + ssize_t nr_schemes = 0, ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + schemes = str_to_schemes(kbuf, ret, &nr_schemes); + if (!schemes) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = damon_set_schemes(ctx, schemes, nr_schemes); + if (err) + ret = err; + else + nr_schemes = 0; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + free_schemes_arr(schemes, nr_schemes); +out: + kfree(kbuf); + return ret; +} + static inline bool targetid_is_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid; @@ -279,6 +432,12 @@ static const struct file_operations attrs_fops = { .write = dbgfs_attrs_write, }; +static const struct file_operations schemes_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_schemes_read, + .write = dbgfs_schemes_write, +}; + static const struct file_operations target_ids_fops = { .open = damon_dbgfs_open, .read = dbgfs_target_ids_read, @@ -292,10 +451,10 @@ static const struct file_operations kdamond_pid_fops = { static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids", + const char * const file_names[] = {"attrs", "schemes", "target_ids", "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, - &kdamond_pid_fops}; + const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, + &target_ids_fops, &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) From 2f0b548c9f03a78f4ce6ab48986e3108028936a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:32 -0700 Subject: [PATCH 477/512] mm/damon/schemes: implement statistics feature To tune the DAMON-based operation schemes, knowing how many and how large regions are affected by each of the schemes will be helful. Those stats could be used for not only the tuning, but also monitoring of the working set size and the number of regions, if the scheme does not change the program behavior too much. For the reason, this implements the statistics for the schemes. 
The total number and size of the regions that each scheme is applied are exported to users via '->stat_count' and '->stat_sz' of 'struct damos'. Admins can also check the number by reading 'schemes' debugfs file. The last two integers now represents the stats. To allow collecting the stats without changing the program behavior, this also adds new scheme action, 'DAMOS_STAT'. Note that 'DAMOS_STAT' is not only making no memory operation actions, but also does not reset the age of regions. Link: https://lkml.kernel.org/r/20211001125604.29660-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 +++++++++- mm/damon/core.c | 7 ++++++- mm/damon/dbgfs.c | 5 +++-- mm/damon/vaddr.c | 2 ++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index be6b6e81e8ee..f301bb53381c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -78,6 +78,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_STAT: Do nothing but count the stat. */ enum damos_action { DAMOS_WILLNEED, @@ -85,6 +86,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_STAT, /* Do nothing but only record the stat */ }; /** @@ -96,9 +98,13 @@ enum damos_action { * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. + * @stat_count: Total number of regions that this scheme is applied. + * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. * - * Note that both the minimums and the maximums are inclusive. + * For each aggregation interval, DAMON applies @action to monitoring target + * regions fit in the condition and updates the statistics. Note that both + * the minimums and the maximums are inclusive. 
*/ struct damos { unsigned long min_sz_region; @@ -108,6 +114,8 @@ struct damos { unsigned int min_age_region; unsigned int max_age_region; enum damos_action action; + unsigned long stat_count; + unsigned long stat_sz; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 0ed97b21cbb6..2f6785737902 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -103,6 +103,8 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; + scheme->stat_count = 0; + scheme->stat_sz = 0; INIT_LIST_HEAD(&scheme->list); return scheme; @@ -544,9 +546,12 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; if (r->age < s->min_age_region || s->max_age_region < r->age) continue; + s->stat_count++; + s->stat_sz += sz; if (c->primitive.apply_scheme) c->primitive.apply_scheme(c, t, r, s); - r->age = 0; + if (s->action != DAMOS_STAT) + r->age = 0; } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 78b7a04490c5..28d6abf27763 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -106,11 +106,11 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d\n", + "%lu %lu %u %u %u %u %d %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action); + s->action, s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -159,6 +159,7 @@ static bool damos_action_valid(int action) case DAMOS_PAGEOUT: case DAMOS_HUGEPAGE: case DAMOS_NOHUGEPAGE: + case DAMOS_STAT: return true; default: return false; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3e1c74d36bab..953c145b4f08 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -705,6 +705,8 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_STAT: + return 0; default: pr_warn("Wrong action %d\n", scheme->action); return -EINVAL; From 8d5d4c6359054f3e680e1a2caca50e9b6d688b7d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:36 -0700 Subject: [PATCH 478/512] selftests/damon: add 'schemes' debugfs tests This adds simple selftets for 'schemes' debugfs file of DAMON. 
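For reference when reading the test inputs below: each scheme line carries the seven space-separated fields that str_to_schemes() parses, i.e. min-size, max-size, min-acc, max-acc, min-age, max-age and the action number, so the valid input "1 2 3 4 5 6 4" requests action 4 (DAMOS_NOHUGEPAGE) for regions of 1-2 bytes that saw 3-4 accesses and aged 5-6 aggregation intervals.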
Link: https://lkml.kernel.org/r/20211001125604.29660-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index bfabb19dc0d3..639cfb6a1f65 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -57,6 +57,19 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" echo "$orig_content" > "$file" +# Test schemes file +# ================= + +file="$DBGFS/schemes" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5 6 4" \ + "$orig_content" "valid input" +test_write_fail "$file" "1 2 +3 4 5 6 3" "$orig_content" "multi lines" +test_write_succ "$file" "" "$orig_content" "disabling" +echo "$orig_content" > "$file" + # Test target_ids file # ==================== From 68536f8e01e571f553f78fa058ba543de3834452 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:39 -0700 Subject: [PATCH 479/512] Docs/admin-guide/mm/damon: document DAMON-based Operation Schemes This adds the description of DAMON-based operation schemes in the DAMON documents. Link: https://lkml.kernel.org/r/20211001125604.29660-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 11 +++++ Documentation/admin-guide/mm/damon/usage.rst | 51 +++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index d5eb89a8fc38..51503cf90ca2 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -108,6 +108,17 @@ the results as separate image files. :: You can view the visualizations of this example workload at [1]_. Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. + +Data Access Pattern Aware Memory Management +=========================================== + +Below three commands make every memory region of size >=4K that doesn't +accessed for >=60 seconds in your workload to be swapped out. :: + + $ echo "#min-size max-size min-acc max-acc min-age max-age action" > scheme + $ echo "4K max 0 0 60s max pageout" >> scheme + $ damo schemes -c my_thp_scheme + .. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns .. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html .. 
[3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index a72cda374aba..c0296c14babf 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -34,8 +34,8 @@ the reason, this document describes only the debugfs interface debugfs Interface ================= -DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under -its debugfs directory, ``/damon/``. +DAMON exports four files, ``attrs``, ``target_ids``, ``schemes`` and +``monitor_on`` under its debugfs directory, ``/damon/``. Attributes @@ -74,6 +74,53 @@ check it again:: Note that setting the target ids doesn't start the monitoring. +Schemes +------- + +For usual DAMON-based data access aware memory management optimizations, users +would simply want the system to apply a memory management action to a memory +region of a specific size having a specific access frequency for a specific +time. DAMON receives such formalized operation schemes from the user and +applies those to the target processes. It also counts the total number and +size of regions that each scheme is applied. This statistics can be used for +online analysis or tuning of the schemes. + +Users can get and set the schemes by reading from and writing to ``schemes`` +debugfs file. Reading the file also shows the statistics of each scheme. To +the file, each of the schemes should be represented in each line in below form: + + min-size max-size min-acc max-acc min-age max-age action + +Note that the ranges are closed interval. Bytes for the size of regions +(``min-size`` and ``max-size``), number of monitored accesses per aggregate +interval for access frequency (``min-acc`` and ``max-acc``), number of +aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a +predefined integer for memory management actions should be used. The supported +numbers and their meanings are as below. + + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` + - 1: Call ``madvise()`` for the region with ``MADV_COLD`` + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 5: Do nothing but count the statistics + +You can disable schemes by simply writing an empty string to the file. For +example, below commands applies a scheme saying "If a memory region of size in +[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region", check the entered scheme again, and +finally remove the scheme. :: + + # cd /damon + # echo "4096 8192 0 5 10 20 2" > schemes + # cat schemes + 4096 8192 0 5 10 20 2 0 0 + # echo > schemes + +The last two integers in the 4th line of above example is the total number and +the total size of the regions that the scheme is applied. + + Turning On/Off -------------- From 90bebce9fcd6488ba6b010af3a16a0a0d7e44cb6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:42 -0700 Subject: [PATCH 480/512] mm/damon/dbgfs: allow users to set initial monitoring target regions Patch series "DAMON: Support Physical Memory Address Space Monitoring:. DAMON currently supports only virtual address spaces monitoring. 
It can be easily extended for various use cases and address spaces by configuring its monitoring primitives layer to use appropriate primitives implementations, though. This patchset implements monitoring primitives for the physical address space monitoring using the structure. The first 3 patches allow the user space users manually set the monitoring regions. The 1st patch implements the feature in the 'damon-dbgfs'. Then, patches for adding a unit tests (the 2nd patch) and updating the documentation (the 3rd patch) follow. Following 4 patches implement the physical address space monitoring primitives. The 4th patch makes some primitive functions for the virtual address spaces primitives reusable. The 5th patch implements the physical address space monitoring primitives. The 6th patch links the primitives to the 'damon-dbgfs'. Finally, 7th patch documents this new features. This patch (of 7): Some 'damon-dbgfs' users would want to monitor only a part of the entire virtual memory address space. The program interface users in the kernel space could use '->before_start()' callback or set the regions inside the context struct as they want, but 'damon-dbgfs' users cannot. For that reason, this introduces a new debugfs file called 'init_region'. 'damon-dbgfs' users can specify which initial monitoring target address regions they want by writing special input to the file. The input should describe each region in each line in the below form: Note that the regions will be updated to cover entire memory mapped regions after a 'regions update interval' is passed. If you want the regions to not be updated after the initial setting, you could set the interval as a very long time, say, a few decades. Link: https://lkml.kernel.org/r/20211012205711.29216-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211012205711.29216-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rienjes Cc: Shakeel Butt Cc: Shuah Khan Cc: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 28d6abf27763..1cce53cd241d 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -394,6 +394,152 @@ out: return ret; } +static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r; + int written = 0; + int rc; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %lu\n", + t->id, r->ar.start, r->ar.end); + if (!rc) + return -ENOMEM; + written += rc; + } + } + return written; +} + +static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + mutex_unlock(&ctx->kdamond_lock); + len = -EBUSY; + goto out; + } + + len = sprint_init_regions(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int add_init_region(struct 
damon_ctx *c, + unsigned long target_id, struct damon_addr_range *ar) +{ + struct damon_target *t; + struct damon_region *r, *prev; + unsigned long id; + int rc = -EINVAL; + + if (ar->start >= ar->end) + return -EINVAL; + + damon_for_each_target(t, c) { + id = t->id; + if (targetid_is_pid(c)) + id = (unsigned long)pid_vnr((struct pid *)id); + if (id == target_id) { + r = damon_new_region(ar->start, ar->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + rc = 0; + } + } + return rc; +} + +static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r, *next; + int pos = 0, parsed, ret; + unsigned long target_id; + struct damon_addr_range ar; + int err; + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + while (pos < len) { + ret = sscanf(&str[pos], "%lu %lu %lu%n", + &target_id, &ar.start, &ar.end, &parsed); + if (ret != 3) + break; + err = add_init_region(c, target_id, &ar); + if (err) + goto fail; + pos += parsed; + } + + return 0; + +fail: + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + return err; +} + +static ssize_t dbgfs_init_regions_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = set_init_regions(ctx, kbuf, ret); + if (err) + ret = err; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(kbuf); + return ret; +} + static ssize_t dbgfs_kdamond_pid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -445,6 +591,12 @@ static const struct file_operations target_ids_fops = { .write = dbgfs_target_ids_write, }; +static const struct file_operations init_regions_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_init_regions_read, + .write = dbgfs_init_regions_write, +}; + static const struct file_operations kdamond_pid_fops = { .open = damon_dbgfs_open, .read = dbgfs_kdamond_pid_read, @@ -453,9 +605,9 @@ static const struct file_operations kdamond_pid_fops = { static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { const char * const file_names[] = {"attrs", "schemes", "target_ids", - "kdamond_pid"}; + "init_regions", "kdamond_pid"}; const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, - &target_ids_fops, &kdamond_pid_fops}; + &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) From 1c2e11bfa649cc07e6322b0e5ea3cdbada9c43c3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:46 -0700 Subject: [PATCH 481/512] mm/damon/dbgfs-test: add a unit test case for 'init_regions' This adds another test case for the new feature, 'init_regions'. 
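The test strings below follow the three-field per-line format that set_init_regions() of the previous patch parses with sscanf(), namely the target id, the start address, and the end address of one region. Valid inputs are read back grouped per target, while lines naming an unknown target, overlapping a previous region, or not sorted by address are rejected with -EINVAL.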
Link: https://lkml.kernel.org/r/20211012205711.29216-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Brendan Higgins Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs-test.h | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 4eddcfa73996..104b22957616 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) dbgfs_destroy_ctx(ctx); } +static void damon_dbgfs_test_set_init_regions(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", + "2 10 20\n", + "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + ""}; + /* Reading the file again will show sorted, clean output */ + char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", + "2 10 20\n", + "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + ""}; + char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ + "2 10 20\n 2 14 26\n", /* regions overlap */ + "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char *input, *expect; + int i, rc; + char buf[256]; + + damon_set_targets(ctx, ids, 3); + + /* Put valid inputs and check the results */ + for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { + input = valid_inputs[i]; + expect = valid_expects[i]; + + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, 0); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, expect); + } + /* Put invlid inputs and check the return error code */ + for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { + input = invalid_inputs[i]; + pr_info("input: %s\n", input); + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, -EINVAL); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, ""); + } + + damon_set_targets(ctx, NULL, 0); + damon_destroy_ctx(ctx); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), KUNIT_CASE(damon_dbgfs_test_set_targets), + KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, }; From c2fe4987ed31c32591c9aea5a1e8e2540ce66e12 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:49 -0700 Subject: [PATCH 482/512] Docs/admin-guide/mm/damon: document 'init_regions' feature This adds description of the 'init_regions' feature in the DAMON usage document. 
Link: https://lkml.kernel.org/r/20211012205711.29216-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 41 +++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index c0296c14babf..f7d5cfbb50c2 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -34,8 +34,9 @@ the reason, this document describes only the debugfs interface debugfs Interface ================= -DAMON exports four files, ``attrs``, ``target_ids``, ``schemes`` and -``monitor_on`` under its debugfs directory, ``/damon/``. +DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes`` and ``monitor_on`` under its debugfs directory, +``/damon/``. Attributes @@ -74,6 +75,42 @@ check it again:: Note that setting the target ids doesn't start the monitoring. +Initial Monitoring Target Regions +--------------------------------- + +In case of the debugfs based monitoring, DAMON automatically sets and updates +the monitoring target regions so that entire memory mappings of target +processes can be covered. However, users can want to limit the monitoring +region to specific address ranges, such as the heap, the stack, or specific +file-mapped area. Or, some users can know the initial access pattern of their +workloads and therefore want to set optimal initial regions for the 'adaptive +regions adjustment'. + +In such cases, users can explicitly set the initial monitoring target regions +as they want, by writing proper values to the ``init_regions`` file. Each line +of the input should represent one region in below form.:: + + + +The ``target id`` should already in ``target_ids`` file, and the regions should +be passed in address order. For example, below commands will set a couple of +address ranges, ``1-100`` and ``100-200`` as the initial monitoring target +region of process 42, and another couple of address ranges, ``20-40`` and +``50-100`` as that of process 4242.:: + + # cd /damon + # echo "42 1 100 + 42 100 200 + 4242 20 40 + 4242 50 100" > init_regions + +Note that this sets the initial monitoring target regions only. In case of +virtual memory monitoring, DAMON will automatically updates the boundary of the +regions after one ``regions update interval``. Therefore, users should set the +``regions update interval`` large enough in this case, if they don't want the +update. + + Schemes ------- From 46c3a0accdc48c86928157fd073e66807f338485 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:53 -0700 Subject: [PATCH 483/512] mm/damon/vaddr: separate commonly usable functions This moves functions in the default virtual address spaces monitoring primitives that commonly usable from other address spaces like physical address space into a header file. Those will be reused by the physical address space monitoring primitives which will be implemented by the following commit. 
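As a rough sketch of how a primitives implementation is expected to consume the moved helpers, a page table walk callback could clear the accessed bit of each present PTE via damon_ptep_mkold(). The callback name and the surrounding walk setup below are illustrative only and are not part of this patch; the real virtual address space implementation does this from its own walker.

	static int example_mkold_pte_entry(pte_t *pte, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
	{
		/* clear the accessed bit and mark the page idle so that the
		 * next check_accesses step can tell whether the region was
		 * touched in between */
		if (pte_present(*pte))
			damon_ptep_mkold(pte, walk->mm, addr);
		return 0;
	}
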
[sj@kernel.org: include 'highmem.h' to fix a build failure] Link: https://lkml.kernel.org/r/20211014110848.5204-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211012205711.29216-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Makefile | 2 +- mm/damon/prmtv-common.c | 87 +++++++++++++++++++++++++++++++++++++++ mm/damon/prmtv-common.h | 17 ++++++++ mm/damon/vaddr.c | 90 ++--------------------------------------- 4 files changed, 109 insertions(+), 87 deletions(-) create mode 100644 mm/damon/prmtv-common.c create mode 100644 mm/damon/prmtv-common.h diff --git a/mm/damon/Makefile b/mm/damon/Makefile index fed4be3bace3..99b1bfe01ff5 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c new file mode 100644 index 000000000000..7e62ee54fb54 --- /dev/null +++ b/mm/damon/prmtv-common.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include +#include +#include +#include + +#include "prmtv-common.h" + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. 
+ */ +struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h new file mode 100644 index 000000000000..7093d19e5d42 --- /dev/null +++ b/mm/damon/prmtv-common.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include +#include + +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + +struct page *damon_get_page(unsigned long pfn); + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 953c145b4f08..f0aef35602da 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -8,25 +8,19 @@ #define pr_fmt(fmt) "damon-va: " fmt #include -#include -#include -#include -#include #include +#include +#include #include #include -#include -#include -#include + +#include "prmtv-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - /* * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. @@ -373,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx) } } -/* - * Get an online page for a pfn if it's in the LRU list. Otherwise, returns - * NULL. - * - * The body of this function is stolen from the 'page_idle_get_page()'. We - * steal rather than reuse it because the code is quite simple. 
- */ -static struct page *damon_get_page(unsigned long pfn) -{ - struct page *page = pfn_to_online_page(pfn); - - if (!page || !PageLRU(page) || !get_page_unless_zero(page)) - return NULL; - - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; - } - return page; -} - -static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, - unsigned long addr) -{ - bool referenced = false; - struct page *page = damon_get_page(pte_pfn(*pte)); - - if (!page) - return; - - if (pte_young(*pte)) { - referenced = true; - *pte = pte_mkold(*pte); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -} - -static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, - unsigned long addr) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; - struct page *page = damon_get_page(pmd_pfn(*pmd)); - - if (!page) - return; - - if (pmd_young(*pmd)) { - referenced = true; - *pmd = pmd_mkold(*pmd); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, - addr + ((1UL) << HPAGE_PMD_SHIFT))) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -} - static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { From a28397beb55b68bd0f15c6778540e8ae1bc26d21 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:56 -0700 Subject: [PATCH 484/512] mm/damon: implement primitives for physical address space monitoring This implements the monitoring primitives for the physical memory address space. Internally, it uses the PTE Accessed bit, similar to that of the virtual address spaces monitoring primitives. It supports only user memory pages, as idle pages tracking does. If the monitoring target physical memory address range contains non-user memory pages, access check of the pages will do nothing but simply treat the pages as not accessed. 
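For orientation, below is a minimal in-kernel sketch of how these primitives could be selected for a monitoring context. It only uses calls that appear elsewhere in this series (damon_new_ctx(), damon_set_targets(), damon_destroy_ctx()) plus the damon_pa_set_primitives() added here; the target id value is illustrative, since ids carry no meaning for the physical address space.

#include <linux/damon.h>

/* Sketch: set up a DAMON context that monitors the physical address space. */
static struct damon_ctx *example_paddr_ctx(void)
{
	struct damon_ctx *ctx = damon_new_ctx();
	unsigned long id = 42;	/* ids are meaningless for paddr monitoring */

	if (!ctx)
		return NULL;

	/* Install the physical address space primitives from this patch. */
	damon_pa_set_primitives(ctx);

	if (damon_set_targets(ctx, &id, 1)) {
		damon_destroy_ctx(ctx);
		return NULL;
	}

	/*
	 * Unlike the vaddr case, no monitoring regions are constructed
	 * automatically; the caller (or the 'init_regions' debugfs file
	 * added earlier in this series) must set the physical address
	 * ranges to monitor.
	 */
	return ctx;
}
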
Link: https://lkml.kernel.org/r/20211012205711.29216-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ++ mm/damon/Kconfig | 8 ++ mm/damon/Makefile | 1 + mm/damon/paddr.c | 224 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 243 insertions(+) create mode 100644 mm/damon/paddr.c diff --git a/include/linux/damon.h b/include/linux/damon.h index f301bb53381c..715dadd21f7c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -351,4 +351,14 @@ void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ +#ifdef CONFIG_DAMON_PADDR + +/* Monitoring primitives for the physical memory address space */ +void damon_pa_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); +bool damon_pa_target_valid(void *t); +void damon_pa_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_PADDR */ + #endif /* _DAMON_H */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index ba8898c7eb8e..2a5923be631e 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -32,6 +32,14 @@ config DAMON_VADDR This builds the default data access monitoring primitives for DAMON that work for virtual address spaces. +config DAMON_PADDR + bool "Data access monitoring primitives for the physical address space" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring primitives for DAMON + that works for the physical address space. 
+ config DAMON_VADDR_KUNIT_TEST bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS depends on DAMON_VADDR && KUNIT=y diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 99b1bfe01ff5..8d9b0df79702 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -2,4 +2,5 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c new file mode 100644 index 000000000000..d7a2ecd09ed0 --- /dev/null +++ b/mm/damon/paddr.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for The Physical Address Space + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-pa: " fmt + +#include +#include +#include +#include + +#include "prmtv-common.h" + +static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr); + } + return true; +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct rmap_walk_control rwc = { + .rmap_one = __damon_pa_mkold, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return; + + if (!page_mapped(page) || !page_rmapping(page)) { + set_page_idle(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + goto out; + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + +out: + put_page(page); +} + +static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, + struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_pa_mkold(r->sampling_addr); +} + +void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + __damon_pa_prepare_access_check(ctx, r); + } +} + +struct damon_pa_access_chk_result { + unsigned long page_sz; + bool accessed; +}; + +static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct damon_pa_access_chk_result *result = arg; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + result->accessed = false; + result->page_sz = PAGE_SIZE; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + result->accessed = pte_young(*pvmw.pte) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + result->accessed = pmd_young(*pvmw.pmd) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (result->accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return !result->accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct damon_pa_access_chk_result result = { + .page_sz = PAGE_SIZE, + .accessed = false, + }; + struct rmap_walk_control rwc = { + .arg = &result, 
+ .rmap_one = __damon_pa_young, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return false; + + if (!page_mapped(page) || !page_rmapping(page)) { + if (page_is_idle(page)) + result.accessed = false; + else + result.accessed = true; + put_page(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) { + put_page(page); + return NULL; + } + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + put_page(page); + +out: + *page_sz = result.page_sz; + return result.accessed; +} + +static void __damon_pa_check_access(struct damon_ctx *ctx, + struct damon_region *r) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + __damon_pa_check_access(ctx, r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + } + + return max_nr_accesses; +} + +bool damon_pa_target_valid(void *t) +{ + return true; +} + +void damon_pa_set_primitives(struct damon_ctx *ctx) +{ + ctx->primitive.init = NULL; + ctx->primitive.update = NULL; + ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks; + ctx->primitive.check_accesses = damon_pa_check_accesses; + ctx->primitive.reset_aggregated = NULL; + ctx->primitive.target_valid = damon_pa_target_valid; + ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = NULL; +} From c026291ab88f02247999959d01182cb8eb6e6a5b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:00 -0700 Subject: [PATCH 485/512] mm/damon/dbgfs: support physical memory monitoring This makes the 'damon-dbgfs' to support the physical memory monitoring, in addition to the virtual memory monitoring. Users can do the physical memory monitoring by writing a special keyword, 'paddr' to the 'target_ids' debugfs file. Then, DAMON will check the special keyword and configure the monitoring context to run with the primitives for the physical address space. Unlike the virtual memory monitoring, the monitoring target region will not be automatically set. Therefore, users should also set the monitoring target address region using the 'init_regions' debugfs file. Also, note that the physical memory monitoring will not automatically terminated. The user should explicitly turn off the monitoring by writing 'off' to the 'monitor_on' debugfs file. 
Link: https://lkml.kernel.org/r/20211012205711.29216-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 2 +- mm/damon/dbgfs.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 2a5923be631e..ca33b289ebbe 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -54,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST config DAMON_DBGFS bool "DAMON debugfs interface" - depends on DAMON_VADDR && DEBUG_FS + depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help This builds the debugfs interface for DAMON. The user space admins can use the interface for arbitrary data access monitoring. diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1cce53cd241d..38188347d8ab 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -339,6 +339,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; + bool id_is_pid = true; char *kbuf, *nrs; unsigned long *targets; ssize_t nr_targets; @@ -351,6 +352,11 @@ static ssize_t dbgfs_target_ids_write(struct file *file, return PTR_ERR(kbuf); nrs = kbuf; + if (!strncmp(kbuf, "paddr\n", count)) { + id_is_pid = false; + /* target id is meaningless here, but we set it just for fun */ + scnprintf(kbuf, count, "42 "); + } targets = str_to_target_ids(nrs, ret, &nr_targets); if (!targets) { @@ -358,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, goto out; } - if (targetid_is_pid(ctx)) { + if (id_is_pid) { for (i = 0; i < nr_targets; i++) { targets[i] = (unsigned long)find_get_pid( (int)targets[i]); @@ -372,15 +378,24 @@ static ssize_t dbgfs_target_ids_write(struct file *file, mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { - if (targetid_is_pid(ctx)) + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); ret = -EBUSY; goto unlock_out; } + /* remove targets with previously-set primitive */ + damon_set_targets(ctx, NULL, 0); + + /* Configure the context for the address space type */ + if (id_is_pid) + damon_va_set_primitives(ctx); + else + damon_pa_set_primitives(ctx); + err = damon_set_targets(ctx, targets, nr_targets); if (err) { - if (targetid_is_pid(ctx)) + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); ret = err; } From c638072107f52ec35f292c97b6f3df9b9f2ed87d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:03 -0700 Subject: [PATCH 486/512] Docs/DAMON: document physical memory monitoring support This updates the DAMON documents for the physical memory address space monitoring support. 
Link: https://lkml.kernel.org/r/20211012205711.29216-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 25 +++++++++++++---- Documentation/vm/damon/design.rst | 29 ++++++++++++-------- Documentation/vm/damon/faq.rst | 5 ++-- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index f7d5cfbb50c2..ed96bbf0daff 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -10,15 +10,16 @@ DAMON provides below three interfaces for different users. This is for privileged people such as system administrators who want a just-working human-friendly interface. Using this, users can use the DAMON’s major features in a human-friendly way. It may not be highly tuned for - special cases, though. It supports only virtual address spaces monitoring. + special cases, though. It supports both virtual and physical address spaces + monitoring. - *debugfs interface.* This is for privileged user space programmers who want more optimized use of DAMON. Using this, users can use DAMON’s major features by reading from and writing to special debugfs files. Therefore, you can write and use your personalized DAMON debugfs wrapper programs that reads/writes the debugfs files instead of you. The DAMON user space tool is also a reference - implementation of such programs. It supports only virtual address spaces - monitoring. + implementation of such programs. It supports both virtual and physical + address spaces monitoring. - *Kernel Space Programming Interface.* This is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space @@ -72,20 +73,34 @@ check it again:: # cat target_ids 42 4242 +Users can also monitor the physical memory address space of the system by +writing a special keyword, "``paddr\n``" to the file. Because physical address +space monitoring doesn't support multiple targets, reading the file will show a +fake value, ``42``, as below:: + + # cd /damon + # echo paddr > target_ids + # cat target_ids + 42 + Note that setting the target ids doesn't start the monitoring. Initial Monitoring Target Regions --------------------------------- -In case of the debugfs based monitoring, DAMON automatically sets and updates -the monitoring target regions so that entire memory mappings of target +In case of the virtual address space monitoring, DAMON automatically sets and +updates the monitoring target regions so that entire memory mappings of target processes can be covered. However, users can want to limit the monitoring region to specific address ranges, such as the heap, the stack, or specific file-mapped area. Or, some users can know the initial access pattern of their workloads and therefore want to set optimal initial regions for the 'adaptive regions adjustment'. +In contrast, DAMON do not automatically sets and updates the monitoring target +regions in case of physical memory monitoring. Therefore, users should set the +monitoring target regions by themselves. 
+ In such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the ``init_regions`` file. Each line of the input should represent one region in below form.:: diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst index b05159c295f4..210f0f50efd8 100644 --- a/Documentation/vm/damon/design.rst +++ b/Documentation/vm/damon/design.rst @@ -35,13 +35,17 @@ two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. -DAMON currently provides the implementation of the primitives for only the -virtual address spaces. Below two subsections describe how it works. +DAMON currently provides the implementations of the primitives for the physical +and virtual address spaces. Below two subsections describe how those work. VMA-based Target Address Range Construction ------------------------------------------- +This is only for the virtual address space primitives implementation. That for +the physical address space simply asks users to manually set the monitoring +target address ranges. + Only small parts in the super-huge virtual address space of the processes are mapped to the physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some @@ -71,15 +75,18 @@ to make a reasonable trade-off. Below shows this in detail:: PTE Accessed-bit Based Access Check ----------------------------------- -The implementation for the virtual address space uses PTE Accessed-bit for -basic access checks. It finds the relevant PTE Accessed bit from the address -by walking the page table for the target task of the address. In this way, the -implementation finds and clears the bit for next sampling target address and -checks whether the bit set again after one sampling period. This could disturb -other kernel subsystems using the Accessed bits, namely Idle page tracking and -the reclaim logic. To avoid such disturbances, DAMON makes it mutually -exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page -flags to solve the conflict with the reclaim logic, as Idle page tracking does. +Both of the implementations for physical and virtual address spaces use PTE +Accessed-bit for basic access checks. Only one difference is the way of +finding the relevant PTE Accessed bit(s) from the address. While the +implementation for the virtual address walks the page table for the target task +of the address, the implementation for the physical address walks every page +table having a mapping to the address. In this way, the implementations find +and clear the bit(s) for next sampling target address and checks whether the +bit(s) set again after one sampling period. This could disturb other kernel +subsystems using the Accessed bits, namely Idle page tracking and the reclaim +logic. To avoid such disturbances, DAMON makes it mutually exclusive with Idle +page tracking and uses ``PG_idle`` and ``PG_young`` page flags to solve the +conflict with the reclaim logic, as Idle page tracking does. Address Space Independent Core Mechanisms diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst index cb3d8b585a8b..11aea40eb328 100644 --- a/Documentation/vm/damon/faq.rst +++ b/Documentation/vm/damon/faq.rst @@ -36,10 +36,9 @@ constructions and actual access checks can be implemented and configured on the DAMON core by the users. 
In this way, DAMON users can monitor any address space with any access check technique. -Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based +Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based implementations of the address space dependent functions for the virtual memory -by default, for a reference and convenient use. In near future, we will -provide those for physical memory address space. +and the physical memory by default, for a reference and convenient use. Can I simply monitor page granularity? From 199b50f4c9485c46c2403d8b3e0eca90ec401ed6 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 5 Nov 2021 13:47:07 -0700 Subject: [PATCH 487/512] mm/damon/vaddr: constify static mm_walk_ops The only usage of these structs is to pass their addresses to walk_page_range(), which takes a pointer to const mm_walk_ops as argument. Make them const to allow the compiler to put them in read-only memory. Link: https://lkml.kernel.org/r/20211014075042.17174-2-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Reviewed-by: SeongJae Park Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f0aef35602da..758501b8d97d 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -394,7 +394,7 @@ out: return 0; } -static struct mm_walk_ops damon_mkold_ops = { +static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, }; @@ -490,7 +490,7 @@ out: return 0; } -static struct mm_walk_ops damon_young_ops = { +static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, }; From 9210622ab81f7e722da7563166d93b2a028a79d4 Mon Sep 17 00:00:00 2001 From: Rongwei Wang Date: Fri, 5 Nov 2021 13:47:09 -0700 Subject: [PATCH 488/512] mm/damon/dbgfs: remove unnecessary variables In some functions, it's unnecessary to declare 'err' and 'ret' variables at the same time. This patch mainly to simplify the issue of such declarations by reusing one variable. 
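The pattern being simplified, shown in isolation (a minimal sketch; do_work() is a placeholder helper, not a function from this file):

/* Placeholder standing in for e.g. damon_set_attrs(). */
static int do_work(void) { return 0; }

/* Before: a separate 'err' exists only to be copied into 'ret' on failure. */
static ssize_t write_before(size_t count)
{
	ssize_t ret = count;
	int err;

	err = do_work();
	if (err)
		ret = err;
	return ret;
}

/* After: 'ret' carries the error code directly, or 'count' on success. */
static ssize_t write_after(size_t count)
{
	ssize_t ret;

	ret = do_work();
	if (!ret)
		ret = count;
	return ret;
}
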
Link: https://lkml.kernel.org/r/20211014073014.35754-1-sj@kernel.org Signed-off-by: Rongwei Wang Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 66 +++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 38188347d8ab..c90988a20fa4 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, struct damon_ctx *ctx = file->private_data; unsigned long s, a, r, minr, maxr; char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -88,9 +87,9 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - err = damon_set_attrs(ctx, s, a, r, minr, maxr); - if (err) - ret = err; + ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (!ret) + ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); out: @@ -220,14 +219,13 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, struct damon_ctx *ctx = file->private_data; char *kbuf; struct damos **schemes; - ssize_t nr_schemes = 0, ret = count; - int err; + ssize_t nr_schemes = 0, ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - schemes = str_to_schemes(kbuf, ret, &nr_schemes); + schemes = str_to_schemes(kbuf, count, &nr_schemes); if (!schemes) { ret = -EINVAL; goto out; @@ -239,11 +237,12 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - err = damon_set_schemes(ctx, schemes, nr_schemes); - if (err) - ret = err; - else + ret = damon_set_schemes(ctx, schemes, nr_schemes); + if (!ret) { + ret = count; nr_schemes = 0; + } + unlock_out: mutex_unlock(&ctx->kdamond_lock); free_schemes_arr(schemes, nr_schemes); @@ -343,9 +342,8 @@ static ssize_t dbgfs_target_ids_write(struct file *file, char *kbuf, *nrs; unsigned long *targets; ssize_t nr_targets; - ssize_t ret = count; + ssize_t ret; int i; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -358,7 +356,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, ret, &nr_targets); + targets = str_to_target_ids(nrs, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; @@ -393,11 +391,12 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - err = damon_set_targets(ctx, targets, nr_targets); - if (err) { + ret = damon_set_targets(ctx, targets, nr_targets); + if (ret) { if (id_is_pid) dbgfs_put_pids(targets, nr_targets); - ret = err; + } else { + ret = count; } unlock_out: @@ -715,8 +714,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file, { char *kbuf; char *ctx_name; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -734,9 +732,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_mk_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_mk_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -805,8 +803,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; char *ctx_name; kbuf = user_input_str(buf, count, ppos); @@ -825,9 +822,9 @@ static ssize_t dbgfs_rm_context_write(struct 
file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_rm_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_rm_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -851,9 +848,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file, static ssize_t dbgfs_monitor_on_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - ssize_t ret = count; + ssize_t ret; char *kbuf; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -866,14 +862,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, } if (!strncmp(kbuf, "on", count)) - err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); else if (!strncmp(kbuf, "off", count)) - err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); else - err = -EINVAL; + ret = -EINVAL; - if (err) - ret = err; + if (!ret) + ret = count; kfree(kbuf); return ret; } From 57223ac295845b1d72ec1bd02b5fab992b77a021 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:13 -0700 Subject: [PATCH 489/512] mm/damon/paddr: support the pageout scheme Introduction ============ This patchset 1) makes the engine for general data access pattern-oriented memory management (DAMOS) be more useful for production environments, and 2) implements a static kernel module for lightweight proactive reclamation using the engine. Proactive Reclamation --------------------- On general memory over-committed systems, proactively reclaiming cold pages helps saving memory and reducing latency spikes that incurred by the direct reclaim or the CPU consumption of kswapd, while incurring only minimal performance degradation[2]. A Free Pages Reporting[8] based memory over-commit virtualization system would be one more specific use case. In the system, the guest VMs reports their free memory to host, and the host reallocates the reported memory to other guests. As a result, the system's memory utilization can be maximized. However, the guests could be not so memory-frugal, because some kernel subsystems and user-space applications are designed to use as much memory as available. Then, guests would report only small amount of free memory to host, results in poor memory utilization. Running the proactive reclamation in such guests could help mitigating this problem. Google has also implemented this idea and using it in their data center. They further proposed upstreaming it in LSFMM'19, and "the general consensus was that, while this sort of proactive reclaim would be useful for a number of users, the cost of this particular solution was too high to consider merging it upstream"[3]. The cost mainly comes from the coldness tracking. Roughly speaking, the implementation periodically scans the 'Accessed' bit of each page. For the reason, the overhead linearly increases as the size of the memory and the scanning frequency grows. As a result, Google is known to dedicating one CPU for the work. That's a reasonable option to someone like Google, but it wouldn't be so to some others. DAMON and DAMOS: An engine for data access pattern-oriented memory management ----------------------------------------------------------------------------- DAMON[4] is a framework for general data access monitoring. Its adaptive monitoring overhead control feature minimizes its monitoring overhead. It also let the upper-bound of the overhead be configurable by clients, regardless of the size of the monitoring target memory. 
While monitoring 70 GiB memory of a production system every 5 milliseconds, it consumes less than 1% single CPU time. For this, it could sacrify some of the quality of the monitoring results. Nevertheless, the lower-bound of the quality is configurable, and it uses a best-effort algorithm for better quality. Our test results[5] show the quality is practical enough. From the production system monitoring, we were able to find a 4 KiB region in the 70 GiB memory that shows highest access frequency. We normally don't monitor the data access pattern just for fun but to improve something like memory management. Proactive reclamation is one such usage. For such general cases, DAMON provides a feature called DAMon-based Operation Schemes (DAMOS)[6]. It makes DAMON an engine for general data access pattern oriented memory management. Using this, clients can ask DAMON to find memory regions of specific data access pattern and apply some memory management action (e.g., page out, move to head of the LRU list, use huge page, ...). We call the request 'scheme'. Proactive Reclamation on top of DAMON/DAMOS ------------------------------------------- Therefore, by using DAMON for the cold pages detection, the proactive reclamation's monitoring overhead issue can be solved. Actually, we previously implemented a version of proactive reclamation using DAMOS and achieved noticeable improvements with our evaluation setup[5]. Nevertheless, it more for a proof-of-concept, rather than production uses. It supports only virtual address spaces of processes, and require additional tuning efforts for given workloads and the hardware. For the tuning, we introduced a simple auto-tuning user space tool[8]. Google is also known to using a ML-based similar approach for their fleets[2]. But, making it just works with intuitive knobs in the kernel would be helpful for general users. To this end, this patchset improves DAMOS to be ready for such production usages, and implements another version of the proactive reclamation, namely DAMON_RECLAIM, on top of it. DAMOS Improvements: Aggressiveness Control, Prioritization, and Watermarks -------------------------------------------------------------------------- First of all, the current version of DAMOS supports only virtual address spaces. This patchset makes it supports the physical address space for the page out action. Next major problem of the current version of DAMOS is the lack of the aggressiveness control, which can results in arbitrary overhead. For example, if huge memory regions having the data access pattern of interest are found, applying the requested action to all of the regions could incur significant overhead. It can be controlled by tuning the target data access pattern with manual or automated approaches[2,7]. But, some people would prefer the kernel to just work with only intuitive tuning or default values. For such cases, this patchset implements a safeguard, namely time/size quota. Using this, the clients can specify up to how much time can be used for applying the action, and/or up to how much memory regions the action can be applied within a user-specified time duration. A followup question is, to which memory regions should the action applied within the limits? We implement a simple regions prioritization mechanism for each action and make DAMOS to apply the action to high priority regions first. It also allows clients tune the prioritization mechanism to use different weights for size, access frequency, and age of memory regions. 
This means we could use not only LRU but also LFU or some fancy algorithms like CAR[9] with lightweight overhead. Though DAMON is lightweight, someone would want to remove even the cold pages monitoring overhead when it is unnecessary. Currently, it should manually turned on and off by clients, but some clients would simply want to turn it on and off based on some metrics like free memory ratio or memory fragmentation. For such cases, this patchset implements a watermarks-based automatic activation feature. It allows the clients configure the metric of their interest, and three watermarks of the metric. If the metric is higher than the high watermark or lower than the low watermark, the scheme is deactivated. If the metric is lower than the mid watermark but higher than the low watermark, the scheme is activated. DAMON-based Reclaim ------------------- Using the improved version of DAMOS, this patchset implements a static kernel module called 'damon_reclaim'. It finds memory regions that didn't accessed for specific time duration and page out. Consuming too much CPU for the paging out operations, or doing pageout too frequently can be critical for systems configuring their swap devices with software-defined in-memory block devices like zram/zswap or total number of writes limited devices like SSDs, respectively. To avoid the problems, the time/size quotas can be configured. Under the quotas, it pages out memory regions that didn't accessed longer first. Also, to remove the monitoring overhead under peaceful situation, and to fall back to the LRU-list based page granularity reclamation when it doesn't make progress, the three watermarks based activation mechanism is used, with the free memory ratio as the watermark metric. For convenient configurations, it provides several module parameters. Using these, sysadmins can enable/disable it, and tune its parameters including the coldness identification time threshold, the time/size quotas and the three watermarks. Evaluation ========== In short, DAMON_RECLAIM with 50ms/s time quota and regions prioritization on v5.15-rc5 Linux kernel with ZRAM swap device achieves 38.58% memory saving with only 1.94% runtime overhead. For this, DAMON_RECLAIM consumes only 4.97% of single CPU time. Setup ----- We evaluate DAMON_RECLAIM to show how each of the DAMOS improvements make effect. For this, we measure DAMON_RECLAIM's CPU consumption, entire system memory footprint, total number of major page faults, and runtime of 24 realistic workloads in PARSEC3 and SPLASH-2X benchmark suites on my QEMU/KVM based virtual machine. The virtual machine runs on an i3.metal AWS instance, has 130GiB memory, and runs a linux kernel built on latest -mm tree[1] plus this patchset. It also utilizes a 4 GiB ZRAM swap device. We repeats the measurement 5 times and use averages. [1] https://github.com/hnaz/linux-mm/tree/v5.15-rc5-mmots-2021-10-13-19-55 Detailed Results ---------------- The results are summarized in the below table. With coldness identification threshold of 5 seconds, DAMON_RECLAIM without the time quota-based speed limit achieves 47.21% memory saving, but incur 4.59% runtime slowdown to the workloads on average. For this, DAMON_RECLAIM consumes about 11.28% single CPU time. Applying time quotas of 200ms/s, 50ms/s, and 10ms/s without the regions prioritization reduces the slowdown to 4.89%, 2.65%, and 1.5%, respectively. 
Time quota of 200ms/s (20%) makes no real change compared to the quota unapplied version, because the quota unapplied version consumes only 11.28% CPU time. DAMON_RECLAIM's CPU utilization also similarly reduced: 11.24%, 5.51%, and 2.01% of single CPU time. That is, the overhead is proportional to the speed limit. Nevertheless, it also reduces the memory saving because it becomes less aggressive. In detail, the three variants show 48.76%, 37.83%, and 7.85% memory saving, respectively. Applying the regions prioritization (page out regions that not accessed longer first within the time quota) further reduces the performance degradation. Runtime slowdowns and total number of major page faults increase has been 4.89%/218,690% -> 4.39%/166,136% (200ms/s), 2.65%/111,886% -> 1.94%/59,053% (50ms/s), and 1.5%/34,973.40% -> 2.08%/8,781.75% (10ms/s). The runtime under 10ms/s time quota has increased with prioritization, but apparently that's under the margin of error. time quota prioritization memory_saving cpu_util slowdown pgmajfaults overhead N N 47.21% 11.28% 4.59% 194,802% 200ms/s N 48.76% 11.24% 4.89% 218,690% 50ms/s N 37.83% 5.51% 2.65% 111,886% 10ms/s N 7.85% 2.01% 1.5% 34,793.40% 200ms/s Y 50.08% 10.38% 4.39% 166,136% 50ms/s Y 38.58% 4.97% 1.94% 59,053% 10ms/s Y 3.63% 1.73% 2.08% 8,781.75% Baseline and Complete Git Trees =============================== The patches are based on the latest -mm tree (v5.15-rc5-mmots-2021-10-13-19-55). You can also clone the complete git tree from: $ git clone git://github.com/sjp38/linux -b damon_reclaim/patches/v1 The web is also available: https://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git/tag/?h=damon_reclaim/patches/v1 Sequence Of Patches =================== The first patch makes DAMOS support the physical address space for the page out action. Following five patches (patches 2-6) implement the time/size quotas. Next four patches (patches 7-10) implement the memory regions prioritization within the limit. Then, three following patches (patches 11-13) implement the watermarks-based schemes activation. Finally, the last two patches (patches 14-15) implement and document the DAMON-based reclamation using the advanced DAMOS. [1] https://www.kernel.org/doc/html/v5.15-rc1/vm/damon/index.html [2] https://research.google/pubs/pub48551/ [3] https://lwn.net/Articles/787611/ [4] https://damonitor.github.io [5] https://damonitor.github.io/doc/html/latest/vm/damon/eval.html [6] https://lore.kernel.org/linux-mm/20211001125604.29660-1-sj@kernel.org/ [7] https://github.com/awslabs/damoos [8] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html [9] https://www.usenix.org/conference/fast-04/car-clock-adaptive-replacement This patch (of 15): This makes the DAMON primitives for physical address space support the pageout action for DAMON-based Operation Schemes. With this commit, hence, users can easily implement system-level data access-aware reclamations using DAMOS. 
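As a sketch of the resulting in-kernel usage, a data access-aware reclamation could be requested as below. The thresholds are illustrative only, and the damon_new_scheme()/damon_set_schemes() signatures are those in place before the quota patches later in this series; 'ctx' is assumed to be a context already configured with damon_pa_set_primitives().

/*
 * Sketch: ask DAMON to page out any region of at least one page that was
 * not accessed at all for 10 or more aggregation intervals.
 */
static int example_add_pageout_scheme(struct damon_ctx *ctx)
{
	struct damos *scheme;

	scheme = damon_new_scheme(PAGE_SIZE, ULONG_MAX,	/* size range */
			0, 0,				/* nr_accesses range */
			10, UINT_MAX,			/* age range */
			DAMOS_PAGEOUT);
	if (!scheme)
		return -ENOMEM;

	return damon_set_schemes(ctx, &scheme, 1);
}
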
[sj@kernel.org: fix missing-prototype build warning] Link: https://lkml.kernel.org/r/20211025064220.13904-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211019150731.16699-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211019150731.16699-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rientjes Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/paddr.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 715dadd21f7c..9a327bc787b5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx); void damon_pa_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); +int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index d7a2ecd09ed0..957ada55de77 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -11,7 +11,9 @@ #include #include #include +#include +#include "../internal.h" #include "prmtv-common.h" static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, @@ -211,6 +213,39 @@ bool damon_pa_target_valid(void *t) return true; } +int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + unsigned long addr; + LIST_HEAD(page_list); + + if (scheme->action != DAMOS_PAGEOUT) + return -EINVAL; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + if (PageUnevictable(page)) { + putback_lru_page(page); + } else { + list_add(&page->lru, &page_list); + put_page(page); + } + } + reclaim_pages(&page_list); + cond_resched(); + return 0; +} + void damon_pa_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = NULL; @@ -220,5 +255,5 @@ void damon_pa_set_primitives(struct damon_ctx *ctx) ctx->primitive.reset_aggregated = NULL; ctx->primitive.target_valid = damon_pa_target_valid; ctx->primitive.cleanup = NULL; - ctx->primitive.apply_scheme = NULL; + ctx->primitive.apply_scheme = damon_pa_apply_scheme; } From 2b8a248d5873343aa16f6c5ede30517693995f13 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:16 -0700 Subject: [PATCH 490/512] mm/damon/schemes: implement size quota for schemes application speed control There could be arbitrarily large memory regions fulfilling the target data access pattern of a DAMON-based operation scheme. In the case, applying the action of the scheme could incur too high overhead. To provide an intuitive way for avoiding it, this implements a feature called size quota. If the quota is set, DAMON tries to apply the action only up to the given amount of memory regions within a given time window. 
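For illustration, with this change a pageout scheme would pass a quota as the new last argument of damon_new_scheme(). A minimal sketch with example values (256 MiB per 1-second charge window) follows; the thresholds are illustrative only, and the quota fields are copied by damon_new_scheme(), so a stack-local struct is fine.

/* Sketch: a pageout scheme limited to 256 MiB per 1000 ms charge window. */
static struct damos *example_quota_scheme(void)
{
	struct damos_quota quota = {
		.sz = 256 * 1024 * 1024,	/* size quota in bytes */
		.reset_interval = 1000,		/* charge window in ms */
	};

	return damon_new_scheme(PAGE_SIZE, ULONG_MAX, 0, 0,
			10, UINT_MAX, DAMOS_PAGEOUT, &quota);
}
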
Link: https://lkml.kernel.org/r/20211019150731.16699-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 36 +++++++++++++++++++++++--- mm/damon/core.c | 60 +++++++++++++++++++++++++++++++++++++------ mm/damon/dbgfs.c | 4 ++- 3 files changed, 87 insertions(+), 13 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 9a327bc787b5..3a1ce9d9921c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -89,6 +89,26 @@ enum damos_action { DAMOS_STAT, /* Do nothing but only record the stat */ }; +/** + * struct damos_quota - Controls the aggressiveness of the given scheme. + * @sz: Maximum bytes of memory that the action can be applied. + * @reset_interval: Charge reset interval in milliseconds. + * + * To avoid consuming too much CPU time or IO resources for applying the + * &struct damos->action to large memory, DAMON allows users to set a size + * quota. The quota can be set by writing non-zero values to &sz. If the size + * quota is set, DAMON tries to apply the action only up to &sz bytes within + * &reset_interval. + */ +struct damos_quota { + unsigned long sz; + unsigned long reset_interval; + +/* private: For charging the quota */ + unsigned long charged_sz; + unsigned long charged_from; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -98,13 +118,20 @@ enum damos_action { * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. + * @quota: Control the aggressiveness of this scheme. * @stat_count: Total number of regions that this scheme is applied. * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. * - * For each aggregation interval, DAMON applies @action to monitoring target - * regions fit in the condition and updates the statistics. Note that both - * the minimums and the maximums are inclusive. + * For each aggregation interval, DAMON finds regions which fit in the + * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, + * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to + * those. To avoid consuming too much CPU time or IO resources for the + * &action, "a is used. + * + * After applying the &action to each region, &stat_count and &stat_sz is + * updated to reflect the number of regions and total size of regions that the + * &action is applied. 
*/ struct damos { unsigned long min_sz_region; @@ -114,6 +141,7 @@ struct damos { unsigned int min_age_region; unsigned int max_age_region; enum damos_action action; + struct damos_quota quota; unsigned long stat_count; unsigned long stat_sz; struct list_head list; @@ -310,7 +338,7 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action); + enum damos_action action, struct damos_quota *quota); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 2f6785737902..cce14a0d5c72 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -89,7 +89,7 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action) + enum damos_action action, struct damos_quota *quota) { struct damos *scheme; @@ -107,6 +107,11 @@ struct damos *damon_new_scheme( scheme->stat_sz = 0; INIT_LIST_HEAD(&scheme->list); + scheme->quota.sz = quota->sz; + scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.charged_sz = 0; + scheme->quota.charged_from = 0; + return scheme; } @@ -530,15 +535,25 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r); + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) { struct damos *s; - unsigned long sz; damon_for_each_scheme(s, c) { - sz = r->ar.end - r->ar.start; + struct damos_quota *quota = &s->quota; + unsigned long sz = r->ar.end - r->ar.start; + + /* Check the quota */ + if (quota->sz && quota->charged_sz >= quota->sz) + continue; + + /* Check the target regions condition */ if (sz < s->min_sz_region || s->max_sz_region < sz) continue; if (r->nr_accesses < s->min_nr_accesses || @@ -546,22 +561,51 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; if (r->age < s->min_age_region || s->max_age_region < r->age) continue; - s->stat_count++; - s->stat_sz += sz; - if (c->primitive.apply_scheme) + + /* Apply the scheme */ + if (c->primitive.apply_scheme) { + if (quota->sz && quota->charged_sz + sz > quota->sz) { + sz = ALIGN_DOWN(quota->sz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(c, t, r, sz); + } c->primitive.apply_scheme(c, t, r, s); + quota->charged_sz += sz; + } if (s->action != DAMOS_STAT) r->age = 0; + +update_stat: + s->stat_count++; + s->stat_sz += sz; } } static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; - struct damon_region *r; + struct damon_region *r, *next_r; + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + + if (!quota->sz) + continue; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies( + quota->reset_interval))) { + quota->charged_from = jiffies; + quota->charged_sz = 0; + } + } damon_for_each_target(t, c) { - damon_for_each_region(r, t) + damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index c90988a20fa4..a04bd50cc4c4 100644 --- 
a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -188,6 +188,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_quota quota = {}; + ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, &min_age, &max_age, &action, &parsed); @@ -200,7 +202,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action); + min_age, max_age, action, "a); if (!scheme) goto fail; From 50585192bc2ef9309d32dabdbb5e735679f4f128 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:20 -0700 Subject: [PATCH 491/512] mm/damon/schemes: skip already charged targets and regions If DAMOS has stopped applying action in the middle of a group of memory regions due to its size quota, it starts the work again from the beginning of the address space in the next charge window. If there is a huge memory region at the beginning of the address space and it fulfills the scheme's target data access pattern always, the action will applied to only the region. This mitigates the case by skipping memory regions that charged in current charge window at the beginning of next charge window. Link: https://lkml.kernel.org/r/20211019150731.16699-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3a1ce9d9921c..585d985768fd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -107,6 +107,8 @@ struct damos_quota { /* private: For charging the quota */ unsigned long charged_sz; unsigned long charged_from; + struct damon_target *charge_target_from; + unsigned long charge_addr_from; }; /** @@ -307,6 +309,9 @@ struct damon_ctx { #define damon_prev_region(r) \ (container_of(r->list.prev, struct damon_region, list)) +#define damon_last_region(t) \ + (list_last_entry(&t->regions_list, struct damon_region, list)) + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index cce14a0d5c72..693b75bc3450 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -111,6 +111,8 @@ struct damos *damon_new_scheme( scheme->quota.reset_interval = quota->reset_interval; scheme->quota.charged_sz = 0; scheme->quota.charged_from = 0; + scheme->quota.charge_target_from = NULL; + scheme->quota.charge_addr_from = 0; return scheme; } @@ -553,6 +555,37 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->sz && quota->charged_sz >= quota->sz) continue; + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + continue; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + continue; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + continue; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, 
DAMON_MIN_REGION); + if (!sz) { + if (r->ar.end - r->ar.start <= + DAMON_MIN_REGION) + continue; + sz = DAMON_MIN_REGION; + } + damon_split_region_at(c, t, r, sz); + r = damon_next_region(r); + sz = r->ar.end - r->ar.start; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + /* Check the target regions condition */ if (sz < s->min_sz_region || s->max_sz_region < sz) continue; @@ -573,6 +606,10 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } c->primitive.apply_scheme(c, t, r, s); quota->charged_sz += sz; + if (quota->sz && quota->charged_sz >= quota->sz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } } if (s->action != DAMOS_STAT) r->age = 0; From 1cd2430300594a230dba9178ac9e286d868d9da2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:23 -0700 Subject: [PATCH 492/512] mm/damon/schemes: implement time quota The size quota feature of DAMOS is useful for IO resource-critical systems, but not so intuitive for CPU time-critical systems. Systems using zram or zswap-like swap device would be examples. To provide another intuitive ways for such systems, this implements time-based quota for DAMON-based Operation Schemes. If the quota is set, DAMOS tries to use only up to the user-defined quota of CPU time within a given time window. Link: https://lkml.kernel.org/r/20211019150731.16699-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 25 +++++++++++++++++++----- mm/damon/core.c | 45 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 585d985768fd..1e7671bf3d23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,20 +91,35 @@ enum damos_action { /** * struct damos_quota - Controls the aggressiveness of the given scheme. + * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @reset_interval: Charge reset interval in milliseconds. * * To avoid consuming too much CPU time or IO resources for applying the - * &struct damos->action to large memory, DAMON allows users to set a size - * quota. The quota can be set by writing non-zero values to &sz. If the size - * quota is set, DAMON tries to apply the action only up to &sz bytes within - * &reset_interval. + * &struct damos->action to large memory, DAMON allows users to set time and/or + * size quotas. The quotas can be set by writing non-zero values to &ms and + * &sz, respectively. If the time quota is set, DAMON tries to use only up to + * &ms milliseconds within &reset_interval for applying the action. If the + * size quota is set, DAMON tries to apply the action only up to &sz bytes + * within &reset_interval. + * + * Internally, the time quota is transformed to a size quota using estimated + * throughput of the scheme's action. DAMON then compares it against &sz and + * uses smaller one as the effective quota. 
*/ struct damos_quota { + unsigned long ms; unsigned long sz; unsigned long reset_interval; -/* private: For charging the quota */ +/* private: */ + /* For throughput estimation */ + unsigned long total_charged_sz; + unsigned long total_charged_ns; + + unsigned long esz; /* Effective size quota in bytes */ + + /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; struct damon_target *charge_target_from; diff --git a/mm/damon/core.c b/mm/damon/core.c index 693b75bc3450..d1da4bef96ed 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -107,8 +107,12 @@ struct damos *damon_new_scheme( scheme->stat_sz = 0; INIT_LIST_HEAD(&scheme->list); + scheme->quota.ms = quota->ms; scheme->quota.sz = quota->sz; scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.total_charged_sz = 0; + scheme->quota.total_charged_ns = 0; + scheme->quota.esz = 0; scheme->quota.charged_sz = 0; scheme->quota.charged_from = 0; scheme->quota.charge_target_from = NULL; @@ -550,9 +554,10 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; + struct timespec64 begin, end; /* Check the quota */ - if (quota->sz && quota->charged_sz >= quota->sz) + if (quota->esz && quota->charged_sz >= quota->esz) continue; /* Skip previously charged regions */ @@ -597,16 +602,21 @@ static void damon_do_apply_schemes(struct damon_ctx *c, /* Apply the scheme */ if (c->primitive.apply_scheme) { - if (quota->sz && quota->charged_sz + sz > quota->sz) { - sz = ALIGN_DOWN(quota->sz - quota->charged_sz, + if (quota->esz && + quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, DAMON_MIN_REGION); if (!sz) goto update_stat; damon_split_region_at(c, t, r, sz); } + ktime_get_coarse_ts64(&begin); c->primitive.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); quota->charged_sz += sz; - if (quota->sz && quota->charged_sz >= quota->sz) { + if (quota->esz && quota->charged_sz >= quota->esz) { quota->charge_target_from = t; quota->charge_addr_from = r->ar.end + 1; } @@ -620,6 +630,29 @@ update_stat: } } +/* Shouldn't be called if quota->ms and quota->sz are zero */ +static void damos_set_effective_quota(struct damos_quota *quota) +{ + unsigned long throughput; + unsigned long esz; + + if (!quota->ms) { + quota->esz = quota->sz; + return; + } + + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + esz = throughput * quota->ms; + + if (quota->sz && quota->sz < esz) + esz = quota->sz; + quota->esz = esz; +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -629,15 +662,17 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - if (!quota->sz) + if (!quota->ms && !quota->sz) continue; /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; + damos_set_effective_quota(quota); } } From d7d0ec85e983945079364db3c3d2d80cc795a48c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:27 -0700 Subject: [PATCH 493/512] mm/damon/dbgfs: support quotas of schemes This makes the debugfs interface of DAMON 
support the scheme quotas by chaning the format of the input for the schemes file. Link: https://lkml.kernel.org/r/20211019150731.16699-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index a04bd50cc4c4..097e6745ba75 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,11 +105,14 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action, s->stat_count, s->stat_sz); + s->action, + s->quota.ms, s->quota.sz, + s->quota.reset_interval, + s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -190,10 +193,11 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; - ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u%n", + ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action, &parsed); - if (ret != 7) + &min_age, &max_age, &action, "a.ms, + "a.sz, "a.reset_interval, &parsed); + if (ret != 10) break; if (!damos_action_valid(action)) { pr_err("wrong action %d\n", action); From a2cb4dd0d40d3dcb7288a963d0f66271934417b6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:30 -0700 Subject: [PATCH 494/512] mm/damon/selftests: support schemes quotas This updates DAMON selftests to support updated schemes debugfs file format for the quotas. 
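 As an illustration only (not part of the patch): assuming DAMON's debugfs files sit at the default /sys/kernel/debug/damon and that DAMOS_PAGEOUT corresponds to action value 2, a scheme that pages out regions of 4 KiB or larger which stayed completely unaccessed for 10 to 999 aggregation intervals, limited to 10 ms of work and 128 MiB of memory per 1 s window, could be written in the new ten-field format as below.

    # fields: min_sz max_sz min_acc max_acc min_age max_age action quota_ms quota_sz reset_ms
    echo "4096 18446744073709551615 0 0 10 999 2 10 134217728 1000" > /sys/kernel/debug/damon/schemes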
Link: https://lkml.kernel.org/r/20211019150731.16699-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 639cfb6a1f65..8e33a7b584e7 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -63,10 +63,10 @@ echo "$orig_content" > "$file" file="$DBGFS/schemes" orig_content=$(cat "$file") -test_write_succ "$file" "1 2 3 4 5 6 4" \ +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0" \ "$orig_content" "valid input" test_write_fail "$file" "1 2 -3 4 5 6 3" "$orig_content" "multi lines" +3 4 5 6 3 0 0 0" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" echo "$orig_content" > "$file" From 38683e003153f7abfa612d7b7fe147efa4624af2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:33 -0700 Subject: [PATCH 495/512] mm/damon/schemes: prioritize regions within the quotas This makes DAMON apply schemes to regions having higher priority first, if it cannot apply schemes to all regions due to the quotas. The prioritization function should be implemented in the monitoring primitives. Those would commonly calculate the priority of the region using attributes of regions, namely 'size', 'nr_accesses', and 'age'. For example, some primitive would calculate the priority of each region using a weighted sum of 'nr_accesses' and 'age' of the region. The optimal weights would depend on give environments, so this makes those customizable. Nevertheless, the score calculation functions are only encouraged to respect the weights, not mandated. Link: https://lkml.kernel.org/r/20211019150731.16699-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 26 ++++++++++++++++++ mm/damon/core.c | 62 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1e7671bf3d23..5d47ad9e3911 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -14,6 +14,8 @@ /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE +/* Max priority score for DAMON-based operation schemes */ +#define DAMOS_MAX_SCORE (99) /** * struct damon_addr_range - Represents an address region of [@start, @end). @@ -95,6 +97,10 @@ enum damos_action { * @sz: Maximum bytes of memory that the action can be applied. * @reset_interval: Charge reset interval in milliseconds. * + * @weight_sz: Weight of the region's size for prioritization. + * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. + * @weight_age: Weight of the region's age for prioritization. 
+ * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -106,12 +112,22 @@ enum damos_action { * Internally, the time quota is transformed to a size quota using estimated * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_primitive->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring primitives are + * encouraged to respect those. */ struct damos_quota { unsigned long ms; unsigned long sz; unsigned long reset_interval; + unsigned int weight_sz; + unsigned int weight_nr_accesses; + unsigned int weight_age; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -124,6 +140,10 @@ struct damos_quota { unsigned long charged_from; struct damon_target *charge_target_from; unsigned long charge_addr_from; + + /* For prioritization */ + unsigned long histogram[DAMOS_MAX_SCORE + 1]; + unsigned int min_score; }; /** @@ -174,6 +194,7 @@ struct damon_ctx; * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. @@ -200,6 +221,8 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @get_scheme_score should return the priority score of a region for a scheme + * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. 
@@ -213,6 +236,9 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*get_scheme_score)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); diff --git a/mm/damon/core.c b/mm/damon/core.c index d1da4bef96ed..fad25778e2ec 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -12,6 +12,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -110,6 +111,9 @@ struct damos *damon_new_scheme( scheme->quota.ms = quota->ms; scheme->quota.sz = quota->sz; scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.weight_sz = quota->weight_sz; + scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; + scheme->quota.weight_age = quota->weight_age; scheme->quota.total_charged_sz = 0; scheme->quota.total_charged_ns = 0; scheme->quota.esz = 0; @@ -545,6 +549,28 @@ static void damon_split_region_at(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, unsigned long sz_r); +static bool __damos_valid_target(struct damon_region *r, struct damos *s) +{ + unsigned long sz; + + sz = r->ar.end - r->ar.start; + return s->min_sz_region <= sz && sz <= s->max_sz_region && + s->min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->max_nr_accesses && + s->min_age_region <= r->age && r->age <= s->max_age_region; +} + +static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + bool ret = __damos_valid_target(r, s); + + if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + return ret; + + return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -591,13 +617,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, quota->charge_addr_from = 0; } - /* Check the target regions condition */ - if (sz < s->min_sz_region || s->max_sz_region < sz) - continue; - if (r->nr_accesses < s->min_nr_accesses || - s->max_nr_accesses < r->nr_accesses) - continue; - if (r->age < s->min_age_region || s->max_age_region < r->age) + if (!damos_valid_target(c, t, r, s)) continue; /* Apply the scheme */ @@ -661,6 +681,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; if (!quota->ms && !quota->sz) continue; @@ -674,6 +696,32 @@ static void kdamond_apply_schemes(struct damon_ctx *c) quota->charged_sz = 0; damos_set_effective_quota(quota); } + + if (!c->primitive.get_scheme_score) + continue; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->primitive.get_scheme_score( + c, t, r, s); + quota->histogram[score] += + r->ar.end - r->ar.start; + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; } 
damon_for_each_target(t, c) { From 198f0f4c58b9f481e4e51c8c70a6ab9852bbab7f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:37 -0700 Subject: [PATCH 496/512] mm/damon/vaddr,paddr: support pageout prioritization This makes the default monitoring primitives for virtual address spaces and the physical address sapce to support memory regions prioritization for 'PAGEOUT' DAMOS action. It calculates hotness of each region as weighted sum of 'nr_accesses' and 'age' of the region and get the priority score as reverse of the hotness, so that cold regions can be paged out first. Link: https://lkml.kernel.org/r/20211019150731.16699-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ mm/damon/paddr.c | 14 +++++++++++++ mm/damon/prmtv-common.c | 46 +++++++++++++++++++++++++++++++++++++++++ mm/damon/prmtv-common.h | 3 +++ mm/damon/vaddr.c | 15 ++++++++++++++ 5 files changed, 82 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5d47ad9e3911..1217566a0ebc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -421,6 +421,8 @@ bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ @@ -433,6 +435,8 @@ unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); +int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 957ada55de77..a496d6f203d6 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -246,6 +246,19 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return 0; } +int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + void damon_pa_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = NULL; @@ -256,4 +269,5 @@ void damon_pa_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_pa_target_valid; ctx->primitive.cleanup = NULL; ctx->primitive.apply_scheme = damon_pa_apply_scheme; + ctx->primitive.get_scheme_score = damon_pa_scheme_score; } diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c index 7e62ee54fb54..92a04f5831d6 100644 --- a/mm/damon/prmtv-common.c +++ b/mm/damon/prmtv-common.c @@ -85,3 +85,49 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) put_page(page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } + +#define DAMON_MAX_SUBSCORE (100) +#define 
DAMON_MAX_AGE_IN_LOG (32) + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +} diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 7093d19e5d42..61f27037603e 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -15,3 +15,6 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 758501b8d97d..675cd8c7df9b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -633,6 +633,20 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return damos_madvise(t, r, madv_action); } +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + void damon_va_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = damon_va_init; @@ -643,6 +657,7 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; ctx->primitive.apply_scheme = damon_va_apply_scheme; + ctx->primitive.get_scheme_score = damon_va_scheme_score; } #include "vaddr-test.h" From f4a68b4a04e6db9397f7776c51d0f9715bd1a60e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:40 -0700 Subject: [PATCH 497/512] mm/damon/dbgfs: support prioritization weights This allows DAMON debugfs interface users set the prioritization weights by putting three more numbers to the 'schemes' file. 
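 For example (an illustrative sketch under the same assumptions as the earlier quota example), the three weights are appended after the quota fields in the order size, nr_accesses, and age, so a scheme that lets region age dominate the prioritization under its quota could be entered as:

    # fields: ...quota fields... weight_sz weight_nr_accesses weight_age
    echo "4096 18446744073709551615 0 0 10 999 2 10 134217728 1000 0 3 7" > /sys/kernel/debug/damon/schemes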
Link: https://lkml.kernel.org/r/20211019150731.16699-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 097e6745ba75..20c4feb8b918 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,13 +105,16 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, s->action, s->quota.ms, s->quota.sz, s->quota.reset_interval, + s->quota.weight_sz, + s->quota.weight_nr_accesses, + s->quota.weight_age, s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -193,11 +196,14 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; - ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu%n", + ret = sscanf(&str[pos], + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, &min_age, &max_age, &action, "a.ms, - "a.sz, "a.reset_interval, &parsed); - if (ret != 10) + "a.sz, "a.reset_interval, + "a.weight_sz, "a.weight_nr_accesses, + "a.weight_age, &parsed); + if (ret != 13) break; if (!damos_action_valid(action)) { pr_err("wrong action %d\n", action); From 5a0d6a08b81162fbe1e207f02571ace6d888f8b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:44 -0700 Subject: [PATCH 498/512] tools/selftests/damon: update for regions prioritization of schemes This updates the DAMON selftests for 'schemes' debugfs file, as the file format is updated. 
Link: https://lkml.kernel.org/r/20211019150731.16699-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 8e33a7b584e7..466dbeb37e31 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -63,10 +63,10 @@ echo "$orig_content" > "$file" file="$DBGFS/schemes" orig_content=$(cat "$file") -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0" \ +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3" \ "$orig_content" "valid input" test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0" "$orig_content" "multi lines" +3 4 5 6 3 0 0 0 1 2 3" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" echo "$orig_content" > "$file" From ee801b7dd7822a82fd7663048ad649545fac6df3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:47 -0700 Subject: [PATCH 499/512] mm/damon/schemes: activate schemes based on a watermarks mechanism DAMON-based operation schemes need to be manually turned on and off. In some use cases, however, the condition for turning a scheme on and off would depend on the system's situation. For example, schemes for proactive pages reclamation would need to be turned on when some memory pressure is detected, and turned off when the system has enough free memory. For easier control of schemes activation based on the system situation, this introduces a watermarks-based mechanism. The client can describe the watermark metric (e.g., amount of free memory in the system), watermark check interval, and three watermarks, namely high, mid, and low. If the scheme is deactivated, it only gets the metric and compare that to the three watermarks for every check interval. If the metric is higher than the high watermark, the scheme is deactivated. If the metric is between the mid watermark and the low watermark, the scheme is activated. If the metric is lower than the low watermark, the scheme is deactivated again. This is to allow users fall back to traditional page-granularity mechanisms. Link: https://lkml.kernel.org/r/20211019150731.16699-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 52 ++++++++++++++++++++++- mm/damon/core.c | 97 ++++++++++++++++++++++++++++++++++++++++++- mm/damon/dbgfs.c | 5 ++- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1217566a0ebc..c93325efddd7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -146,6 +146,45 @@ struct damos_quota { unsigned int min_score; }; +/** + * enum damos_wmark_metric - Represents the watermark metric. + * + * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. 
+ * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + */ +enum damos_wmark_metric { + DAMOS_WMARK_NONE, + DAMOS_WMARK_FREE_MEM_RATE, +}; + +/** + * struct damos_watermarks - Controls when a given scheme should be activated. + * @metric: Metric for the watermarks. + * @interval: Watermarks check time interval in microseconds. + * @high: High watermark. + * @mid: Middle watermark. + * @low: Low watermark. + * + * If &metric is &DAMOS_WMARK_NONE, the scheme is always active. Being active + * means DAMON does monitoring and applying the action of the scheme to + * appropriate memory regions. Else, DAMON checks &metric of the system for at + * least every &interval microseconds and works as below. + * + * If &metric is higher than &high, the scheme is inactivated. If &metric is + * between &mid and &low, the scheme is activated. If &metric is lower than + * &low, the scheme is inactivated. + */ +struct damos_watermarks { + enum damos_wmark_metric metric; + unsigned long interval; + unsigned long high; + unsigned long mid; + unsigned long low; + +/* private: */ + bool activated; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -156,6 +195,7 @@ struct damos_quota { * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. + * @wmarks: Watermarks for automated (in)activation of this scheme. * @stat_count: Total number of regions that this scheme is applied. * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. @@ -166,6 +206,14 @@ struct damos_quota { * those. To avoid consuming too much CPU time or IO resources for the * &action, "a is used. * + * To do the work only when needed, schemes can be activated for specific + * system situations using &wmarks. If all schemes that registered to the + * monitoring context are inactive, DAMON stops monitoring either, and just + * repeatedly checks the watermarks. + * + * If all schemes that registered to a &struct damon_ctx are inactive, DAMON + * stops monitoring and just repeatedly checks the watermarks. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. 
@@ -179,6 +227,7 @@ struct damos { unsigned int max_age_region; enum damos_action action; struct damos_quota quota; + struct damos_watermarks wmarks; unsigned long stat_count; unsigned long stat_sz; struct list_head list; @@ -384,7 +433,8 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota); + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index fad25778e2ec..6993c60ae31c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,8 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota) + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; @@ -122,6 +124,13 @@ struct damos *damon_new_scheme( scheme->quota.charge_target_from = NULL; scheme->quota.charge_addr_from = 0; + scheme->wmarks.metric = wmarks->metric; + scheme->wmarks.interval = wmarks->interval; + scheme->wmarks.high = wmarks->high; + scheme->wmarks.mid = wmarks->mid; + scheme->wmarks.low = wmarks->low; + scheme->wmarks.activated = true; + return scheme; } @@ -582,6 +591,9 @@ static void damon_do_apply_schemes(struct damon_ctx *c, unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + if (!s->wmarks.activated) + continue; + /* Check the quota */ if (quota->esz && quota->charged_sz >= quota->esz) continue; @@ -684,6 +696,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c) unsigned long cumulated_sz; unsigned int score, max_score = 0; + if (!s->wmarks.activated) + continue; + if (!quota->ms && !quota->sz) continue; @@ -924,6 +939,83 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } +static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) +{ + struct sysinfo i; + + switch (metric) { + case DAMOS_WMARK_FREE_MEM_RATE: + si_meminfo(&i); + return i.freeram * 1000 / i.totalram; + default: + break; + } + return -EINVAL; +} + +/* + * Returns zero if the scheme is active. Else, returns time to wait for next + * watermark check in micro-seconds. + */ +static unsigned long damos_wmark_wait_us(struct damos *scheme) +{ + unsigned long metric; + + if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + return 0; + + metric = damos_wmark_metric_value(scheme->wmarks.metric); + /* higher than high watermark or lower than low watermark */ + if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { + if (scheme->wmarks.activated) + pr_debug("inactivate a scheme (%d) for %s wmark\n", + scheme->action, + metric > scheme->wmarks.high ? 
+ "high" : "low"); + scheme->wmarks.activated = false; + return scheme->wmarks.interval; + } + + /* inactive and higher than middle watermark */ + if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) && + !scheme->wmarks.activated) + return scheme->wmarks.interval; + + if (!scheme->wmarks.activated) + pr_debug("activate a scheme (%d)\n", scheme->action); + scheme->wmarks.activated = true; + return 0; +} + +static void kdamond_usleep(unsigned long usecs) +{ + if (usecs > 100 * 1000) + schedule_timeout_interruptible(usecs_to_jiffies(usecs)); + else + usleep_range(usecs, usecs + 1); +} + +/* Returns negative error code if it's not activated but should return */ +static int kdamond_wait_activation(struct damon_ctx *ctx) +{ + struct damos *s; + unsigned long wait_time; + unsigned long min_wait_time = 0; + + while (!kdamond_need_stop(ctx)) { + damon_for_each_scheme(s, ctx) { + wait_time = damos_wmark_wait_us(s); + if (!min_wait_time || wait_time < min_wait_time) + min_wait_time = wait_time; + } + if (!min_wait_time) + return 0; + + kdamond_usleep(min_wait_time); + } + return -EBUSY; +} + static void set_kdamond_stop(struct damon_ctx *ctx) { mutex_lock(&ctx->kdamond_lock); @@ -952,6 +1044,9 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + continue; + if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); if (ctx->callback.after_sampling && diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 20c4feb8b918..9f13060d1058 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -195,6 +195,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_NONE, + }; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u%n", @@ -212,7 +215,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, "a); + min_age, max_age, action, "a, &wmarks); if (!scheme) goto fail; From ae666a6dddfd119da55cc1bad54f7cbd8b2ef54c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:50 -0700 Subject: [PATCH 500/512] mm/damon/dbgfs: support watermarks This updates DAMON debugfs interface to support the watermarks based schemes activation. For this, now 'schemes' file receives five more values. 
Link: https://lkml.kernel.org/r/20211019150731.16699-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 9f13060d1058..6828e463348b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -115,6 +115,8 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_sz, s->quota.weight_nr_accesses, s->quota.weight_age, + s->wmarks.metric, s->wmarks.interval, + s->wmarks.high, s->wmarks.mid, s->wmarks.low, s->stat_count, s->stat_sz); if (!rc) return -ENOMEM; @@ -195,18 +197,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { struct damos_quota quota = {}; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_NONE, - }; + struct damos_watermarks wmarks; ret = sscanf(&str[pos], - "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u%n", + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, &min_age, &max_age, &action, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, - "a.weight_age, &parsed); - if (ret != 13) + "a.weight_age, &wmarks.metric, + &wmarks.interval, &wmarks.high, &wmarks.mid, + &wmarks.low, &parsed); + if (ret != 18) break; if (!damos_action_valid(action)) { pr_err("wrong action %d\n", action); From 1dc90ccd15c55cc3edec508466db9248a841acad Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:54 -0700 Subject: [PATCH 501/512] selftests/damon: support watermarks This updates DAMON selftests for 'schemes' debugfs file to reflect the changes in the format. 
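 A quick sanity check (again only a sketch, assuming the default debugfs mount point) is to read the schemes file back and count the fields; after this change each scheme line carries the 18 input values plus the two statistics counters, 20 fields in total:

    awk '{ print NF }' /sys/kernel/debug/damon/schemes    # expect 20 per scheme line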
Link: https://lkml.kernel.org/r/20211019150731.16699-14-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/damon/debugfs_attrs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index 466dbeb37e31..196b6640bf37 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -63,10 +63,10 @@ echo "$orig_content" > "$file" file="$DBGFS/schemes" orig_content=$(cat "$file") -test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3" \ +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ "$orig_content" "valid input" test_write_fail "$file" "1 2 -3 4 5 6 3 0 0 0 1 2 3" "$orig_content" "multi lines" +3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" test_write_succ "$file" "" "$orig_content" "disabling" echo "$orig_content" > "$file" From 43b0536cb4710e7bb591edfda7e68a1c327a3409 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:57 -0700 Subject: [PATCH 502/512] mm/damon: introduce DAMON-based Reclamation (DAMON_RECLAIM) This implements a new kernel subsystem that finds cold memory regions using DAMON and reclaims those immediately. It is intended to be used as proactive lightweigh reclamation logic for light memory pressure. For heavy memory pressure, it could be inactivated and fall back to the traditional page-scanning based reclamation. It's implemented on top of DAMON framework to use the DAMON-based Operation Schemes (DAMOS) feature. It utilizes all the DAMOS features including speed limit, prioritization, and watermarks. It could be enabled and tuned in boot time via the kernel boot parameter, and in run time via its module parameters ('/sys/module/damon_reclaim/parameters/') interface. [yangyingliang@huawei.com: fix error return code in damon_reclaim_turn()] Link: https://lkml.kernel.org/r/20211025124500.2758060-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20211019150731.16699-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Yang Yingliang Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/Kconfig | 12 ++ mm/damon/Makefile | 1 + mm/damon/reclaim.c | 356 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 369 insertions(+) create mode 100644 mm/damon/reclaim.c diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index ca33b289ebbe..5bcf05851ad0 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -73,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST If unsure, say N. +config DAMON_RECLAIM + bool "Build DAMON-based reclaim (DAMON_RECLAIM)" + depends on DAMON_PADDR + help + This builds the DAMON-based reclamation subsystem. It finds pages + that not accessed for a long time (cold) using DAMON and reclaim + those. 
+ + This is suggested to be used as a proactive and lightweight + reclamation under light memory pressure, while the traditional page + scanning-based reclamation is used for heavy pressure. + endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 8d9b0df79702..f7d5ac377a2b 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_DAMON) := core.o obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o +obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c new file mode 100644 index 000000000000..dc1485044eaf --- /dev/null +++ b/mm/damon/reclaim.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based page reclamation + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-reclaim: " fmt + +#include +#include +#include +#include +#include + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_reclaim." + +/* + * Enable or disable DAMON_RECLAIM. + * + * You can enable DAMON_RCLAIM by setting the value of this parameter as ``Y``. + * Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could + * do no real monitoring and reclamation due to the watermarks-based activation + * condition. Refer to below descriptions for the watermarks parameter for + * this. + */ +static bool enabled __read_mostly; +module_param(enabled, bool, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_RECLAIM + * identifies the region as cold, and reclaims. 120 seconds by default. + */ +static unsigned long min_age __read_mostly = 120000000; +module_param(min_age, ulong, 0600); + +/* + * Limit of time for trying the reclamation in milliseconds. + * + * DAMON_RECLAIM tries to use only up to this time within a time window + * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be + * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, + * the limit is disabled. + * + * 10 ms by default. + */ +static unsigned long quota_ms __read_mostly = 10; +module_param(quota_ms, ulong, 0600); + +/* + * Limit of size of memory for the reclamation in bytes. + * + * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a + * time window (quota_reset_interval_ms) and makes no more than this limit is + * tried. This can be used for limiting consumption of CPU and IO. If this + * value is zero, the limit is disabled. + * + * 128 MiB by default. + */ +static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; +module_param(quota_sz, ulong, 0600); + +/* + * The time/size quota charge reset interval in milliseconds. + * + * The charge reset interval for the quota of time (quota_ms) and size + * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than + * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms + * milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is + * enabled but inactive due to its watermarks rule. 5 seconds by default. 
+ */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically + * checks the watermarks. 500 (50%) by default. + */ +static unsigned long wmarks_high __read_mostly = 500; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring + * and the reclaiming. 400 (40%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 400; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks + * the watermarks. In the case, the system falls back to the LRU-based page + * granularity reclamation logic. 200 (20%) by default. + */ +static unsigned long wmarks_low __read_mostly = 200; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the cold memory monitoring. Please refer + * to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. + * + * The aggregation interval of DAMON for the cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. + * + * The minimal number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set lower-bound of the monitoring quality. + * But, setting this too high could result in increased monitoring overhead. + * Please refer to the DAMON documentation for more detail. 10 by default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set upper-bound of the monitoring overhead. + * However, setting this too low could result in bad monitoring quality. + * Please refer to the DAMON documentation for more detail. 1000 by default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_RECLAIM will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_RECLAIM will do work + * against. By default, biggest System RAM is used as the region. 
+ */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +struct damon_reclaim_ram_walk_arg { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_reclaim_ram_walk_arg *a = arg; + + if (a->end - a->start < res->end - res->start) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool get_monitoring_region(unsigned long *start, unsigned long *end) +{ + struct damon_reclaim_ram_walk_arg arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +static struct damos *damon_reclaim_new_scheme(void) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try reclamation for more than quota_ms milliseconds + * or quota_sz bytes within quota_reset_interval_ms. + */ + .ms = quota_ms, + .sz = quota_sz, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and not accessed at all */ + 0, 0, + /* for min_age or more micro-seconds, and */ + min_age / aggr_interval, UINT_MAX, + /* page out those, as soon as found */ + DAMOS_PAGEOUT, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. 
*/ + &wmarks); + + return scheme; +} + +static int damon_reclaim_turn(bool on) +{ + struct damon_region *region; + struct damos *scheme; + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + /* DAMON will free this on its own when finish monitoring */ + region = damon_new_region(monitor_region_start, monitor_region_end); + if (!region) + return -ENOMEM; + damon_add_region(region, target); + + /* Will be freed by 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) { + err = -ENOMEM; + goto free_region_out; + } + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + goto free_scheme_out; + + err = damon_start(&ctx, 1); + if (!err) { + kdamond_pid = ctx->kdamond->pid; + return 0; + } + +free_scheme_out: + damon_destroy_scheme(scheme); +free_region_out: + damon_destroy_region(region, target); + return err; +} + +#define ENABLE_CHECK_INTERVAL_MS 1000 +static struct delayed_work damon_reclaim_timer; +static void damon_reclaim_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_reclaim_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } + + schedule_delayed_work(&damon_reclaim_timer, + msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); +} +static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); + +static int __init damon_reclaim_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + damon_pa_set_primitives(ctx); + + /* 4242 means nothing but fun */ + target = damon_new_target(4242); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_reclaim_timer, 0); + return 0; +} + +module_init(damon_reclaim_init); From bec976b691437d056a92964cb7af07ee1a54221a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:01 -0700 Subject: [PATCH 503/512] Documentation/admin-guide/mm/damon: add a document for DAMON_RECLAIM This adds an admin-guide document for DAMON-based Reclamation. Link: https://lkml.kernel.org/r/20211019150731.16699-16-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/index.rst | 1 + .../admin-guide/mm/damon/reclaim.rst | 235 ++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/reclaim.rst diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 8c5dde3a5754..61aff88347f3 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -13,3 +13,4 @@ optimize those. 
    start
    usage
+   reclaim
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
new file mode 100644
index 000000000000..fb9def3a7355
--- /dev/null
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -0,0 +1,235 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+DAMON-based Reclamation
+=======================
+
+DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module that is
+intended to be used for proactive and lightweight reclamation under light
+memory pressure.  It doesn't aim to replace the LRU-list based
+page-granularity reclamation, but to be selectively used for different levels
+of memory pressure and requirements.
+
+Where Is Proactive Reclamation Required?
+========================================
+
+On general memory over-committed systems, proactively reclaiming cold pages
+helps save memory and reduce latency spikes incurred by the direct reclaim of
+the process or the CPU consumption of kswapd, while incurring only minimal
+performance degradation [1]_ [2]_.
+
+Free Pages Reporting [3]_ based memory over-commit virtualization systems are
+a good example of the cases.  In such systems, the guest VMs report their
+free memory to the host, and the host reallocates the reported memory to
+other guests.  As a result, the memory of the systems is fully utilized.
+However, the guests might not be so memory-frugal, mainly because some kernel
+subsystems and user-space applications are designed to use as much memory as
+available.  Then, guests could report only a small amount of memory as free
+to the host, resulting in a drop of the systems' memory utilization.  Running
+the proactive reclamation in guests could mitigate this problem.
+
+How It Works
+============
+
+DAMON_RECLAIM finds memory regions that have not been accessed for a specific
+time duration and pages them out.  To avoid consuming too much CPU for the
+paging out operation, a speed limit can be configured.  Under the speed
+limit, it pages out the memory regions that have not been accessed for a
+longer time first.  System administrators can also configure under what
+situation this scheme should automatically be activated and deactivated with
+three memory pressure watermarks.
+
+Interface: Module Parameters
+============================
+
+To use this feature, you should first ensure your system is running on a
+kernel that is built with ``CONFIG_DAMON_RECLAIM=y``.
+
+To let sysadmins enable or disable it and tune it for the given system,
+DAMON_RECLAIM utilizes module parameters.  That is, you can put
+``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or
+write proper values to ``/sys/module/damon_reclaim/parameters/<parameter>``
+files.
+
+Note that the parameter values except ``enabled`` are applied only when
+DAMON_RECLAIM starts.  Therefore, if you want to apply new parameter values
+at runtime while DAMON_RECLAIM is already enabled, you should disable and
+re-enable it via the ``enabled`` parameter file.  The new values should be
+written to the proper parameter files before the re-enablement.
+
+Below are the descriptions of each parameter.
+
+enabled
+-------
+
+Enable or disable DAMON_RECLAIM.
+
+You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``.
+Setting it as ``N`` disables DAMON_RECLAIM.  Note that DAMON_RECLAIM could do
+no real monitoring and reclamation due to the watermarks-based activation
+condition.  Refer to the descriptions of the watermark parameters below for
+this.
+
+min_age
+-------
+
+Time threshold for cold memory regions identification, in microseconds.
+
+If a memory region is not accessed for this or a longer time, DAMON_RECLAIM
+identifies the region as cold, and reclaims it.
+
+120 seconds by default.
+
+quota_ms
+--------
+
+Limit of time for the reclamation in milliseconds.
+
+DAMON_RECLAIM tries to use only up to this time within a time window
+(quota_reset_interval_ms) for trying reclamation of cold pages.  This can be
+used for limiting CPU consumption of DAMON_RECLAIM.  If the value is zero,
+the limit is disabled.
+
+10 ms by default.
+
+quota_sz
+--------
+
+Limit of size of memory for the reclamation in bytes.
+
+DAMON_RECLAIM charges the amount of memory which it tried to reclaim within a
+time window (quota_reset_interval_ms) and makes sure that no more than this
+limit is tried.  This can be used for limiting consumption of CPU and IO.  If
+this value is zero, the limit is disabled.
+
+128 MiB by default.
+
+quota_reset_interval_ms
+-----------------------
+
+The time/size quota charge reset interval in milliseconds.
+
+The charge reset interval for the quota of time (quota_ms) and size
+(quota_sz).  That is, DAMON_RECLAIM does not try reclamation for more than
+quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
+milliseconds.
+
+1 second by default.
+
+wmarks_interval
+---------------
+
+Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
+enabled but inactive due to its watermarks rule.
+
+wmarks_high
+-----------
+
+Free memory rate (per thousand) for the high watermark.
+
+If the free memory of the system in bytes per thousand bytes is higher than
+this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+checks the watermarks.
+
+wmarks_mid
+----------
+
+Free memory rate (per thousand) for the middle watermark.
+
+If the free memory of the system in bytes per thousand bytes is between this
+and the low watermark, DAMON_RECLAIM becomes active, so it starts the
+monitoring and the reclaiming.
+
+wmarks_low
+----------
+
+Free memory rate (per thousand) for the low watermark.
+
+If the free memory of the system in bytes per thousand bytes is lower than
+this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+checks the watermarks.  In that case, the system falls back to the LRU-list
+based page-granularity reclamation logic.
+
+sample_interval
+---------------
+
+Sampling interval for the monitoring in microseconds.
+
+The sampling interval of DAMON for the cold memory monitoring.  Please refer
+to the DAMON documentation (:doc:`usage`) for more detail.
+
+aggr_interval
+-------------
+
+Aggregation interval for the monitoring in microseconds.
+
+The aggregation interval of DAMON for the cold memory monitoring.  Please
+refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+min_nr_regions
+--------------
+
+Minimum number of monitoring regions.
+
+The minimal number of monitoring regions of DAMON for the cold memory
+monitoring.  This can be used to set a lower bound on the monitoring quality.
+But setting this too high could result in increased monitoring overhead.
+Please refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+max_nr_regions
+--------------
+
+Maximum number of monitoring regions.
+
+The maximum number of monitoring regions of DAMON for the cold memory
+monitoring.  This can be used to set an upper bound on the monitoring
+overhead.  However, setting this too low could result in bad monitoring
+quality.  Please refer to the DAMON documentation (:doc:`usage`) for more
+detail.
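+
+As a rough, illustrative calculation (assuming the default ``min_age`` of 120
+seconds and, as an example value only, an ``aggr_interval`` of 100 ms), a
+region has to stay completely unaccessed for ``min_age / aggr_interval``
+consecutive aggregation intervals before the scheme treats it as cold::
+
+    min_age         120,000,000 us (120 s, default)
+    aggr_interval       100,000 us (100 ms, assumed)
+    cold after      120,000,000 / 100,000 = 1,200 aggregation intervals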
+
+monitor_region_start
+--------------------
+
+Start of the target memory region in physical address.
+
+The start physical address of the memory region that DAMON_RECLAIM will do
+its work against.  That is, DAMON_RECLAIM will find cold memory regions in
+this region and reclaim them.  By default, the biggest System RAM region is
+used.
+
+monitor_region_end
+------------------
+
+End of the target memory region in physical address.
+
+The end physical address of the memory region that DAMON_RECLAIM will do its
+work against.  That is, DAMON_RECLAIM will find cold memory regions in this
+region and reclaim them.  By default, the biggest System RAM region is used.
+
+kdamond_pid
+-----------
+
+PID of the DAMON thread.
+
+If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+Else, -1.
+
+Example
+=======
+
+The runtime example commands below make DAMON_RECLAIM find memory regions
+that have not been accessed for 30 seconds or more and page them out.  The
+reclamation is limited to at most 1 GiB per second to avoid DAMON_RECLAIM
+consuming too much CPU time for the paging out operation.  The commands also
+ask DAMON_RECLAIM to do nothing if the system's free memory rate is higher
+than 50%, but to start the real work if it becomes lower than 40%.  If
+DAMON_RECLAIM doesn't make progress and therefore the free memory rate
+becomes lower than 20%, the commands ask DAMON_RECLAIM to do nothing again,
+so that the system can fall back to the LRU-list based page-granularity
+reclamation. ::
+
+    # cd /sys/module/damon_reclaim/parameters
+    # echo 30000000 > min_age
+    # echo $((1 * 1024 * 1024 * 1024)) > quota_sz
+    # echo 1000 > quota_reset_interval_ms
+    # echo 500 > wmarks_high
+    # echo 400 > wmarks_mid
+    # echo 200 > wmarks_low
+    # echo Y > enabled
+
+.. [1] https://research.google/pubs/pub48551/
+.. [2] https://lwn.net/Articles/787611/
+.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html

From a460a36034bad4403c2c62e04a521bc6987ae5db Mon Sep 17 00:00:00 2001
From: Xin Hao
Date: Fri, 5 Nov 2021 13:48:04 -0700
Subject: [PATCH 504/512] mm/damon: remove unnecessary variable initialization

Patch series "mm/damon: Fix some small bugs", v4.

This patch (of 2):

In 'damon_va_apply_three_regions' there is no need to set variable 'i' to
zero.

Link: https://lkml.kernel.org/r/b7df8d3dad0943a37e01f60c441b1968b2b20354.1634720326.git.xhao@linux.alibaba.com
Link: https://lkml.kernel.org/r/cover.1634720326.git.xhao@linux.alibaba.com
Signed-off-by: Xin Hao
Reviewed-by: SeongJae Park
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/damon/vaddr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 675cd8c7df9b..35fe49080ee9 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -306,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
 		struct damon_addr_range bregions[3])
 {
 	struct damon_region *r, *next;
-	unsigned int i = 0;
+	unsigned int i;
 
 	/* Remove regions which are not in the three big regions now */
 	damon_for_each_region_safe(r, next, t) {

From b5ca3e83ddb05342b1b30700b999cb9b107511f6 Mon Sep 17 00:00:00 2001
From: Xin Hao
Date: Fri, 5 Nov 2021 13:48:07 -0700
Subject: [PATCH 505/512] mm/damon/dbgfs: add adaptive_targets list check
 before enable monitor_on

When the ctx->adaptive_targets list is empty, I did some tests on the
monitor_on interface like this.

# cat /sys/kernel/debug/damon/target_ids # # echo on > /sys/kernel/debug/damon/monitor_on # damon: kdamond (5390) starts Though the ctx->adaptive_targets list is empty, but the kthread_run still be called, and the kdamond.x thread still be created, this is meaningless. So there adds a judgment in 'dbgfs_monitor_on_write', if the ctx->adaptive_targets list is empty, return -EINVAL. Link: https://lkml.kernel.org/r/0a60a6e8ec9d71989e0848a4dc3311996ca3b5d4.1634720326.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 1 + mm/damon/core.c | 5 +++++ mm/damon/dbgfs.c | 15 ++++++++++++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c93325efddd7..fa7f32614b65 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -440,6 +440,7 @@ void damon_destroy_scheme(struct damos *s); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); unsigned int damon_nr_regions(struct damon_target *t); diff --git a/mm/damon/core.c b/mm/damon/core.c index 6993c60ae31c..46a6afea3030 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -180,6 +180,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) list_add_tail(&t->list, &ctx->adaptive_targets); } +bool damon_targets_empty(struct damon_ctx *ctx) +{ + return list_empty(&ctx->adaptive_targets); +} + static void damon_del_target(struct damon_target *t) { list_del(&t->list); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 6828e463348b..befb27a29aab 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -878,12 +878,21 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } - if (!strncmp(kbuf, "on", count)) + if (!strncmp(kbuf, "on", count)) { + int i; + + for (i = 0; i < dbgfs_nr_ctxs; i++) { + if (damon_targets_empty(dbgfs_ctxs[i])) { + kfree(kbuf); + return -EINVAL; + } + } ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); - else if (!strncmp(kbuf, "off", count)) + } else if (!strncmp(kbuf, "off", count)) { ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - else + } else { ret = -EINVAL; + } if (!ret) ret = count; From 82e3fff55d0010310d3ee9005fec366c9cb3836a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:10 -0700 Subject: [PATCH 506/512] Docs/admin-guide/mm/damon/start: fix wrong example commands Patch series "Fix trivial nits in Documentation/admin-guide/mm". This patchset fixes trivial nits in admin guide documents for DAMON and pagemap. This patch (of 4): Some of the example commands in DAMON getting started guide are outdated, missing sudo, or just wrong. This fixes those. Link: https://lkml.kernel.org/r/20211022090311.3856-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 51503cf90ca2..3ad8bbed9b18 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -19,7 +19,7 @@ your workload. 
:: # mount -t debugfs none /sys/kernel/debug/ # git clone https://github.com/awslabs/damo # ./damo/damo record $(pidof ) - # ./damo/damo report heat --plot_ascii + # ./damo/damo report heats --heatmap stdout The final command draws the access heatmap of ````. The heatmap shows which memory region (x-axis) is accessed when (y-axis) and how frequently @@ -94,9 +94,9 @@ Visualizing Recorded Patterns The following three commands visualize the recorded access patterns and save the results as separate image files. :: - $ damo report heats --heatmap access_pattern_heatmap.png - $ damo report wss --range 0 101 1 --plot wss_dist.png - $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + $ sudo damo report heats --heatmap access_pattern_heatmap.png + $ sudo damo report wss --range 0 101 1 --plot wss_dist.png + $ sudo damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png - ``access_pattern_heatmap.png`` will visualize the data access pattern in a heatmap, showing which memory region (y-axis) got accessed when (x-axis) @@ -115,9 +115,9 @@ Data Access Pattern Aware Memory Management Below three commands make every memory region of size >=4K that doesn't accessed for >=60 seconds in your workload to be swapped out. :: - $ echo "#min-size max-size min-acc max-acc min-age max-age action" > scheme - $ echo "4K max 0 0 60s max pageout" >> scheme - $ damo schemes -c my_thp_scheme + $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme + $ echo "4K max 0 0 60s max pageout" >> test_scheme + $ damo schemes -c test_scheme .. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns .. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html From 49ce7dee10891c709da769a731a45b42cf7c54f6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:13 -0700 Subject: [PATCH 507/512] Docs/admin-guide/mm/damon/start: fix a wrong link The 'Getting Started' of DAMON is providing a link to DAMON's user interface document while saying about its user space tool's detailed usages. This fixes the link. Link: https://lkml.kernel.org/r/20211022090311.3856-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 3ad8bbed9b18..5f3b22cafc76 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -6,7 +6,9 @@ Getting Started This document briefly describes how you can use DAMON by demonstrating its default user space tool. Please note that this document describes only a part -of its features for brevity. Please refer to :doc:`usage` for more details. +of its features for brevity. Please refer to the usage `doc +`_ of the tool for more +details. TL; DR From b1eee3c5486003b247127538210f15fd6ebb5ee5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:16 -0700 Subject: [PATCH 508/512] Docs/admin-guide/mm/damon/start: simplify the content Information in 'TL; DR' section of 'Getting Started' is duplicated in other parts of the doc. 
It is also asking readers to visit the access pattern visualizations gallery web site to show the results of example visualization commands, while the users of the commands can use terminal output. To make the doc simple, this removes the duplicated 'TL; DR' section and replaces the visualization example commands with versions using terminal outputs. Link: https://lkml.kernel.org/r/20211022090311.3856-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/start.rst | 107 ++++++++++--------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 5f3b22cafc76..4d5ca2c46288 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -11,38 +11,6 @@ of its features for brevity. Please refer to the usage `doc details. -TL; DR -====== - -Follow the commands below to monitor and visualize the memory access pattern of -your workload. :: - - # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot - # mount -t debugfs none /sys/kernel/debug/ - # git clone https://github.com/awslabs/damo - # ./damo/damo record $(pidof ) - # ./damo/damo report heats --heatmap stdout - -The final command draws the access heatmap of ````. The heatmap -shows which memory region (x-axis) is accessed when (y-axis) and how frequently -(number; the higher the more accesses have been observed). :: - - 111111111111111111111111111111111111111111111111111111110000 - 111121111111111111111111111111211111111111111111111111110000 - 000000000000000000000000000000000000000000000000001555552000 - 000000000000000000000000000000000000000000000222223555552000 - 000000000000000000000000000000000000000011111677775000000000 - 000000000000000000000000000000000000000488888000000000000000 - 000000000000000000000000000000000177888400000000000000000000 - 000000000000000000000000000046666522222100000000000000000000 - 000000000000000000000014444344444300000000000000000000000000 - 000000000000000002222245555510000000000000000000000000000000 - # access_frequency: 0 1 2 3 4 5 6 7 8 9 - # x-axis: space (140286319947776-140286426374096: 101.496 MiB) - # y-axis: time (605442256436361-605479951866441: 37.695430s) - # resolution: 60x10 (1.692 MiB and 3.770s for each character) - - Prerequisites ============= @@ -93,22 +61,66 @@ pattern in the ``damon.data`` file. Visualizing Recorded Patterns ============================= -The following three commands visualize the recorded access patterns and save -the results as separate image files. 
:: +You can visualize the pattern in a heatmap, showing which memory region +(x-axis) got accessed when (y-axis) and how frequently (number).:: - $ sudo damo report heats --heatmap access_pattern_heatmap.png - $ sudo damo report wss --range 0 101 1 --plot wss_dist.png - $ sudo damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + $ sudo damo report heats --heatmap stdout + 22222222222222222222222222222222222222211111111111111111111111111111111111111100 + 44444444444444444444444444444444444444434444444444444444444444444444444444443200 + 44444444444444444444444444444444444444433444444444444444444444444444444444444200 + 33333333333333333333333333333333333333344555555555555555555555555555555555555200 + 33333333333333333333333333333333333344444444444444444444444444444444444444444200 + 22222222222222222222222222222222222223355555555555555555555555555555555555555200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 33333333333333333333333333333333333333355555555555555555555555555555555555555200 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 33333333333333333333333333333333333333444444444444444444444444444444444444443200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + [...] + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (139728247021568-139728453431248: 196.848 MiB) + # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s) + # resolution: 80x40 (2.461 MiB and 1.758 s for each character) -- ``access_pattern_heatmap.png`` will visualize the data access pattern in a - heatmap, showing which memory region (y-axis) got accessed when (x-axis) - and how frequently (color). -- ``wss_dist.png`` will show the distribution of the working set size. -- ``wss_chron_change.png`` will show how the working set size has - chronologically changed. +You can also visualize the distribution of the working set size, sorted by the +size.:: -You can view the visualizations of this example workload at [1]_. -Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. 
+ $ sudo damo report wss --range 0 101 10 + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 0 B | | + 10 95.328 MiB |**************************** | + 20 95.332 MiB |**************************** | + 30 95.340 MiB |**************************** | + 40 95.387 MiB |**************************** | + 50 95.387 MiB |**************************** | + 60 95.398 MiB |**************************** | + 70 95.398 MiB |**************************** | + 80 95.504 MiB |**************************** | + 90 190.703 MiB |********************************************************* | + 100 196.875 MiB |***********************************************************| + +Using ``--sortby`` option with the above command, you can show how the working +set size has chronologically changed.:: + + $ sudo damo report wss --range 0 101 10 --sortby time + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 3.051 MiB | | + 10 190.703 MiB |***********************************************************| + 20 95.336 MiB |***************************** | + 30 95.328 MiB |***************************** | + 40 95.387 MiB |***************************** | + 50 95.332 MiB |***************************** | + 60 95.320 MiB |***************************** | + 70 95.398 MiB |***************************** | + 80 95.398 MiB |***************************** | + 90 95.340 MiB |***************************** | + 100 95.398 MiB |***************************** | Data Access Pattern Aware Memory Management @@ -120,8 +132,3 @@ accessed for >=60 seconds in your workload to be swapped out. :: $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme $ echo "4K max 0 0 60s max pageout" >> test_scheme $ damo schemes -c test_scheme - -.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns -.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html -.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html -.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html From 0d16cfd46b48689de6a5ca594bc1a68105f6658b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:48:19 -0700 Subject: [PATCH 509/512] Docs/admin-guide/mm/pagemap: wordsmith page flags descriptions Some descriptions of page flags in 'pagemap.rst' are written in assumption of none-rst, which respects every new line, as below: 7 - SLAB page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator When compound page is used, SLUB/SLQB will only set this flag on the head Because rst ignores the new line between the first sentence and second sentence, resulting html looks a little bit weird, as below. 7 - SLAB page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator When ^ compound page is used, SLUB/SLQB will only set this flag on the head page; SLOB will not flag it at all. This change makes it more natural and consistent with other parts in the rendered version. 
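
For instance, a minimal sketch of the pattern applied below: making the
first line of such an entry a complete sentence keeps the rendered text
natural even though rst joins the following source lines into one paragraph:

    7 - SLAB
       The page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator.
       When compound page is used, SLUB/SLQB will only set this flag on the
       head page; SLOB will not flag it at all.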
Link: https://lkml.kernel.org/r/20211022090311.3856-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 53 ++++++++++++------------ 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 4581527c07ae..bfc28704856c 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -90,13 +90,14 @@ Short descriptions to the page flags ==================================== 0 - LOCKED - page is being locked for exclusive access, e.g. by undergoing read/write IO + The page is being locked for exclusive access, e.g. by undergoing read/write + IO. 7 - SLAB - page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator + The page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator. When compound page is used, SLUB/SLQB will only set this flag on the head page; SLOB will not flag it at all. 10 - BUDDY - a free memory block managed by the buddy system allocator + A free memory block managed by the buddy system allocator. The buddy system organizes free memory in blocks of various orders. An order N block has 2^N physically contiguous pages, with the BUDDY flag set for and _only_ for the first page. @@ -112,65 +113,65 @@ Short descriptions to the page flags 16 - COMPOUND_TAIL A compound page tail (see description above). 17 - HUGE - this is an integral part of a HugeTLB page + This is an integral part of a HugeTLB page. 19 - HWPOISON - hardware detected memory corruption on this page: don't touch the data! + Hardware detected memory corruption on this page: don't touch the data! 20 - NOPAGE - no page frame exists at the requested address + No page frame exists at the requested address. 21 - KSM - identical memory pages dynamically shared between one or more processes + Identical memory pages dynamically shared between one or more processes. 22 - THP - contiguous pages which construct transparent hugepages + Contiguous pages which construct transparent hugepages. 23 - OFFLINE - page is logically offline + The page is logically offline. 24 - ZERO_PAGE - zero page for pfn_zero or huge_zero page + Zero page for pfn_zero or huge_zero page. 25 - IDLE - page has not been accessed since it was marked idle (see + The page has not been accessed since it was marked idle (see :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst `). Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. 26 - PGTABLE - page is in use as a page table + The page is in use as a page table. IO related page flags --------------------- 1 - ERROR - IO error occurred + IO error occurred. 3 - UPTODATE - page has up-to-date data + The page has up-to-date data. ie. for file backed page: (in-memory data revision >= on-disk one) 4 - DIRTY - page has been written to, hence contains new data + The page has been written to, hence contains new data. i.e. for file backed page: (in-memory data revision > on-disk one) 8 - WRITEBACK - page is being synced to disk + The page is being synced to disk. LRU related page flags ---------------------- 5 - LRU - page is in one of the LRU lists + The page is in one of the LRU lists. 6 - ACTIVE - page is in the active LRU list + The page is in the active LRU list. 
18 - UNEVICTABLE - page is in the unevictable (non-)LRU list It is somehow pinned and + The page is in the unevictable (non-)LRU list It is somehow pinned and not a candidate for LRU page reclaims, e.g. ramfs pages, - shmctl(SHM_LOCK) and mlock() memory segments + shmctl(SHM_LOCK) and mlock() memory segments. 2 - REFERENCED - page has been referenced since last LRU list enqueue/requeue + The page has been referenced since last LRU list enqueue/requeue. 9 - RECLAIM - page will be reclaimed soon after its pageout IO completed + The page will be reclaimed soon after its pageout IO completed. 11 - MMAP - a memory mapped page + A memory mapped page. 12 - ANON - a memory mapped page that is not part of a file + A memory mapped page that is not part of a file. 13 - SWAPCACHE - page is mapped to swap space, i.e. has an associated swap entry + The page is mapped to swap space, i.e. has an associated swap entry. 14 - SWAPBACKED - page is backed by swap/RAM + The page is backed by swap/RAM. The page-types tool in the tools/vm directory can be used to query the above flags. From 0f91d13366a402420bf98eaaf393db03946c13e0 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:48:22 -0700 Subject: [PATCH 510/512] mm/damon: simplify stop mechanism A kernel thread can exit gracefully with kthread_stop(). So we don't need a new flag 'kdamond_stop'. And to make sure the task struct is not freed when accessing it, get reference to it before termination. Link: https://lkml.kernel.org/r/20211027130517.4404-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 1 - mm/damon/core.c | 51 +++++++++++++------------------------------ 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index fa7f32614b65..321de9d72360 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -381,7 +381,6 @@ struct damon_ctx { /* public: */ struct task_struct *kdamond; - bool kdamond_stop; struct mutex kdamond_lock; struct damon_primitive primitive; diff --git a/mm/damon/core.c b/mm/damon/core.c index 46a6afea3030..f37c17b53814 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -390,17 +390,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) return sz; } -static bool damon_kdamond_running(struct damon_ctx *ctx) -{ - bool running; - - mutex_lock(&ctx->kdamond_lock); - running = ctx->kdamond != NULL; - mutex_unlock(&ctx->kdamond_lock); - - return running; -} - static int kdamond_fn(void *data); /* @@ -418,7 +407,6 @@ static int __damon_start(struct damon_ctx *ctx) mutex_lock(&ctx->kdamond_lock); if (!ctx->kdamond) { err = 0; - ctx->kdamond_stop = false; ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { @@ -474,13 +462,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs) */ static int __damon_stop(struct damon_ctx *ctx) { + struct task_struct *tsk; + mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ctx->kdamond_stop = true; + tsk = ctx->kdamond; + if (tsk) { + get_task_struct(tsk); mutex_unlock(&ctx->kdamond_lock); - while (damon_kdamond_running(ctx)) - usleep_range(ctx->sample_interval, - ctx->sample_interval * 2); + kthread_stop(tsk); + put_task_struct(tsk); return 0; } mutex_unlock(&ctx->kdamond_lock); @@ -925,12 +915,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) static bool kdamond_need_stop(struct damon_ctx *ctx) { struct damon_target 
*t; - bool stop; - mutex_lock(&ctx->kdamond_lock); - stop = ctx->kdamond_stop; - mutex_unlock(&ctx->kdamond_lock); - if (stop) + if (kthread_should_stop()) return true; if (!ctx->primitive.target_valid) @@ -1021,13 +1007,6 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) return -EBUSY; } -static void set_kdamond_stop(struct damon_ctx *ctx) -{ - mutex_lock(&ctx->kdamond_lock); - ctx->kdamond_stop = true; - mutex_unlock(&ctx->kdamond_lock); -} - /* * The monitoring daemon that runs as a kernel thread */ @@ -1038,17 +1017,18 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; + bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->primitive.init) ctx->primitive.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - set_kdamond_stop(ctx); + done = true; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx)) { + while (!kdamond_need_stop(ctx) && !done) { if (kdamond_wait_activation(ctx)) continue; @@ -1056,7 +1036,7 @@ static int kdamond_fn(void *data) ctx->primitive.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) - set_kdamond_stop(ctx); + done = true; usleep_range(ctx->sample_interval, ctx->sample_interval + 1); @@ -1069,7 +1049,7 @@ static int kdamond_fn(void *data) sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) - set_kdamond_stop(ctx); + done = true; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1088,9 +1068,8 @@ static int kdamond_fn(void *data) damon_destroy_region(r, t); } - if (ctx->callback.before_terminate && - ctx->callback.before_terminate(ctx)) - set_kdamond_stop(ctx); + if (ctx->callback.before_terminate) + ctx->callback.before_terminate(ctx); if (ctx->primitive.cleanup) ctx->primitive.cleanup(ctx); From 0107865541961ee128149c9873996d32143a74d0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Nov 2021 13:48:24 -0700 Subject: [PATCH 511/512] mm/damon: fix a few spelling mistakes in comments and a pr_debug message There are a few spelling mistakes in the code. Fix these. Link: https://lkml.kernel.org/r/20211028184157.614544-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 2 +- mm/damon/dbgfs-test.h | 2 +- mm/damon/vaddr-test.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index f37c17b53814..c381b3c525d0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -959,7 +959,7 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) /* higher than high watermark or lower than low watermark */ if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { if (scheme->wmarks.activated) - pr_debug("inactivate a scheme (%d) for %s wmark\n", + pr_debug("deactivate a scheme (%d) for %s wmark\n", scheme->action, metric > scheme->wmarks.high ? 
"high" : "low"); diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 104b22957616..86b9f9528231 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -145,7 +145,7 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, expect); } - /* Put invlid inputs and check the return error code */ + /* Put invalid inputs and check the return error code */ for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { input = invalid_inputs[i]; pr_info("input: %s\n", input); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1f5c13257dba..ecfd0b2ed222 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -233,7 +233,7 @@ static void damon_test_apply_three_regions3(struct kunit *test) * and 70-100) has totally freed and mapped to different area (30-32 and * 65-68). The target regions which were in the old second and third big * regions should now be removed and new target regions covering the new second - * and third big regions should be crated. + * and third big regions should be created. */ static void damon_test_apply_three_regions4(struct kunit *test) { From 658f9ae761b5965893727dd4edcdad56e5a439bb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:48:27 -0700 Subject: [PATCH 512/512] mm/damon: remove return value from before_terminate callback Since the return value of 'before_terminate' callback is never used, we make it have no return value. Link: https://lkml.kernel.org/r/20211029005023.8895-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- mm/damon/dbgfs.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 321de9d72360..b4d4be3cc987 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -322,7 +322,7 @@ struct damon_callback { int (*before_start)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_terminate)(struct damon_ctx *context); + void (*before_terminate)(struct damon_ctx *context); }; /** diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index befb27a29aab..eccc14b34901 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -645,18 +645,17 @@ static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); } -static int dbgfs_before_terminate(struct damon_ctx *ctx) +static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; if (!targetid_is_pid(ctx)) - return 0; + return; damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } - return 0; } static struct damon_ctx *dbgfs_new_ctx(void)