diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index bcc80269bb6a..b8c0460730f3 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1329,6 +1329,10 @@ PAGE_SIZE multiple when read back. workingset_activate Number of refaulted pages that were immediately activated + workingset_restore + Number of restored pages which have been detected as an active + workingset before they got reclaimed. + workingset_nodereclaim Number of times a shadow node has been reclaimed @@ -1370,6 +1374,22 @@ PAGE_SIZE multiple when read back. The total amount of swap currently being used by the cgroup and its descendants. + memory.swap.high + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Swap usage throttle limit. If a cgroup's swap usage exceeds + this limit, all its further allocations will be throttled to + allow userspace to implement custom out-of-memory procedures. + + This limit marks a point of no return for the cgroup. It is NOT + designed to manage the amount of swapping a workload does + during regular operation. Compare to memory.swap.max, which + prohibits swapping past a set amount, but lets the cgroup + continue unimpeded as long as other memory can be reclaimed. + + Healthy workloads are not expected to reach this limit. + memory.swap.max A read-write single value file which exists on non-root cgroups. The default is "max". @@ -1383,6 +1403,10 @@ PAGE_SIZE multiple when read back. otherwise, a value change in this file generates a file modified event. + high + The number of times the cgroup's swap usage was over + the high threshold. + max The number of times the cgroup's swap usage was about to go over the max boundary and swap allocation diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 93cb65d52720..a1582cc79f0f 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -213,7 +213,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'. - The first of these two routines is invoked after map_vm_area() + The first of these two routines is invoked after map_kernel_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries. diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 5057e4d9dcd1..0af2e0e11461 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,6 +239,7 @@ prototypes:: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -271,7 +272,8 @@ writepage: yes, unlocks (see below) readpage: yes, unlocks writepages: set_page_dirty no -readpages: +readahead: yes, unlocks +readpages: no write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -295,6 +297,8 @@ the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. +->readahead() unlocks the pages that I/O is attempted on like ->readpage(). + ->readpages() populates the pagecache with the passed pages and starts I/O against them. They come unlocked upon I/O completion. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 06d41c0b91cc..430963e0e8c3 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1043,8 +1043,8 @@ PageTables amount of memory dedicated to the lowest level of page tables. NFS_Unstable - NFS pages sent to the server, but not yet committed to stable - storage + Always zero. Previous counted pages which had been written to + the server, but has not been committed to stable storage. Bounce Memory used for block device "bounce buffers" WritebackTmp diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7d4d09dd5e6d..ed17771c212b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -781,12 +782,26 @@ cache in your filesystem. The following members are defined: If defined, it should set the PageDirty flag, and the PAGECACHE_TAG_DIRTY tag in the radix tree. +``readahead`` + Called by the VM to read pages associated with the address_space + object. The pages are consecutive in the page cache and are + locked. The implementation should decrement the page refcount + after starting I/O on each page. Usually the page will be + unlocked by the I/O completion handler. If the filesystem decides + to stop attempting I/O before reaching the end of the readahead + window, it can simply return. The caller will decrement the page + refcount and unlock the remaining pages for you. Set PageUptodate + if the I/O completes successfully. Setting PageError on any page + will be ignored; simply unlock the page if an I/O error occurs. + ``readpages`` called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. + This interface is deprecated and will be removed by the end of + 2020; implement readahead instead. ``write_begin`` Called by the generic buffered write code to ask the filesystem diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 933ada4368ff..4eee598555c9 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -49,7 +49,7 @@ Possible debug options are:: P Poisoning (object and padding) U User tracking (free and alloc) T Trace (please only use on single slabs) - A Toggle failslab filter mark for the cache + A Enable failslab filter mark for the cache O Switch debugging off for caches that would have caused higher minimum slab orders - Switch all debugging off (useful if the kernel is diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 8b83d4a5d309..fe383f5a92fb 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,7 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_PGTABLE_MAPPING=y +CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e50e4dda90c2..dae0466d19d6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) +#define pgprot_nx(prot) \ + __pgprot_modify(prot, 0, PTE_PXN) + /* * Mark the prot value as uncacheable and unbufferable. */ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 0a12115d9638..0cc6636e3f15 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node_range(stack_size, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, 0, node, - __builtin_return_address(0)); + return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + __builtin_return_address(0)); } #endif /* __ASM_VMAP_STACK_H */ diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 78163b7a7dde..0da020c563e6 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -252,7 +252,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); static const char units[] = "KMGTPE"; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 9832c73a7021..cd7df48dc874 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -93,10 +93,8 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #define set_pte_at(mm, addr, ptep, pteval) \ do { \ - pte_t old_pte; \ unsigned long flags; \ spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\ - old_pte = *ptep; \ set_pte(ptep, pteval); \ purge_tlb_entries(mm, addr); \ spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\ diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 635969b5b58e..13f90dd03450 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -699,10 +699,6 @@ static inline void iosync(void) * * * iounmap undoes such a mapping and can be hooked * - * * __ioremap_at (and the pending __iounmap_at) are low level functions to - * create hand-made mappings for use only by the PCI code and cannot - * currently be hooked. Must be page aligned. - * * * __ioremap_caller is the same as above but takes an explicit caller * reference rather than using __builtin_return_address(0) * @@ -719,6 +715,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size); + int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, @@ -727,10 +725,6 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); -extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, - unsigned long size, pgprot_t prot); -extern void __iounmap_at(void *ea, unsigned long size); - /* * When CONFIG_PPC_INDIRECT_PIO is set, we use the generic iomap implementation * which needs some additional definitions here. They basically allow PIO diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 69f4cb3b7c56..b92e81b256e5 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -66,7 +66,7 @@ struct pci_controller { void __iomem *io_base_virt; #ifdef CONFIG_PPC64 - void *io_base_alloc; + void __iomem *io_base_alloc; #endif resource_size_t io_base_phys; resource_size_t pci_io_size; diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 1f1169856dc8..112d150354b2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -748,9 +748,8 @@ void do_IRQ(struct pt_regs *regs) static void *__init alloc_vm_stack(void) { - return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, - VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, - 0, NUMA_NO_NODE, (void*)_RET_IP_); + return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP, + NUMA_NO_NODE, (void *)_RET_IP_); } static void __init vmap_irqstack_init(void) diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 773671b512df..2257d24e6a26 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,22 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev); #define ISA_SPACE_MASK 0x1 #define ISA_SPACE_IO 0x1 +static void remap_isa_base(phys_addr_t pa, unsigned long size) +{ + WARN_ON_ONCE(ISA_IO_BASE & ~PAGE_MASK); + WARN_ON_ONCE(pa & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + if (slab_is_available()) { + if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, + pgprot_noncached(PAGE_KERNEL))) + unmap_kernel_range(ISA_IO_BASE, size); + } else { + early_ioremap_range(ISA_IO_BASE, pa, size, + pgprot_noncached(PAGE_KERNEL)); + } +} + static void pci_process_ISA_OF_ranges(struct device_node *isa_node, unsigned long phb_io_base_phys) { @@ -105,15 +122,13 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, if (size > 0x10000) size = 0x10000; - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(phb_io_base_phys, size); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(phb_io_base_phys, 0x10000); } @@ -248,8 +263,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) * and map it */ isa_io_base = ISA_IO_BASE; - __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(pbase, size); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } @@ -297,7 +311,7 @@ static void isa_bridge_remove(void) isa_bridge_pcidev = NULL; /* Unmap the ISA area */ - __iounmap_at((void *)ISA_IO_BASE, 0x10000); + unmap_kernel_range(ISA_IO_BASE, 0x10000); } /** diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index f83d1f69b1dd..d9ac980c398c 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -109,23 +109,47 @@ int pcibios_unmap_io_space(struct pci_bus *bus) /* Get the host bridge */ hose = pci_bus_to_host(bus); - /* Check if we have IOs allocated */ - if (hose->io_base_alloc == NULL) - return 0; - pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); - /* This is a PHB, we fully unmap the IO area */ - vunmap(hose->io_base_alloc); - + iounmap(hose->io_base_alloc); return 0; } EXPORT_SYMBOL_GPL(pcibios_unmap_io_space); -static int pcibios_map_phb_io_space(struct pci_controller *hose) +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) { struct vm_struct *area; + unsigned long addr; + + WARN_ON_ONCE(paddr & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + /* + * Let's allocate some IO space for that guy. We don't pass VM_IOREMAP + * because we don't care about alignment tricks that the core does in + * that case. Maybe we should due to stupid card with incomplete + * address decoding but I'd rather not deal with those outside of the + * reserved 64K legacy region. + */ + area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + __builtin_return_address(0)); + if (!area) + return NULL; + + addr = (unsigned long)area->addr; + if (ioremap_page_range(addr, addr + size, paddr, + pgprot_noncached(PAGE_KERNEL))) { + unmap_kernel_range(addr, size); + return NULL; + } + + return (void __iomem *)addr; +} +EXPORT_SYMBOL_GPL(ioremap_phb); + +static int pcibios_map_phb_io_space(struct pci_controller *hose) +{ unsigned long phys_page; unsigned long size_page; unsigned long io_virt_offset; @@ -146,12 +170,11 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) * with incomplete address decoding but I'd rather not deal with * those outside of the reserved 64K legacy region. */ - area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) + hose->io_base_alloc = ioremap_phb(phys_page, size_page); + if (!hose->io_base_alloc) return -ENOMEM; - hose->io_base_alloc = area->addr; - hose->io_base_virt = (void __iomem *)(area->addr + - hose->io_base_phys - phys_page); + hose->io_base_virt = hose->io_base_alloc + + hose->io_base_phys - phys_page; pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", @@ -159,11 +182,6 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", hose->pci_io_size, size_page); - /* Establish the mapping */ - if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_noncached(PAGE_KERNEL)) == NULL) - return -ENOMEM; - /* Fixup hose IO resource */ io_virt_offset = pcibios_io_space_offset(hose); hose->io_resource.start += io_virt_offset; diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 50a99d9684f7..ba5cbb0d66bd 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -4,56 +4,6 @@ #include #include -/** - * Low level function to establish the page tables for an IO mapping - */ -void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) -{ - int ret; - unsigned long va = (unsigned long)ea; - - /* We don't support the 4K PFN hack with ioremap */ - if (pgprot_val(prot) & H_PAGE_4K_PFN) - return NULL; - - if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outside the supported range\n"); - return NULL; - } - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - if (slab_is_available()) { - ret = ioremap_page_range(va, va + size, pa, prot); - if (ret) - unmap_kernel_range(va, size); - } else { - ret = early_ioremap_range(va, pa, size, prot); - } - - if (ret) - return NULL; - - return (void __iomem *)ea; -} -EXPORT_SYMBOL(__ioremap_at); - -/** - * Low level function to tear down the page tables for an IO mapping. This is - * used for mappings that are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. - */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} -EXPORT_SYMBOL(__iounmap_at); - void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..d50706ea1c94 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -473,9 +473,9 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define PAGE_SHARED __pgprot(0) #define PAGE_KERNEL __pgprot(0) #define swapper_pg_dir NULL +#define TASK_SIZE 0xffffffffUL #define VMALLOC_START 0 - -#define TASK_SIZE 0xffffffffUL +#define VMALLOC_END TASK_SIZE static inline void __kernel_map_pages(struct page *page, int numpages, int enable) {} diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 7eab76a93106..070505d79b06 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -204,7 +204,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, - int level, unsigned long val) + int level, u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); u64 pa = PFN_PHYS(pte_pfn(__pte(val))); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 36445dd40fdb..0f0b140b5558 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,12 +305,9 @@ void *restart_stack __section(.data); unsigned long stack_alloc(void) { #ifdef CONFIG_VMAP_STACK - return (unsigned long) - __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE, + THREADINFO_GFP, NUMA_NO_NODE, + __builtin_return_address(0)); #else return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); #endif diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index 934ff84844fa..d432164b23b7 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX); + vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index acf76b466db6..e2137070386a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -97,8 +97,7 @@ static int hv_cpu_init(unsigned int cpu) * not be stopped in the case of CPU offlining and the VM will hang. */ if (!*hvp) { - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); } if (*hvp) { @@ -379,7 +378,7 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + hv_hypercall_pg = vmalloc_exec(PAGE_SIZE); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0a6b35353fc7..e94b3de564d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1279,8 +1279,7 @@ extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, - GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 6deb6cd236e3..7f6ccff0ba72 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -20,6 +20,8 @@ typedef union { #define SHARED_KERNEL_PMD 0 +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED + /* * traditional i386 two-level paging structure: */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 33845d36897c..80fbb4a9ed87 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,6 +27,8 @@ typedef union { #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif +#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED) + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..8f63efb2a2cc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b6606fe6cfdf..2e7c442cc618 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -194,7 +194,6 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) @@ -220,7 +219,6 @@ enum page_cache_mode { #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) -#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) @@ -284,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +static inline pgprot_t pgprot_nx(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | _PAGE_NX); +} +#define pgprot_nx pgprot_nx + #ifdef CONFIG_X86_PAE /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 0e059b73437b..9f69cc497f4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -/* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *next) -{ -#ifdef CONFIG_VMAP_STACK - /* - * If we switch to a stack that has a top-level paging entry - * that is not present in the current mm, the resulting #PF will - * will be promoted to a double-fault and we'll panic. Probe - * the new stack now so that vmalloc_fault can fix up the page - * tables if needed. This can only happen if we use a stack - * in vmap space. - * - * We assume that the stack is aligned so that it never spans - * more than one top-level paging entry. - * - * To minimize cache pollution, just follow the stack pointer. - */ - READ_ONCE(*(unsigned char *)next->thread.sp); -#endif -} - asmlinkage void ret_from_fork(void); /* @@ -67,8 +46,6 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(next); \ - \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 12df3a4abfdd..6b32ab009c19 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu) pages[i] = pfn_to_page(pa >> PAGE_SHIFT); } - va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); if (!va) return -ENOMEM; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..fd945ce78554 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89f7f3aebd31..5573a97f1520 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -336,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 69309cd56fdf..ea9010113f69 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr) (void *)st->start_address); } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) { - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; } /* @@ -261,7 +273,7 @@ static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) * print what we collected so far. */ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); pgprotval_t new_prot, new_eff; @@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, struct seq_file *m = st->seq; new_prot = val & PTE_FLAGS_MASK; - - if (level > 0) { - new_eff = effective_prot(st->prot_levels[level - 1], - new_prot); - } else { - new_eff = new_prot; - } - - if (level >= 0) - st->prot_levels[level] = new_eff; + if (!val) + new_eff = 0; + else + new_eff = st->prot_levels[level]; /* * If we have a "break" in the series, we need to flush the state that @@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { .note_page = note_page, + .effective_prot = effective_prot, .range = ptdump_ranges }, .level = -1, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..dffe8e4d3140 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,16 +190,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; + unsigned long addr; - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -210,61 +207,13 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -329,96 +278,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. */ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -1257,29 +1116,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). - */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8b5f73f5e207..96274a90c5ff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -218,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + sync_global_pgds(start, end); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 843aa10a4cb6..da0fb17a1a36 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void) * the sp1 and sp2 slots. * * This is done for all possible CPUs during boot to ensure - * that it's propagated to all mms. If we were to add one of - * these mappings during CPU hotplug, we would need to take - * some measure to make sure that every mm that subsequently - * ran on that CPU would have the relevant PGD entry in its - * pagetables. The usual vmalloc_fault() mechanism would not - * work for page faults taken in entry_SYSCALL_64 before RSP - * is set up. + * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..f3fe261e5936 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,34 +161,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) -{ - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. - */ - p4d_t *p4d = p4d_offset(pgd, sp); - - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); - - set_p4d(p4d, *p4d_ref); - } - } -} - static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; @@ -377,15 +349,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ cond_ibpb(tsk); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } - /* * Stop remote flushes for the previous mm. * Skip kernel threads; we never send init_mm TLB flushing IPIs, diff --git a/block/blk-core.c b/block/blk-core.c index 9bfaee050c82..38d7b1f16067 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 24c9642e8fc7..aabe9c5ee515 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -167,12 +167,6 @@ int ghes_estatus_pool_init(int num_ghes) if (!addr) goto err_pool_alloc; - /* - * New allocation must be visible in all pgd before it can be found by - * an NMI allocating from the pool. - */ - vmalloc_sync_mappings(); - rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1); if (rc) goto err_pool_add; diff --git a/drivers/base/node.c b/drivers/base/node.c index 50b8c0d43859..5b02f69769e8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -445,7 +445,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB), #endif nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), - nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), nid, K(sreclaimable + diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 15e99697234a..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) bytes = sizeof(struct page *)*want; new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_ZERO, - PAGE_KERNEL); + new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO); if (!new_pages) return NULL; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..d89c25ba3b89 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -919,7 +919,7 @@ static void loop_unprepare_queue(struct loop_device *lo) static int loop_kthread_worker_fn(void *worker_ptr) { - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO; + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; return kthread_worker_fn(worker_ptr); } diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1af823b2fe6b..4c0af2eb7e19 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -377,6 +377,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c index ca520028b2cb..f4e6184d1877 100644 --- a/drivers/gpu/drm/drm_scatter.c +++ b/drivers/gpu/drm/drm_scatter.c @@ -43,15 +43,6 @@ #define DEBUG_SCATTER 0 -static inline void *drm_vmalloc_dma(unsigned long size) -{ -#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) - return __vmalloc(size, GFP_KERNEL, pgprot_noncached_wc(PAGE_KERNEL)); -#else - return vmalloc_32(size); -#endif -} - static void drm_sg_cleanup(struct drm_sg_mem * entry) { struct page *page; @@ -126,7 +117,7 @@ int drm_legacy_sg_alloc(struct drm_device *dev, void *data, return -ENOMEM; } - entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT); + entry->virtual = vmalloc_32(pages << PAGE_SHIFT); if (!entry->virtual) { kfree(entry->busaddr); kfree(entry->pagelist); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 648cf0207309..706af0304ca4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit) file_size += sizeof(*iter.hdr) * n_obj; /* Allocate the file in vmalloc memory, it's likely to be big */ - iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); + iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_NORETRY); if (!iter.start) { mutex_unlock(&gpu->mmu_context->lock); dev_warn(gpu->dev, "failed to allocate devcoredump file\n"); diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c index 9272bef57092..debaf7b18ab5 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c @@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct dma_buf *dma_buf) { struct mock_dmabuf *mock = to_mock(dma_buf); - return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL); + return vm_map_ram(mock->pages, mock->npages, 0); } static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 9a967a2e83dd..6e677ff62cc9 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -145,9 +145,8 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) int ret = 0; map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN - | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM, - PAGE_KERNEL); + pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); if (!pblk->trans_map) { pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", map_size); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..d1786cfd7f22 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -400,13 +400,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask); } /* diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b952bd45bd6a..95a5f3757fa3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -324,14 +324,6 @@ static void end_bitmap_write(struct buffer_head *bh, int uptodate) wake_up(&bitmap->write_wait); } -/* copied from buffer.c */ -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} static void free_buffers(struct page *page) { struct buffer_head *bh; @@ -345,7 +337,7 @@ static void free_buffers(struct page *page) free_buffer_head(bh); bh = next; } - __clear_page_buffers(page); + detach_page_private(page); put_page(page); } @@ -374,7 +366,7 @@ static int read_page(struct file *file, unsigned long index, ret = -ENOMEM; goto out; } - attach_page_buffers(page, bh); + attach_page_private(page, bh); blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); while (bh) { block = blk_cur; diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 6db60e9d5183..92072a08af25 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv) if (buf->db_attach) buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf); else - buf->vaddr = vm_map_ram(buf->pages, - buf->num_pages, -1, PAGE_KERNEL); + buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1); } /* add offset in case userptr is not page-aligned */ diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 1a4f0ca87c7c..c66fda4a65e4 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, unsigned long vaddr, buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { - buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1, - PAGE_KERNEL); + buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 5f8883031c9c..0d8372cc364a 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -92,7 +92,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, { struct ivtv_dma_page_info user_dma; struct ivtv_user_dma *dma = &itv->udma; - int i, err; + int err; IVTV_DEBUG_DMA("ivtv_udma_setup, dst: 0x%08x\n", (unsigned int)ivtv_dest_addr); @@ -111,16 +111,15 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, return -EINVAL; } - /* Get user pages for DMA Xfer */ - err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, + /* Pin user pages for DMA Xfer */ + err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, dma->map, FOLL_FORCE); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", err, user_dma.page_count); if (err >= 0) { - for (i = 0; i < err; i++) - put_page(dma->map[i]); + unpin_user_pages(dma->map, err); return -EINVAL; } return err; @@ -130,9 +129,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, /* Fill SG List with new values */ if (ivtv_udma_fill_sg_list(dma, &user_dma, 0) < 0) { - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; return -ENOMEM; } @@ -153,7 +150,6 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, void ivtv_udma_unmap(struct ivtv *itv) { struct ivtv_user_dma *dma = &itv->udma; - int i; IVTV_DEBUG_INFO("ivtv_unmap_user_dma\n"); @@ -169,10 +165,7 @@ void ivtv_udma_unmap(struct ivtv *itv) /* sync DMA */ ivtv_udma_sync_for_cpu(itv); - /* Release User Pages */ - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; } diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index cd2fe2d444c0..5f7dc9771f8d 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -30,7 +30,6 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, struct yuv_playback_info *yi = &itv->yuv_info; u8 frame = yi->draw_frame; struct yuv_frame_info *f = &yi->new_frame_info[frame]; - int i; int y_pages, uv_pages; unsigned long y_buffer_offset, uv_buffer_offset; int y_decode_height, uv_decode_height, y_size; @@ -62,12 +61,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, ivtv_udma_get_page_info (&y_dma, (unsigned long)args->y_source, 720 * y_decode_height); ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height); - /* Get user pages for DMA Xfer */ - y_pages = get_user_pages_unlocked(y_dma.uaddr, + /* Pin user pages for DMA Xfer */ + y_pages = pin_user_pages_unlocked(y_dma.uaddr, y_dma.page_count, &dma->map[0], FOLL_FORCE); uv_pages = 0; /* silence gcc. value is set and consumed only if: */ if (y_pages == y_dma.page_count) { - uv_pages = get_user_pages_unlocked(uv_dma.uaddr, + uv_pages = pin_user_pages_unlocked(uv_dma.uaddr, uv_dma.page_count, &dma->map[y_pages], FOLL_FORCE); } @@ -81,8 +80,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, uv_pages, uv_dma.page_count); if (uv_pages >= 0) { - for (i = 0; i < uv_pages; i++) - put_page(dma->map[y_pages + i]); + unpin_user_pages(&dma->map[y_pages], uv_pages); rc = -EFAULT; } else { rc = uv_pages; @@ -93,8 +91,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, y_pages, y_dma.page_count); } if (y_pages >= 0) { - for (i = 0; i < y_pages; i++) - put_page(dma->map[i]); + unpin_user_pages(dma->map, y_pages); /* * Inherit the -EFAULT from rc's * initialization, but allow it to be @@ -112,9 +109,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, /* Fill & map SG List */ if (ivtv_udma_fill_sg_list (dma, &uv_dma, ivtv_udma_fill_sg_list (dma, &y_dma, 0)) < 0) { IVTV_DEBUG_WARN("could not allocate bounce buffers for highmem userspace buffers\n"); - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; return -ENOMEM; } diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c index 0c2859844081..e2d56dca5be4 100644 --- a/drivers/media/pci/ivtv/ivtvfb.c +++ b/drivers/media/pci/ivtv/ivtvfb.c @@ -281,10 +281,10 @@ static int ivtvfb_prep_dec_dma_to_device(struct ivtv *itv, /* Map User DMA */ if (ivtv_udma_setup(itv, ivtv_dest_addr, userbuf, size_in_bytes) <= 0) { mutex_unlock(&itv->udma.lock); - IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with get_user_pages: %d bytes, %d pages returned\n", + IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with pin_user_pages: %d bytes, %d pages returned\n", size_in_bytes, itv->udma.page_count); - /* get_user_pages must have failed completely */ + /* pin_user_pages must have failed completely */ return -EIO; } diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index b57b84fb97d0..14d890b00d2c 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1297,7 +1297,7 @@ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, if (!ubi_dbg_chk_io(ubi)) return 0; - buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf1 = __vmalloc(len, GFP_NOFS); if (!buf1) { ubi_err(ubi, "cannot allocate memory to check writes"); return 0; @@ -1361,7 +1361,7 @@ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) if (!ubi_dbg_chk_io(ubi)) return 0; - buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(len, GFP_NOFS); if (!buf) { ubi_err(ubi, "cannot allocate memory to check for 0xFFs"); return 0; diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index f2741c04289d..35158cfd9c1a 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -178,10 +178,9 @@ static int electra_cf_probe(struct platform_device *ofdev) struct device_node *np = ofdev->dev.of_node; struct electra_cf_socket *cf; struct resource mem, io; - int status; + int status = -ENOMEM; const unsigned int *prop; int err; - struct vm_struct *area; err = of_address_to_resource(np, 0, &mem); if (err) @@ -202,30 +201,19 @@ static int electra_cf_probe(struct platform_device *ofdev) cf->mem_phys = mem.start; cf->mem_size = PAGE_ALIGN(resource_size(&mem)); cf->mem_base = ioremap(cf->mem_phys, cf->mem_size); + if (!cf->mem_base) + goto out_free_cf; cf->io_size = PAGE_ALIGN(resource_size(&io)); - - area = __get_vm_area(cf->io_size, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) { - status = -ENOMEM; - goto fail1; - } - - cf->io_virt = (void __iomem *)(area->addr); + cf->io_virt = ioremap_phb(io.start, cf->io_size); + if (!cf->io_virt) + goto out_unmap_mem; cf->gpio_base = ioremap(0xfc103000, 0x1000); + if (!cf->gpio_base) + goto out_unmap_virt; dev_set_drvdata(device, cf); - if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || - (__ioremap_at(io.start, cf->io_virt, cf->io_size, - pgprot_noncached(PAGE_KERNEL)) == NULL)) { - dev_err(device, "can't ioremap ranges\n"); - status = -ENOMEM; - goto fail1; - } - - cf->io_base = (unsigned long)cf->io_virt - VMALLOC_END; - cf->iomem.start = (unsigned long)cf->mem_base; cf->iomem.end = (unsigned long)cf->mem_base + (mem.end - mem.start); cf->iomem.flags = IORESOURCE_MEM; @@ -305,14 +293,13 @@ fail1: if (cf->irq) free_irq(cf->irq, cf); - if (cf->io_virt) - __iounmap_at(cf->io_virt, cf->io_size); - if (cf->mem_base) - iounmap(cf->mem_base); - if (cf->gpio_base) - iounmap(cf->gpio_base); - if (area) - device_init_wakeup(&ofdev->dev, 0); + iounmap(cf->gpio_base); +out_unmap_virt: + device_init_wakeup(&ofdev->dev, 0); + iounmap(cf->io_virt); +out_unmap_mem: + iounmap(cf->mem_base); +out_free_cf: kfree(cf); return status; @@ -330,7 +317,7 @@ static int electra_cf_remove(struct platform_device *ofdev) free_irq(cf->irq, cf); del_timer_sync(&cf->timer); - __iounmap_at(cf->io_virt, cf->io_size); + iounmap(cf->io_virt); iounmap(cf->mem_base); iounmap(cf->gpio_base); release_mem_region(cf->mem_phys, cf->mem_size); diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index f45c22b09726..8be27426aa66 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -136,8 +136,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, while (bufsize >= SECTOR_SIZE) { buf = __vmalloc(bufsize, - GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY, - PAGE_KERNEL); + GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY); if (buf) { *buflen = bufsize; return buf; diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c index 473b465724f1..0755b11348ed 100644 --- a/drivers/staging/android/ion/ion_heap.c +++ b/drivers/staging/android/ion/ion_heap.c @@ -99,12 +99,12 @@ int ion_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer, static int ion_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot) { - void *addr = vm_map_ram(pages, num, -1, pgprot); + void *addr = vmap(pages, num, VM_MAP, pgprot); if (!addr) return -ENOMEM; memset(addr, 0, PAGE_SIZE * num); - vm_unmap_ram(addr, num); + vunmap(addr); return 0; } diff --git a/drivers/staging/media/ipu3/ipu3-css-pool.h b/drivers/staging/media/ipu3/ipu3-css-pool.h index f4a60b41401b..a8ccd4f70320 100644 --- a/drivers/staging/media/ipu3/ipu3-css-pool.h +++ b/drivers/staging/media/ipu3/ipu3-css-pool.h @@ -15,14 +15,12 @@ struct imgu_device; * @size: size of the buffer in bytes. * @vaddr: kernel virtual address. * @daddr: iova dma address to access IPU3. - * @vma: private, a pointer to &struct vm_struct, - * used for imgu_dmamap_free. */ struct imgu_css_map { size_t size; void *vaddr; dma_addr_t daddr; - struct vm_struct *vma; + struct page **pages; }; /** diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c index 7431322379f6..8a19b0024152 100644 --- a/drivers/staging/media/ipu3/ipu3-dmamap.c +++ b/drivers/staging/media/ipu3/ipu3-dmamap.c @@ -96,6 +96,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, unsigned long shift = iova_shift(&imgu->iova_domain); struct device *dev = &imgu->pci_dev->dev; size_t size = PAGE_ALIGN(len); + int count = size >> PAGE_SHIFT; struct page **pages; dma_addr_t iovaddr; struct iova *iova; @@ -114,7 +115,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, /* Call IOMMU driver to setup pgt */ iovaddr = iova_dma_addr(&imgu->iova_domain, iova); - for (i = 0; i < size / PAGE_SIZE; ++i) { + for (i = 0; i < count; ++i) { rval = imgu_mmu_map(imgu->mmu, iovaddr, page_to_phys(pages[i]), PAGE_SIZE); if (rval) @@ -123,33 +124,23 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, iovaddr += PAGE_SIZE; } - /* Now grab a virtual region */ - map->vma = __get_vm_area(size, VM_USERMAP, VMALLOC_START, VMALLOC_END); - if (!map->vma) + map->vaddr = vmap(pages, count, VM_USERMAP, PAGE_KERNEL); + if (!map->vaddr) goto out_unmap; - map->vma->pages = pages; - /* And map it in KVA */ - if (map_vm_area(map->vma, PAGE_KERNEL, pages)) - goto out_vunmap; - + map->pages = pages; map->size = size; map->daddr = iova_dma_addr(&imgu->iova_domain, iova); - map->vaddr = map->vma->addr; dev_dbg(dev, "%s: allocated %zu @ IOVA %pad @ VA %p\n", __func__, - size, &map->daddr, map->vma->addr); + size, &map->daddr, map->vaddr); - return map->vma->addr; - -out_vunmap: - vunmap(map->vma->addr); + return map->vaddr; out_unmap: imgu_dmamap_free_buffer(pages, size); imgu_mmu_unmap(imgu->mmu, iova_dma_addr(&imgu->iova_domain, iova), i * PAGE_SIZE); - map->vma = NULL; out_free_iova: __free_iova(&imgu->iova_domain, iova); @@ -177,8 +168,6 @@ void imgu_dmamap_unmap(struct imgu_device *imgu, struct imgu_css_map *map) */ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map) { - struct vm_struct *area = map->vma; - dev_dbg(&imgu->pci_dev->dev, "%s: freeing %zu @ IOVA %pad @ VA %p\n", __func__, map->size, &map->daddr, map->vaddr); @@ -187,11 +176,8 @@ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map) imgu_dmamap_unmap(imgu, map); - if (WARN_ON(!area) || WARN_ON(!area->pages)) - return; - - imgu_dmamap_free_buffer(area->pages, map->size); vunmap(map->vaddr); + imgu_dmamap_free_buffer(map->pages, map->size); map->vaddr = NULL; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 93672c3f1c78..f05e2f2c898d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void blkdev_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); + mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, @@ -2085,7 +2084,7 @@ static int blkdev_writepages(struct address_space *mapping, static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, + .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d10c7be10f3b..7278789ff8a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -980,9 +980,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, "page private not zero on page %llu", (unsigned long long)page_offset(page)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + detach_page_private(page); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 39e45b8a5031..e12eb32d9e17 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3076,22 +3076,16 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long)eb); - } else { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else WARN_ON(page->private != (unsigned long)eb); - } } void set_page_extent_mapped(struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); } static struct extent_map * @@ -4367,51 +4361,32 @@ int extent_writepages(struct address_space *mapping, return ret; } -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages) +void extent_readahead(struct readahead_control *rac) { struct bio *bio = NULL; unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - int nr = 0; u64 prev_em_start = (u64)-1; + int nr; - while (!list_empty(pages)) { - u64 contig_end = 0; + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = page_offset(pagepool[0]); + u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = lru_to_page(pages); + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - break; - } - - pagepool[nr++] = page; - contig_end = page_offset(page) + PAGE_SIZE - 1; - } - - if (nr) { - u64 contig_start = page_offset(pagepool[0]); - - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - - contiguous_readpages(pagepool, nr, contig_start, - contig_end, &em_cached, &bio, &bio_flags, - &prev_em_start); - } + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio, &bio_flags, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + if (bio) { + if (submit_one_bio(bio, 0, bio_flags)) + return; + } } /* @@ -4929,10 +4904,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) * We need to make sure we haven't be attached * to a new eb. */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - put_page(page); + detach_page_private(page); } if (mapped) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..25594e09fdcd 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -198,8 +198,7 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages); +void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 320d1062068d..8b3489f229c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4856,8 +4856,8 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. - * We can have ongoing bios started by readpages (called from readahead) - * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before @@ -7050,11 +7050,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent - * call to readpages() (a buffered read or a defrag call + * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not - * complete), which makes readpages() wait for that + * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ @@ -8293,21 +8293,16 @@ static int btrfs_writepages(struct address_space *mapping, return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@ -8329,14 +8324,8 @@ static int btrfs_migratepage(struct address_space *mapping, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@ -8458,11 +8447,7 @@ again: } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@ -10553,7 +10538,7 @@ static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, diff --git a/fs/buffer.c b/fs/buffer.c index a60f60396cfa..64fe82ec65ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -123,14 +123,6 @@ void __wait_on_buffer(struct buffer_head * bh) } EXPORT_SYMBOL(__wait_on_buffer); -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} - static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) @@ -906,7 +898,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -1154,12 +1146,19 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { + struct super_block *sb; + set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_page && bh->b_page->mapping) mapping_set_error(bh->b_page->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); + rcu_read_lock(); + sb = READ_ONCE(bh->b_bdev->bd_super); + if (sb) + errseq_set(&sb->s_wb_err, -EIO); + rcu_read_unlock(); } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1580,7 +1579,7 @@ void create_empty_buffers(struct page *page, bh = bh->b_this_page; } while (bh != head); } - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -2567,7 +2566,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) bh->b_this_page = head; bh = bh->b_this_page; } while (bh != head); - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } @@ -3227,7 +3226,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = next; } while (bh != head); *buffers_to_free = head; - __clear_page_buffers(page); + detach_page_private(page); return 1; failed: return 0; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fc3a8d8064f8..d0542151e8c4 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -280,47 +280,36 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) return 0; } -static int erofs_raw_access_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static void erofs_raw_access_readahead(struct readahead_control *rac) { erofs_off_t last_block; struct bio *bio = NULL; - gfp_t gfp = readahead_gfp_mask(mapping); - struct page *page = list_last_entry(pages, struct page, lru); + struct page *page; - trace_erofs_readpages(mapping->host, page, nr_pages, true); - - for (; nr_pages; --nr_pages) { - page = list_entry(pages->prev, struct page, lru); + trace_erofs_readpages(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { - bio = erofs_read_raw_page(bio, mapping, page, - &last_block, nr_pages, true); + bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, + readahead_count(rac), true); - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(mapping->host)->nid); + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_I(rac->mapping->host)->nid); - bio = NULL; - } + bio = NULL; } - /* pages could still be locked */ put_page(page); } - DBG_BUGON(!list_empty(pages)); /* the rare case (end in gaps) */ if (bio) submit_bio(bio); - return 0; } static int erofs_get_block(struct inode *inode, sector_t iblock, @@ -358,7 +347,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_raw_access_aops = { .readpage = erofs_raw_access_readpage, - .readpages = erofs_raw_access_readpages, + .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5d2d81940679..7628816f2453 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, i = 0; while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + dst = vm_map_ram(rq->out, nrpages_out, -1); /* retry two more times (totally 3 times) */ if (dst || ++i >= 3) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c4b6c9aa87ec..187f93b4900e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1305,28 +1305,23 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->max_sync_decompress_pages; } -static int z_erofs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void z_erofs_readahead(struct readahead_control *rac) { - struct inode *const inode = mapping->host; + struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, nr_pages); + bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - struct page *head = NULL; + struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(mapping->host, lru_to_page(pages), - nr_pages, false); + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), false); - f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; - - for (; nr_pages; --nr_pages) { - struct page *page = lru_to_page(pages); + f.headoffset = readahead_pos(rac); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); /* * A pure asynchronous readahead is indicated if @@ -1335,11 +1330,6 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, */ sync &= !(PageReadahead(page) && !head); - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { - list_add(&page->lru, &pagepool); - continue; - } - set_page_private(page, (unsigned long)head); head = page; } @@ -1368,11 +1358,10 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, /* clean up the remaining free pages */ put_pages_list(&pagepool); - return 0; } const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, - .readpages = z_erofs_readpages, + .readahead = z_erofs_readahead, }; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 06887492f54b..785ead346543 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page) return mpage_readpage(page, exfat_get_block); } -static int exfat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void exfat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); + mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) @@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations exfat_aops = { .readpage = exfat_readpage, - .readpages = exfat_readpages, + .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..2875c0a705b5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -877,11 +877,9 @@ static int ext2_readpage(struct file *file, struct page *page) return mpage_readpage(page, ext2_get_block); } -static int -ext2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext2_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); + mpage_readahead(rac, ext2_get_block); } static int @@ -967,7 +965,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, @@ -981,7 +979,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, .write_begin = ext2_nobh_write_begin, .write_end = nobh_write_end, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3147bb0cf82a..15b062efcff1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3317,9 +3317,8 @@ static inline void ext4_set_de_type(struct super_block *sb, } /* readpages.c */ -extern int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..52be85f96159 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3224,23 +3224,20 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1, - false); + return ext4_mpage_readpages(inode, NULL, page); return ret; } -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext4_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; - /* If the file has inline data, no need to do readpages. */ + /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) - return 0; + return; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); + ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -3605,7 +3602,7 @@ static int ext4_set_page_dirty(struct page *page) static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3622,7 +3619,7 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3638,7 +3635,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c1769afbf799..5761e9961682 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -7,8 +7,8 @@ * * This was originally taken from fs/mpage.c * - * The intent is the ext4_mpage_readpages() function here is intended - * to replace mpage_readpages() in the general case, not just for + * The ext4_mpage_readpages() function here is intended to + * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. @@ -221,14 +221,12 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) return i_size_read(inode); } -int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -241,6 +239,7 @@ int ext4_mpage_readpages(struct address_space *mapping, int length; unsigned relative_block = 0; struct ext4_map_blocks map; + unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; @@ -251,14 +250,9 @@ int ext4_mpage_readpages(struct address_space *mapping, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (pages) { - page = lru_to_page(pages); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) - goto next_page; } if (page_has_buffers(page)) @@ -381,7 +375,7 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, REQ_OP_READ, - is_readahead ? REQ_RAHEAD : 0); + rac ? REQ_RAHEAD : 0); } length = first_hole << blkbits; @@ -406,10 +400,9 @@ int ext4_mpage_readpages(struct address_space *mapping, else unlock_page(page); next_page: - if (pages) + if (rac) put_page(page); } - BUG_ON(pages && !list_empty(pages)); if (bio) submit_bio(bio); return 0; diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..dec1244dd062 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void ext4_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - ext4_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..03ec97f28235 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2177,13 +2177,11 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. */ -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { @@ -2197,6 +2195,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, .nr_cpages = 0, }; #endif + unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2210,15 +2209,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, map.m_may_create = false; for (; nr_pages; nr_pages--) { - if (pages) { - page = list_last_entry(pages, struct page, lru); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page_index(page), - readahead_gfp_mask(mapping))) - goto next_page; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2228,7 +2221,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2251,7 +2244,7 @@ read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, - &bio, &last_block_in_bio, is_readahead); + &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: @@ -2260,8 +2253,10 @@ set_error_page: zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } +#ifdef CONFIG_F2FS_FS_COMPRESSION next_page: - if (pages) +#endif + if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2271,16 +2266,15 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); } } #endif } - BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - return pages ? 0 : ret; + return ret; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -2299,28 +2293,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), - NULL, page, 1, false); + ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void f2fs_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; - struct page *page = list_last_entry(pages, struct page, lru); + struct inode *inode = rac->mapping->host; - trace_f2fs_readpages(inode, page, nr_pages); + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) - return 0; + return; /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) - return 0; + return; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); + f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) @@ -3805,7 +3795,7 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, + .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 157eec348970..5c0149d2f46a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3051,19 +3051,12 @@ static inline void f2fs_set_page_private(struct page *page, if (PagePrivate(page)) return; - get_page(page); - SetPagePrivate(page); - set_page_private(page, data); + attach_page_private(page, (void *)data); } static inline void f2fs_clear_page_private(struct page *page) { - if (!PagePrivate(page)) - return; - - set_page_private(page, 0); - ClearPagePrivate(page); - f2fs_put_page(page, 0); + detach_page_private(page); } /* @@ -3373,9 +3366,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..865c9fb774fb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 71946da84388..e6e68b2274a5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page) return mpage_readpage(page, fat_get_block); } -static int fat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, fat_get_block); + mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) @@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations fat_aops = { .readpage = fat_readpage, - .readpages = fat_readpages, + .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..676e620948d2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..c5bdf46e3b4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..bac51c32d660 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -915,84 +915,40 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) fuse_readpages_end(fc, &ap->args, err); } -struct fuse_fill_data { - struct fuse_io_args *ia; - struct file *file; - struct inode *inode; - unsigned int nr_pages; - unsigned int max_pages; -}; - -static int fuse_readpages_fill(void *_data, struct page *page) +static void fuse_readahead(struct readahead_control *rac) { - struct fuse_fill_data *data = _data; - struct fuse_io_args *ia = data->ia; - struct fuse_args_pages *ap = &ia->ap; - struct inode *inode = data->inode; + struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; - fuse_wait_on_page_writeback(inode, page->index); - - if (ap->num_pages && - (ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || - ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { - data->max_pages = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(ia, data->file); - data->ia = ia = fuse_io_alloc(NULL, data->max_pages); - if (!ia) { - unlock_page(page); - return -ENOMEM; - } - ap = &ia->ap; - } - - if (WARN_ON(ap->num_pages >= data->max_pages)) { - unlock_page(page); - fuse_io_free(ia); - return -EIO; - } - - get_page(page); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->num_pages++; - data->nr_pages--; - return 0; -} - -static int fuse_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; - int err; - - err = -EIO; if (is_bad_inode(inode)) - goto out; + return; - data.file = file; - data.inode = inode; - data.nr_pages = nr_pages; - data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); -; - data.ia = fuse_io_alloc(NULL, data.max_pages); - err = -ENOMEM; - if (!data.ia) - goto out; + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.ia->ap.num_pages) - fuse_send_readpages(data.ia, file); - else - fuse_io_free(data.ia); + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; + + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); } -out: - return err; } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -3373,10 +3329,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_page = fuse_launder_page, - .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 786c1ce8f030..72c9560f4467 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, } /** - * gfs2_readpages - Read a bunch of pages at once + * gfs2_readahead - Read a bunch of pages at once * @file: The file to read from * @mapping: Address space info * @pages: List of pages to read @@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. */ -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void gfs2_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) + if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + mpage_readahead(rac, gfs2_block_map); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(gfs2_withdrawn(sdp))) - ret = -EIO; - return ret; } /** @@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = { .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepage = gfs2_jdata_writepage, .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8259fef3f986..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..2de0d3492d15 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -125,10 +125,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void hpfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); + mpage_readahead(rac, hpfs_get_block); } static int hpfs_writepages(struct address_space *mapping, @@ -198,7 +197,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, - .readpages = hpfs_readpages, + .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 89e21961d1ad..a1ed7620fbac 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -59,24 +59,19 @@ iomap_page_create(struct inode *inode, struct page *page) * migrate_page_move_mapping() assumes that pages with private data have * their count elevated by 1. */ - get_page(page); - set_page_private(page, (unsigned long)iop); - SetPagePrivate(page); + attach_page_private(page, iop); return iop; } static void iomap_page_release(struct page *page) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = detach_page_private(page); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_count)); WARN_ON_ONCE(atomic_read(&iop->write_count)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); kfree(iop); } @@ -214,9 +209,8 @@ iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct page *cur_page; bool cur_page_in_bio; - bool is_readahead; struct bio *bio; - struct list_head *pages; + struct readahead_control *rac; }; static void @@ -308,7 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->bio) submit_bio(ctx->bio); - if (ctx->is_readahead) /* same as readahead_gfp_mask */ + if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); /* @@ -319,7 +313,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio) ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) + if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); @@ -367,7 +361,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readpages and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page we always * return 0 and just mark the page as PageError on errors. This * should be cleaned up all through the stack eventually. */ @@ -375,36 +369,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. - */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, +iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; @@ -418,10 +384,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = NULL; } if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; + ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, @@ -431,32 +394,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, return done; } -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) +/** + * iomap_readahead - Attempt to read pages from a file. + * @rac: Describes the pages to be read. + * @ops: The operations vector for the filesystem. + * + * This function is for filesystems to call to implement their readahead + * address_space operation. + * + * Context: The @ops callbacks may submit I/O (eg to read the addresses of + * blocks from disc), and may wait for it. The caller may be trying to + * access a different page, and so sleeping excessively should be avoided. + * It may allocate memory, but should avoid costly allocations. This + * function is called with memalloc_nofs set, so allocations will not cause + * the filesystem to be reentered. + */ +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { + struct inode *inode = rac->mapping->host; + loff_t pos = readahead_pos(rac); + loff_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, + .rac = rac, }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - trace_iomap_readpages(mapping->host, nr_pages); + trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); + loff_t ret = iomap_apply(inode, pos, length, 0, ops, + &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); - goto done; + break; } pos += ret; length -= ret; } - ret = 0; -done: + if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_page) { @@ -464,15 +438,8 @@ done: unlock_page(ctx.cur_page); put_page(ctx.cur_page); } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. - */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; } -EXPORT_SYMBOL_GPL(iomap_readpages); +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a page are @@ -554,14 +521,8 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4df19c66f597..5693a39d52fb 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); -DEFINE_READPAGE_EVENT(iomap_readpages); +DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 62c0462dc89f..95b1f377ad09 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1185,10 +1185,9 @@ static int isofs_readpage(struct file *file, struct page *page) return mpage_readpage(page, isofs_get_block); } -static int isofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void isofs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); + mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1198,7 +1197,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, - .readpages = isofs_readpages, + .readahead = isofs_readahead, .bmap = _isofs_bmap }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9486afcdac76..6f65bfa9f18d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, jfs_get_block); } -static int jfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void jfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); + mpage_readahead(rac, jfs_get_block); } static void jfs_write_failed(struct address_space *mapping, loff_t to) @@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations jfs_aops = { .readpage = jfs_readpage, - .readpages = jfs_readpages, + .readahead = jfs_readahead, .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, diff --git a/fs/mpage.c b/fs/mpage.c index ccba3c4c4479..830e6cc2a9e7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -91,7 +91,7 @@ mpage_alloc(struct block_device *bdev, } /* - * support function for mpage_readpages. The fs supplied get_block might + * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows readpage to avoid triggering a duplicate call * to get_block. @@ -338,13 +338,8 @@ confused: } /** - * mpage_readpages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * The page at @pages->prev has the lowest file offset, and reads should be - * issued in @pages->prev to @pages->next order. - * @nr_pages: The number of pages at *@pages + * mpage_readahead - start reads against pages + * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and @@ -381,36 +376,25 @@ confused: * * This all causes the disk requests to be issued in the correct order. */ -int -mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block) +void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { + struct page *page; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - unsigned page_idx; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - args.page = page; - args.nr_pages = nr_pages - page_idx; - args.bio = do_mpage_readpage(&args); - } + args.page = page; + args.nr_pages = readahead_count(rac); + args.bio = do_mpage_readpage(&args); put_page(page); } - BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); - return 0; } -EXPORT_SYMBOL(mpage_readpages); +EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all @@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also - * using mpage_readpages then this can rarely happen. + * using mpage_readahead then this can rarely happen. */ goto confused; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1f32a9fbfdaf..6673a77884d9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -668,7 +668,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) @@ -676,8 +677,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + /* This page is really still in write-back - just that the + * writeback is happening on the server now. + */ + inc_node_page_state(page, NR_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1e767f779c49..639c34fec04a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,9 +946,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_node_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_WRITEBACK); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + WB_WRITEBACK); } /* Called holding the request lock on @req */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0aa02eb18bd3..c3fbab1753ec 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* - * We want less throttling in balance_dirty_pages() - * and shrink_inactive_list() so that nfs to + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to * localhost doesn't cause nfsd to lock up due to all * the client's dirty pages or its congested queue. */ - current->flags |= PF_LESS_THROTTLE; + current->flags |= PF_LOCAL_THROTTLE; exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1037,7 +1038,7 @@ out_nfserr: nfserr = nfserrno(host_err); } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - current_restore_flags(pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..ceeb3b441844 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -145,18 +145,9 @@ static int nilfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, nilfs_get_block); } -/** - * nilfs_readpages() - implement readpages() method of nilfs_aops {} - * address_space_operations. - * @file - file struct of the file to be read - * @mapping - address_space struct used for reading multiple pages - * @pages - the pages to be read - * @nr_pages - number of pages to be read - */ -static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void nilfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); + mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, @@ -308,7 +299,7 @@ const struct address_space_operations nilfs_aops = { .readpage = nilfs_readpage, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, - .readpages = nilfs_readpages, + .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 554b744f41bf..bb0a43860ad2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1732,7 +1732,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } else buffers_to_free = bh; } diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3aac5c917afe..fbb9f1bc623d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -504,7 +504,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } bh = head = page_buffers(page); BUG_ON(!bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3a67a6518ddf..3bfb4147895a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -350,14 +350,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). */ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = lru_to_page(pages); - start = (loff_t)last->index << PAGE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has @@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, + .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 55a6512e9fde..f105746063ed 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2760,6 +2760,7 @@ leave: * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) + __must_hold(&dlm->spinlock) { int ret; int lock_dropped = 0; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9150cfa4df7d..ee5d98516212 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -279,6 +279,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ + OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,7 +674,8 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); + return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) + || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 8caeceeaeda7..4da0e4b1e79b 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -254,14 +254,16 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid) { + if (!si->si_slots[preferred].sl_valid || + !si->si_slots[preferred].sl_node_num) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid) { + if (!si->si_slots[i].sl_valid || + !si->si_slots[i].sl_node_num) { ret = i; break; } @@ -456,24 +458,30 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (ocfs2_mount_local(osb)) + /* use slot 0 directly in local mode */ + slot = 0; + else { + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " - "allocated to this node!\n", slot, osb->dev_str); + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " + "already allocated to this node!\n", + slot, osb->dev_str); + } ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ac61eeaf3837..71ea9ce71a6b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -175,6 +175,7 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, + Opt_nocluster, Opt_err, }; @@ -208,6 +209,7 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, + {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -619,6 +621,13 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + tmp = OCFS2_MOUNT_NOCLUSTER; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); + goto out; + } + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -859,6 +868,7 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && + !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1139,6 +1149,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); + if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && + !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) + printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " + "without cluster aware mode.\n", osb->dev_str); + atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1445,6 +1460,9 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; + case Opt_nocluster: + mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1556,6 +1574,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); + if (opts & OCFS2_MOUNT_NOCLUSTER) + seq_printf(s, ",nocluster"); + return 0; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d640b9388238..d7b5f09d298c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, omfs_get_block); } -static int omfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void omfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); + mpage_readahead(rac, omfs_get_block); } static int omfs_writepage(struct page *page, struct writeback_control *wbc) @@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = { const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, - .readpages = omfs_readpages, + .readahead = omfs_readahead, .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, diff --git a/fs/open.c b/fs/open.c index e62b1db06638..6cd48a61cda3 100644 --- a/fs/open.c +++ b/fs/open.c @@ -775,9 +775,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 12ae630fbed7..48f0547d4850 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -62,12 +62,7 @@ static int orangefs_writepage_locked(struct page *page, } else { ret = 0; } - if (wr) { - kfree(wr); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); return ret; } @@ -409,9 +404,7 @@ static int orangefs_write_begin(struct file *file, wr->len = len; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: return 0; } @@ -459,18 +452,12 @@ static void orangefs_invalidatepage(struct page *page, wr = (struct orangefs_write_range *)page_private(page); if (offset == 0 && length == PAGE_SIZE) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); /* XXX is this right? only caller in fs */ cancel_dirty_page(page); return; @@ -535,12 +522,7 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) static void orangefs_freepage(struct page *page) { - if (PagePrivate(page)) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); } static int orangefs_launder_page(struct page *page) @@ -740,9 +722,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) wr->len = PAGE_SIZE; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: file_update_time(vmf->vma->vm_file); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 09cd51c8d23d..ecc63ce01be7 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -110,8 +110,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 10a6d472397f..6ad407d5efe2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -546,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); - struct page *page; + struct page *page = NULL; - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; if (PageAnon(page)) @@ -578,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - if (pmd_present(*pmd)) - smaps_pmd_entry(pmd, addr, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 345db56c98fd..755293c8c71a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page) return mpage_readpage(page, qnx6_get_block); } -static int qnx6_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void qnx6_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); + mpage_readahead(rac, qnx6_get_block); } /* @@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations qnx6_aops = { .readpage = qnx6_readpage, - .readpages = qnx6_readpages, + .readahead = qnx6_readahead, .bmap = qnx6_bmap }; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6419e6dacc39..0031070b3692 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1160,11 +1160,9 @@ failure: return retval; } -static int -reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void reiserfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + mpage_readahead(rac, reiserfs_get_block); } /* @@ -3434,7 +3432,7 @@ out: const struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, + .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, .invalidatepage = reiserfs_invalidatepage, .write_begin = reiserfs_write_begin, diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4f9b9fb59362..64f61330564a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -13,6 +13,7 @@ * datablocks and metadata blocks. */ +#include #include #include #include @@ -27,44 +28,103 @@ #include "page_actor.h" /* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. + * Returns the amount of bytes copied to the page actor. */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static int copy_bio_to_actor(struct bio *bio, + struct squashfs_page_actor *actor, + int offset, int req_length) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; + void *actor_addr = squashfs_first_page(actor); + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int copied_bytes = 0; + int actor_offset = 0; - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) + return 0; - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; + while (copied_bytes < req_length) { + int bytes_to_copy = min_t(int, bvec->bv_len - offset, + PAGE_SIZE - actor_offset); - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; + bytes_to_copy = min_t(int, bytes_to_copy, + req_length - copied_bytes); + memcpy(actor_addr + actor_offset, + page_address(bvec->bv_page) + bvec->bv_offset + offset, + bytes_to_copy); + + actor_offset += bytes_to_copy; + copied_bytes += bytes_to_copy; + offset += bytes_to_copy; + + if (actor_offset >= PAGE_SIZE) { + actor_addr = squashfs_next_page(actor); + if (!actor_addr) + break; + actor_offset = 0; + } + if (offset >= bvec->bv_len) { + if (!bio_next_segment(bio, &iter_all)) + break; + offset = 0; } } - - return bh; + squashfs_finish_page(actor); + return copied_bytes; } +static int squashfs_bio_read(struct super_block *sb, u64 index, int length, + struct bio **biop, int *block_offset) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + const u64 read_start = round_down(index, msblk->devblksize); + const sector_t block = read_start >> msblk->devblksize_log2; + const u64 read_end = round_up(index + length, msblk->devblksize); + const sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(index, PAGE_SIZE); + int total_len = (block_end - block) << msblk->devblksize_log2; + const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); + int error, i; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, page_count); + if (!bio) + return -ENOMEM; + + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = READ; + bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); + + for (i = 0; i < page_count; ++i) { + unsigned int len = + min_t(unsigned int, PAGE_SIZE - offset, total_len); + struct page *page = alloc_page(GFP_NOIO); + + if (!page) { + error = -ENOMEM; + goto out_free_bio; + } + if (!bio_add_page(bio, page, len, offset)) { + error = -EIO; + goto out_free_bio; + } + offset = 0; + total_len -= len; + } + + error = submit_bio_wait(bio); + if (error) + goto out_free_bio; + + *biop = bio; + *block_offset = index & ((1 << msblk->devblksize_log2) - 1); + return 0; + +out_free_bio: + bio_free_pages(bio); + bio_put(bio); + return error; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -76,129 +136,88 @@ static struct buffer_head *get_block_length(struct super_block *sb, * algorithms). */ int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; - - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) - return -ENOMEM; + struct bio *bio = NULL; + int compressed; + int res; + int offset; if (length) { /* * Datablock. */ - bytes = -offset; compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; + const u8 *data; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; + if (index + 2 > msblk->bytes_used) { + res = -EIO; + goto out; + } + res = squashfs_bio_read(sb, index, 2, &bio, &offset); + if (res) + goto out; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + /* Extract the length of the metadata block */ + data = page_address(bvec->bv_page) + bvec->bv_offset; + length = data[offset]; + if (offset <= bvec->bv_len - 1) { + length |= data[offset + 1] << 8; + } else { + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + data = page_address(bvec->bv_page) + bvec->bv_offset; + length |= data[0] << 8; + } + bio_free_pages(bio); + bio_put(bio); - bytes = msblk->devblksize - offset; compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; + index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? "" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); + compressed ? "" : "un", length); } + if (next_index) + *next_index = index + length; - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - } + res = squashfs_bio_read(sb, index, length, &bio, &offset); + if (res) + goto out; if (compressed) { - if (!msblk->stream) - goto read_failure; - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); + if (!msblk->stream) { + res = -EIO; + goto out_free_bio; } - squashfs_finish_page(output); + res = squashfs_decompress(msblk, bio, offset, length, output); + } else { + res = copy_bio_to_actor(bio, output, offset, length); } - kfree(bh); - return length; +out_free_bio: + bio_free_pages(bio); + bio_put(bio); +out: + if (res < 0) + ERROR("Failed to read block 0x%llx: %d\n", index, res); -block_release: - for (; k < b; k++) - put_bh(bh[k]); - -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; + return res; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index ec8617523e56..1b9ccfd0aa51 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -10,13 +10,14 @@ * decompressor.h */ +#include + struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); int (*decompress)(struct squashfs_sb_info *, void *, - struct buffer_head **, int, int, int, - struct squashfs_page_actor *); + struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; int supported; diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index c181dee235bb..db9f12a3ea05 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -180,14 +180,15 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, - bh, b, offset, length, output); + bio, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index e206ebfe003c..b881b9283b7f 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -75,8 +75,8 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream *stream; int res; @@ -84,8 +84,8 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, local_lock(&msblk->stream->lock); stream = this_cpu_ptr(msblk->stream); - res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, - offset, length, output); + res = msblk->decompressor->decompress(msblk, stream->stream, bio, + offset, length, output); local_unlock(&msblk->stream->lock); diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 550c3e592032..4eb3d083d45e 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -59,14 +59,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); - res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); mutex_unlock(&stream->mutex); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c4e47e0588c7..233d5582fbee 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -4,7 +4,7 @@ * Phillip Lougher */ -#include +#include #include #include #include @@ -89,20 +89,23 @@ static void lz4_free(void *strm) static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lz4 *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = LZ4_decompress_safe(stream->input, stream->output, diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index aa3c3dafc33d..97bb7d92ddcd 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include @@ -63,21 +63,24 @@ static void lzo_free(void *strm) static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; size_t out_len = output->length; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = lzo1x_decompress_safe(stream->input, (size_t)length, diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2797763ed046..9783e01c8100 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -40,8 +40,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, - int, int, int, struct squashfs_page_actor *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, + int, int, struct squashfs_page_actor *); extern int squashfs_max_decompressors(void); /* export.c */ diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 4b2f2051a6dc..e80419aed862 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -10,7 +10,7 @@ #include -#include +#include #include #include #include @@ -117,11 +117,12 @@ static void squashfs_xz_free(void *strm) static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - enum xz_ret xz_err; - int avail, total = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int total = 0, error = 0; struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); @@ -131,11 +132,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); - do { - if (stream->buf.in_pos == stream->buf.in_size && k < b) { - avail = min(length, msblk->devblksize - offset); + for (;;) { + enum xz_ret xz_err; + + if (stream->buf.in_pos == stream->buf.in_size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* XZ_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->buf.in = bh[k]->b_data + offset; + stream->buf.in = data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; @@ -150,23 +163,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } xz_err = xz_dec_run(stream->state, &stream->buf); - - if (stream->buf.in_pos == stream->buf.in_size && k < b) - put_bh(bh[k++]); - } while (xz_err == XZ_OK); + if (xz_err == XZ_STREAM_END) + break; + if (xz_err != XZ_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (xz_err != XZ_STREAM_END || k < b) - goto out; - - return total + stream->buf.out_pos; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? error : total + stream->buf.out_pos; } const struct squashfs_decompressor squashfs_xz_comp_ops = { diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index f2226afa1625..bcb881ec47f2 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -10,7 +10,7 @@ #include -#include +#include #include #include #include @@ -50,21 +50,35 @@ static void zlib_free(void *strm) static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; - do { - if (stream->avail_in == 0 && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + int zlib_err; + + if (stream->avail_in == 0) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* Z_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->next_in = bh[k]->b_data + offset; + stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } @@ -78,37 +92,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); - - if (stream->avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); + if (zlib_err == Z_STREAM_END) + break; + if (zlib_err != Z_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (zlib_err != Z_STREAM_END) - goto out; + if (!error) + if (zlib_inflateEnd(stream) != Z_OK) + error = -EIO; - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) - goto out; - - if (k < b) - goto out; - - return stream->total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b448c2a1d0ed..b7cb1faa652d 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include @@ -59,33 +59,44 @@ static void zstd_free(void *strm) static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct workspace *wksp = strm; ZSTD_DStream *stream; size_t total_out = 0; - size_t zstd_err; - int k = 0; + int error = 0; ZSTD_inBuffer in_buf = { NULL, 0, 0 }; ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); if (!stream) { ERROR("Failed to initialize zstd decompressor\n"); - goto out; + return -EIO; } out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); - do { - if (in_buf.pos == in_buf.size && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + size_t zstd_err; + if (in_buf.pos == in_buf.size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - in_buf.src = bh[k]->b_data + offset; + in_buf.src = data + offset; in_buf.size = avail; in_buf.pos = 0; offset = 0; @@ -97,8 +108,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, /* Shouldn't run out of pages * before stream is done. */ - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } out_buf.pos = 0; out_buf.size = PAGE_SIZE; @@ -107,29 +118,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, total_out -= out_buf.pos; zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); total_out += out_buf.pos; /* add the additional data produced */ + if (zstd_err == 0) + break; - if (in_buf.pos == in_buf.size && k < b) - put_bh(bh[k++]); - } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (ZSTD_isError(zstd_err)) { - ERROR("zstd decompression error: %d\n", - (int)ZSTD_getErrorCode(zstd_err)); - goto out; - } - - if (k < b) - goto out; - - return (int)total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? error : total_out; } const struct squashfs_decompressor squashfs_zstd_comp_ops = { diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5a..c6f6f5be5682 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? ret : ret2; } /** diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e875bc5668ee..adaba8e8b326 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page) return mpage_readpage(page, udf_get_block); } -static int udf_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void udf_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, udf_get_block); + mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, - .readpages = udf_readpages, + .readahead = udf_readahead, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d9cebf18726..1fd4fb7a607c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -621,14 +621,11 @@ xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9ec3eaf1c618..65538d18e64f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3ce9829a6936..dba874a61fc5 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -78,10 +78,9 @@ static int zonefs_readpage(struct file *unused, struct page *page) return iomap_readpage(page, &zonefs_iomap_ops); } -static int zonefs_readpages(struct file *unused, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void zonefs_readahead(struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_iomap_ops); } /* @@ -128,7 +127,7 @@ static int zonefs_writepages(struct address_space *mapping, static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, - .readpages = zonefs_readpages, + .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index 4c74b1c1d13b..58046ddc08d0 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -17,8 +17,9 @@ ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? \ NULL : pud_offset(p4d, address)) -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_offset(pgd, start) (pgd) +#define p4d_alloc(mm, pgd, address) (pgd) +#define p4d_alloc_track(mm, pgd, address, mask) (pgd) +#define p4d_offset(pgd, start) (pgd) #ifndef __ASSEMBLY__ static inline int p4d_none(p4d_t p4d) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 329b8c8ca703..db7df7daa0d8 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -491,6 +491,10 @@ static inline int arch_unmap_one(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif +#ifndef pgprot_nx +#define pgprot_nx(prot) (prot) +#endif + #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif @@ -1209,6 +1213,29 @@ static inline bool arch_has_pfn_modify_check(void) # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif +/* + * Page Table Modification bits for pgtbl_mod_mask. + * + * These are used by the p?d_alloc_track*() set of functions an in the generic + * vmalloc/ioremap code to track at which page-table levels entries have been + * modified. Based on that the code can better decide when vmalloc and ioremap + * mapping changes need to be synchronized to other page-tables in the system. + */ +#define __PGTBL_PGD_MODIFIED 0 +#define __PGTBL_P4D_MODIFIED 1 +#define __PGTBL_PUD_MODIFIED 2 +#define __PGTBL_PMD_MODIFIED 3 +#define __PGTBL_PTE_MODIFIED 4 + +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) + +/* Page-Table Modification Mask */ +typedef unsigned int pgtbl_mod_mask; + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 15b765a181b8..22fb11e2d2e0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -272,14 +272,6 @@ void buffer_init(void); * inline definitions */ -static inline void attach_page_buffers(struct page *page, - struct buffer_head *head) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, (unsigned long)head); -} - static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); diff --git a/include/linux/fs.h b/include/linux/fs.h index 109b5d9dbdc7..ef6acd2062eb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -292,6 +292,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@ -375,6 +376,7 @@ struct address_space_operations { */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -976,6 +978,7 @@ struct file { #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1520,6 +1523,9 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@ -2831,6 +2837,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. + */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..bc20bd04c2a2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -155,8 +155,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); -int iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops); +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 977edd3b7bd8..bfe9533bb67e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -45,6 +45,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, @@ -215,9 +216,6 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Upper bound of normal memory consumption range */ - unsigned long high; - /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3468dbb2cad1..6e6c71cdfa13 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1709,6 +1709,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); @@ -2085,13 +2087,54 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } + +static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, + unsigned long address, + pgtbl_mod_mask *mod_mask) + +{ + if (unlikely(pgd_none(*pgd))) { + if (__p4d_alloc(mm, pgd, address)) + return NULL; + *mod_mask |= PGTBL_PGD_MODIFIED; + } + + return p4d_offset(pgd, address); +} + #endif /* !__ARCH_HAS_5LEVEL_HACK */ +static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(p4d_none(*p4d))) { + if (__pud_alloc(mm, p4d, address)) + return NULL; + *mod_mask |= PGTBL_P4D_MODIFIED; + } + + return pud_offset(p4d, address); +} + static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } + +static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(pud_none(*pud))) { + if (__pmd_alloc(mm, pud, address)) + return NULL; + *mod_mask |= PGTBL_PUD_MODIFIED; + } + + return pmd_offset(pud, address); +} #endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS @@ -2207,6 +2250,11 @@ static inline void pgtable_pte_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) +#define pte_alloc_kernel_track(pmd, address, mask) \ + ((unlikely(pmd_none(*(pmd))) && \ + (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ + NULL: pte_offset_kernel(pmd, address)) + #if USE_SPLIT_PMD_PTLOCKS static struct page *pmd_to_page(pmd_t *pmd) @@ -2608,25 +2656,6 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); int __must_check write_one_page(struct page *page); void task_dirty_inc(struct task_struct *tsk); -/* readahead.c */ -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4aba6c0c2ba8..ef6d3aface8a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,7 +240,11 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) #define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) + +static inline void set_page_private(struct page *page, unsigned long private) +{ + page->private = private; +} struct page_frag_cache { void * va; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index acffc3bc6178..fdd9beb5efed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -196,7 +196,6 @@ enum node_stat_item { NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, - NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 001f1fcf9836..f4f5e90a6844 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -13,9 +13,9 @@ #ifdef CONFIG_BLOCK struct writeback_control; +struct readahead_control; -int mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block); +void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index bab7e57f659b..85bd413e784e 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -10,6 +10,7 @@ struct page_counter { atomic_long_t usage; unsigned long min; unsigned long low; + unsigned long high; unsigned long max; struct page_counter *parent; @@ -55,6 +56,13 @@ bool page_counter_try_charge(struct page_counter *counter, void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); + +static inline void page_counter_set_high(struct page_counter *counter, + unsigned long nr_pages) +{ + WRITE_ONCE(counter->high, nr_pages); +} + int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a8f7bd8ea1c6..8e085713150c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error) return; /* Record in wb_err for checkers using errseq_t based tracking */ - filemap_set_wb_err(mapping, error); + __filemap_set_wb_err(mapping, error); + + /* Record it in superblock */ + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) @@ -205,6 +208,43 @@ static inline int page_cache_add_speculative(struct page *page, int count) return __page_cache_add_speculative(page, count); } +/** + * attach_page_private - Attach private data to a page. + * @page: Page to attach data to. + * @data: Data to attach to page. + * + * Attaching private data to a page increments the page's reference count. + * The data must be detached before the page will be freed. + */ +static inline void attach_page_private(struct page *page, void *data) +{ + get_page(page); + set_page_private(page, (unsigned long)data); + SetPagePrivate(page); +} + +/** + * detach_page_private - Detach private data from a page. + * @page: Page to detach data from. + * + * Removes the data that was previously attached to the page and decrements + * the refcount on the page. + * + * Return: Data that was attached to the page. + */ +static inline void *detach_page_private(struct page *page) +{ + void *data = (void *)page_private(page); + + if (!PagePrivate(page)) + return NULL; + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + + return data; +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else @@ -615,6 +655,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, + struct file *, pgoff_t index, unsigned long req_count); +void page_cache_async_readahead(struct address_space *, struct file_ra_state *, + struct file *, struct page *, pgoff_t index, + unsigned long req_count); +void page_cache_readahead_unbounded(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_count); + /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. @@ -631,6 +682,146 @@ static inline int add_to_page_cache(struct page *page, return error; } +/** + * struct readahead_control - Describes a readahead request. + * + * A readahead request is for consecutive pages. Filesystems which + * implement the ->readahead method should call readahead_page() or + * readahead_page_batch() in a loop and attempt to start I/O against + * each page in the request. + * + * Most of the fields in this struct are private and should be accessed + * by the functions below. + * + * @file: The file, used primarily by network filesystems for authentication. + * May be NULL if invoked internally by the filesystem. + * @mapping: Readahead this filesystem object. + */ +struct readahead_control { + struct file *file; + struct address_space *mapping; +/* private: use the readahead_* accessors instead */ + pgoff_t _index; + unsigned int _nr_pages; + unsigned int _batch_count; +}; + +/** + * readahead_page - Get the next page to read. + * @rac: The current readahead request. + * + * Context: The page is locked and has an elevated refcount. The caller + * should decreases the refcount once the page has been submitted for I/O + * and unlock the page once all I/O to that page has completed. + * Return: A pointer to the next page, or %NULL if we are done. + */ +static inline struct page *readahead_page(struct readahead_control *rac) +{ + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + + if (!rac->_nr_pages) { + rac->_batch_count = 0; + return NULL; + } + + page = xa_load(&rac->mapping->i_pages, rac->_index); + VM_BUG_ON_PAGE(!PageLocked(page), page); + rac->_batch_count = hpage_nr_pages(page); + + return page; +} + +static inline unsigned int __readahead_batch(struct readahead_control *rac, + struct page **array, unsigned int array_sz) +{ + unsigned int i = 0; + XA_STATE(xas, &rac->mapping->i_pages, 0); + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + rac->_batch_count = 0; + + xas_set(&xas, rac->_index); + rcu_read_lock(); + xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); + array[i++] = page; + rac->_batch_count += hpage_nr_pages(page); + + /* + * The page cache isn't using multi-index entries yet, + * so the xas cursor needs to be manually moved to the + * next index. This can be removed once the page cache + * is converted. + */ + if (PageHead(page)) + xas_set(&xas, rac->_index + rac->_batch_count); + + if (i == array_sz) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * readahead_page_batch - Get a batch of pages to read. + * @rac: The current readahead request. + * @array: An array of pointers to struct page. + * + * Context: The pages are locked and have an elevated refcount. The caller + * should decreases the refcount once the page has been submitted for I/O + * and unlock the page once all I/O to that page has completed. + * Return: The number of pages placed in the array. 0 indicates the request + * is complete. + */ +#define readahead_page_batch(rac, array) \ + __readahead_batch(rac, array, ARRAY_SIZE(array)) + +/** + * readahead_pos - The byte offset into the file of this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_pos(struct readahead_control *rac) +{ + return (loff_t)rac->_index * PAGE_SIZE; +} + +/** + * readahead_length - The number of bytes in this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_length(struct readahead_control *rac) +{ + return (loff_t)rac->_nr_pages * PAGE_SIZE; +} + +/** + * readahead_index - The index of the first page in this readahead request. + * @rac: The readahead request. + */ +static inline pgoff_t readahead_index(struct readahead_control *rac) +{ + return rac->_index; +} + +/** + * readahead_count - The number of pages in this readahead request. + * @rac: The readahead request. + */ +static inline unsigned int readahead_count(struct readahead_control *rac) +{ + return rac->_nr_pages; +} + static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index a67065c403c3..2a3a95586425 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -13,7 +13,8 @@ struct ptdump_range { struct ptdump_state { /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, unsigned long val); + int level, u64 val); + void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 57d0ed061ae4..33bb7c539246 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1495,7 +1495,8 @@ extern struct pid *cad_pid; #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, + * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 25181d2dd0b9..e92176fc8824 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -183,12 +183,17 @@ enum { #define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ -#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ +/* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ -#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */ -#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ + +/* Special value in first swap_map */ +#define SWAP_MAP_MAX 0x3e /* Max count */ +#define SWAP_MAP_BAD 0x3f /* Note page is bad */ +#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ + +/* Special value in each swap_map continuation */ +#define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk @@ -247,6 +252,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ @@ -409,7 +415,6 @@ extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); -extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a95d3cc74d79..48bb681e6c2a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -88,8 +88,7 @@ struct vmap_area { * Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); -extern void *vm_map_ram(struct page **pages, unsigned int count, - int node, pgprot_t prot); +extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU @@ -107,26 +106,16 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -#ifndef CONFIG_MMU -extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, - gfp_t flags, void *caller) -{ - return __vmalloc_node_flags(size, node, flags); -} -#else -extern void *__vmalloc_node_flags_caller(unsigned long size, - int node, gfp_t flags, void *caller); -#endif +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); @@ -141,8 +130,22 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); -void vmalloc_sync_mappings(void); -void vmalloc_sync_unmappings(void); + +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() + * needs to be called. + */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). It is + * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK + * is 0. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); /* * Lowlevel-APIs (not for driver use!) @@ -161,8 +164,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area) extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); -extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, @@ -170,11 +171,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); -extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -191,14 +192,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size, { return size >> PAGE_SHIFT; } +#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { } -static inline void -unmap_kernel_range(unsigned long addr, unsigned long size) -{ -} +#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2219cce81ca4..0fdbf653b173 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,7 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when PGTABLE_MAPPING is selected. + * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 27f5caa6299a..bf9806fd1306 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -113,10 +113,10 @@ TRACE_EVENT(erofs_readpage, TRACE_EVENT(erofs_readpages, - TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage, + TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage, bool raw), - TP_ARGS(inode, page, nrpage, raw), + TP_ARGS(inode, start, nrpage, raw), TP_STRUCT__entry( __field(dev_t, dev ) @@ -129,7 +129,7 @@ TRACE_EVENT(erofs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; - __entry->start = page->index; + __entry->start = start; __entry->nrpage = nrpage; __entry->raw = raw; ), diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index d97adfc327f0..24c2557c37f0 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1376,9 +1376,9 @@ TRACE_EVENT(f2fs_writepages, TRACE_EVENT(f2fs_readpages, - TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage), + TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage), - TP_ARGS(inode, page, nrpage), + TP_ARGS(inode, start, nrpage), TP_STRUCT__entry( __field(dev_t, dev) @@ -1390,7 +1390,7 @@ TRACE_EVENT(f2fs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = page->index; + __entry->start = start; __entry->nrpage = nrpage; ), diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 85a33bea76f1..10f5d1fa7347 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -541,7 +541,6 @@ TRACE_EVENT(global_dirty_state, TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) - __field(unsigned long, nr_unstable) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) @@ -552,7 +551,6 @@ TRACE_EVENT(global_dirty_state, TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); - __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; @@ -560,12 +558,11 @@ TRACE_EVENT(global_dirty_state, __entry->dirty_limit = global_wb_domain.dirty_limit; ), - TP_printk("dirty=%lu writeback=%lu unstable=%lu " + TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, - __entry->nr_unstable, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14aa1f74dd10..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e6dee19a668..42c7a42fc9c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -281,27 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..914ff5a58dd5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. * Cannot be used in non-sleeping contexts @@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/module.c b/kernel/module.c index 64a2b4daaaa5..a0f201d2e184 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2951,8 +2951,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/sys.c b/kernel/sys.c index b4a0324a0699..891667a49bb7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..f12e99b387b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8527,18 +8527,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 929211039bac..27bcc2568c95 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -63,7 +63,7 @@ config UBSAN_SANITIZE_ALL config UBSAN_ALIGNMENT bool "Enable checks for pointers alignment" default !HAVE_EFFICIENT_UNALIGNED_ACCESS - depends on !X86 || !COMPILE_TEST + depends on !UBSAN_TRAP help This option enables the check of unaligned memory accesses. Enabling this option on architectures that support unaligned diff --git a/lib/ioremap.c b/lib/ioremap.c index 3f0e18543de8..ad485f08173b 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -61,13 +61,14 @@ static inline int ioremap_pmd_enabled(void) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -75,6 +76,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -101,21 +103,24 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { + *mask |= PGTBL_PMD_MODIFIED; continue; + } - if (ioremap_pte_range(pmd, addr, next, phys_addr, prot)) + if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -144,21 +149,24 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, } static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) + if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) { + *mask |= PGTBL_PUD_MODIFIED; continue; + } - if (ioremap_pmd_range(pud, addr, next, phys_addr, prot)) + if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -187,21 +195,24 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, } static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { + *mask |= PGTBL_P4D_MODIFIED; continue; + } - if (ioremap_pud_range(p4d, addr, next, phys_addr, prot)) + if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -214,6 +225,7 @@ int ioremap_page_range(unsigned long addr, unsigned long start; unsigned long next; int err; + pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); @@ -222,13 +234,17 @@ int ioremap_page_range(unsigned long addr, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot); + err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot, + &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); flush_cache_vmap(start, end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + return err; } diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 8bbefcaddfe8..ddc9685702b1 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -91,12 +91,8 @@ static int random_size_align_alloc_test(void) */ size = ((rnd % 10) + 1) * PAGE_SIZE; - ptr = __vmalloc_node_range(size, align, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; @@ -118,12 +114,8 @@ static int align_shift_alloc_test(void) for (i = 0; i < BITS_PER_LONG; i++) { align = ((unsigned long) 1) << i; - ptr = __vmalloc_node_range(PAGE_SIZE, align, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL|__GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; @@ -139,13 +131,9 @@ static int fix_align_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = __vmalloc_node_range(5 * PAGE_SIZE, - THREAD_ALIGN << 1, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1, + GFP_KERNEL | __GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..5c0362bd8d56 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,9 +705,9 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config PGTABLE_MAPPING +config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC + depends on ZSMALLOC=y help By default, zsmalloc uses a copy-based object mapping method to access allocations that span two pages. However, if a particular diff --git a/mm/debug.c b/mm/debug.c index 2189357f0987..f2ede2df585a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason) else if (PageAnon(page)) type = "anon "; else if (mapping) { - if (mapping->host && mapping->host->i_dentry.first) { - struct dentry *dentry; - dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); - } else - pr_warn("%ps\n", mapping->a_ops); + const struct inode *host; + const struct address_space_operations *a_ops; + const struct hlist_node *dentry_first; + const struct dentry *dentry_ptr; + struct dentry dentry; + + /* + * mapping can be invalid pointer and we don't want to crash + * accessing it, so probe everything depending on it carefully + */ + if (probe_kernel_read_strict(&host, &mapping->host, + sizeof(struct inode *)) || + probe_kernel_read_strict(&a_ops, &mapping->a_ops, + sizeof(struct address_space_operations *))) { + pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); + goto out_mapping; + } + + if (!host) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + if (probe_kernel_read_strict(&dentry_first, + &host->i_dentry.first, sizeof(struct hlist_node *))) { + pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", + a_ops, host); + goto out_mapping; + } + + if (!dentry_first) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (probe_kernel_read_strict(&dentry, dentry_ptr, + sizeof(struct dentry))) { + pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", + a_ops, dentry_ptr); + } else { + /* + * if dentry is corrupted, the %pd handler may still + * crash, but it's unlikely that we reach here with a + * corrupted struct page + */ + pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n", + a_ops, &dentry); + } } +out_mapping: BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, diff --git a/mm/fadvise.c b/mm/fadvise.c index 4f17c83db575..0e66f2aaeea3 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -22,6 +22,8 @@ #include +#include "internal.h" + /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. @@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) if (!nrpages) nrpages = ~0UL; - /* - * Ignore return value because fadvise() shall return - * success even if filesystem can't retrieve a hint, - */ force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: diff --git a/mm/filemap.c b/mm/filemap.c index 23a051a7ef0f..fe079e9219d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2566,7 +2566,6 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; diff --git a/mm/gup.c b/mm/gup.c index 4aa2f5ab6e1f..3edf740a3897 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1183,7 +1183,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, return true; } -/* +/** * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1191,7 +1191,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller - * does not allow retry + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -1854,7 +1855,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } -/* +/** * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1885,13 +1886,13 @@ static long __get_user_pages_remote(struct task_struct *tsk, * * Must be called with mmap_sem held for read or write. * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different + * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page @@ -1903,17 +1904,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * - * get_user_pages should be phased out in favor of + * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass + * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -1952,7 +1953,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, } #endif /* !CONFIG_MMU */ -/* +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1975,11 +1986,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * +/** * get_user_pages_locked() is suitable to replace the form: * * down_read(&mm->mmap_sem); @@ -1995,6 +2002,21 @@ EXPORT_SYMBOL(get_user_pages); * get_user_pages_locked(tsk, mm, ..., pages, &locked); * if (locked) * up_read(&mm->mmap_sem); + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -2971,3 +2993,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. + */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..f762a34b0c57 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -extern unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void force_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read); +void __do_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size); /* * Submit IO for the read-ahead request in file_ra_state. */ -static inline unsigned long ra_submit(struct file_ra_state *ra, +static inline void ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { - return __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); } /** diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index de3121848ddf..d532c2587731 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -15,14 +15,19 @@ CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 -CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_init.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_quarantine.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_tags_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING +CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack) +CC_FLAGS_KASAN_RUNTIME += $(call cc-option, -fno-stack-protector) +# Disable branch tracing to avoid recursion. +CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o init.o report.o obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2906358e42f0..757d4074fe28 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm) } #endif -extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -extern bool report_enabled(void); - -bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) -{ - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } - - user_access_restore(flags); - - return ret; -} - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 80f23c9da6b0..51ec45407a0b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr) } } -bool report_enabled(void) +static bool report_enabled(void) { if (current->kasan_depth) return false; @@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +static void __kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) { struct kasan_access_info info; void *tagged_addr; @@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon end_report(&flags); } +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + unsigned long flags = user_access_save(); + bool ret = false; + + if (likely(report_enabled())) { + __kasan_report(addr, size, is_write, ip); + ret = true; + } + + user_access_restore(flags); + + return ret; +} + #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3b97f103966..f973a025569b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1314,7 +1314,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.max); - if (count <= limit) + if (count < limit) margin = min(margin, limit - count); else margin = 0; @@ -1451,6 +1451,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_buf_printf(&s, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, WORKINGSET_RESTORE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); @@ -2250,7 +2252,8 @@ static void reclaim_high(struct mem_cgroup *memcg, gfp_t gfp_mask) { do { - if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high)) + if (page_counter_read(&memcg->memory) <= + READ_ONCE(memcg->memory.high)) continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); @@ -2319,40 +2322,63 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14 +static u64 calculate_overage(unsigned long usage, unsigned long high) +{ + u64 overage; + + if (usage <= high) + return 0; + + /* + * Prevent division by 0 in overage calculation by acting as if + * it was a threshold of 1 page + */ + high = max(high, 1UL); + + overage = usage - high; + overage <<= MEMCG_DELAY_PRECISION_SHIFT; + return div64_u64(overage, high); +} + +static u64 mem_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->memory), + READ_ONCE(memcg->memory.high)); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} + +static u64 swap_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->swap), + READ_ONCE(memcg->swap.high)); + if (overage) + memcg_memory_event(memcg, MEMCG_SWAP_HIGH); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} + /* * Get the number of jiffies that we should penalise a mischievous cgroup which * is exceeding its memory.high by checking both it and its ancestors. */ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, - unsigned int nr_pages) + unsigned int nr_pages, + u64 max_overage) { unsigned long penalty_jiffies; - u64 max_overage = 0; - - do { - unsigned long usage, high; - u64 overage; - - usage = page_counter_read(&memcg->memory); - high = READ_ONCE(memcg->high); - - if (usage <= high) - continue; - - /* - * Prevent division by 0 in overage calculation by acting as if - * it was a threshold of 1 page - */ - high = max(high, 1UL); - - overage = usage - high; - overage <<= MEMCG_DELAY_PRECISION_SHIFT; - overage = div64_u64(overage, high); - - if (overage > max_overage) - max_overage = overage; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); if (!max_overage) return 0; @@ -2377,14 +2403,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or * larger the current charge patch is than that. */ - penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; - - /* - * Clamp the max delay per usermode return so as to still keep the - * application moving forwards and also permit diagnostics, albeit - * extremely slowly. - */ - return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; } /* @@ -2409,7 +2428,18 @@ void mem_cgroup_handle_over_high(void) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. */ - penalty_jiffies = calculate_high_delay(memcg, nr_pages); + penalty_jiffies = calculate_high_delay(memcg, nr_pages, + mem_find_max_overage(memcg)); + + penalty_jiffies += calculate_high_delay(memcg, nr_pages, + swap_find_max_overage(memcg)); + + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); /* * Don't sleep if the amount of jiffies this memcg owes us is so low @@ -2594,12 +2624,32 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) { - /* Don't bother a random interrupted task */ - if (in_interrupt()) { + bool mem_high, swap_high; + + mem_high = page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high); + swap_high = page_counter_read(&memcg->swap) > + READ_ONCE(memcg->swap.high); + + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + if (mem_high) { schedule_work(&memcg->high_work); break; } + continue; + } + + if (mem_high || swap_high) { + /* + * The allocating tasks in this cgroup will need to do + * reclaim or be throttled to prevent further growth + * of the memory or swap footprints. + * + * Target some best-effort fairness between the tasks, + * and distribute reclaim work and delay penalties + * based on how much each task is actually allocating. + */ current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -2802,7 +2852,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, static inline bool memcg_kmem_bypass(void) { - if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + if (in_interrupt()) + return true; + + /* Allow remote memcg charging in kthread contexts. */ + if ((!current->mm || (current->flags & PF_KTHREAD)) && + !current->active_memcg) return true; return false; } @@ -4330,7 +4385,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); - /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); @@ -4338,7 +4392,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -5063,8 +5117,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; @@ -5216,8 +5271,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); } @@ -6015,7 +6071,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -6032,7 +6089,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - WRITE_ONCE(memcg->high, high); + page_counter_set_high(&memcg->memory, high); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6227,7 +6284,6 @@ static struct cftype memory_files[] = { }, { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, { @@ -7131,10 +7187,13 @@ bool mem_cgroup_swap_full(struct page *page) if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= - READ_ONCE(memcg->swap.max)) + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) return true; + } return false; } @@ -7164,6 +7223,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + static int swap_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7191,6 +7273,8 @@ static int swap_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); seq_printf(m, "fail %lu\n", @@ -7205,6 +7289,12 @@ static struct cftype swap_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = swap_current_read, }, + { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_high_show, + .write = swap_high_write, + }, { .name = "swap.max", .flags = CFTYPE_NOT_ON_ROOT, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a96364be8ab4..dd3862fcf2e9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -210,14 +210,17 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) { struct task_struct *t = tk->tsk; short addr_lsb = tk->size_shift; - int ret; + int ret = 0; - pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + if ((t->mm == current->mm) || !(flags & MF_ACTION_REQUIRED)) + pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); - if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { - ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, - addr_lsb); + if (flags & MF_ACTION_REQUIRED) { + if (t->mm == current->mm) + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); + /* send no signal to non-current processes */ } else { /* * Don't use force here, it's convenient if the signal diff --git a/mm/memory.c b/mm/memory.c index f703fe8c8346..21438278afca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -802,8 +802,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; - } else if (pte_devmap(pte)) { - page = pte_page(pte); } out_set_pte: diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..fb425d86c115 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -797,10 +797,7 @@ recheck_buffers: if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); + attach_page_private(newpage, detach_page_private(page)); get_page(newpage); bh = head; @@ -810,8 +807,6 @@ recheck_buffers: } while (bh != head); - SetPagePrivate(newpage); - if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else @@ -1032,7 +1027,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, diff --git a/mm/mm_init.c b/mm/mm_init.c index 7da6991d9435..435e5f794b3b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_CPUPID_SHIFT); + LAST_CPUPID_SHIFT, + KASAN_TAG_WIDTH); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_CPUPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT, + (unsigned long)KASAN_TAG_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), diff --git a/mm/nommu.c b/mm/nommu.c index 318df4e236c9..dfae55f41901 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -150,16 +150,25 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); +} + +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) +{ + return __vmalloc(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; @@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_user_flags(size, flags | __GFP_ZERO); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); @@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +304,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +316,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); @@ -351,7 +353,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; @@ -369,18 +371,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_[un]mapping() if the architecture - * chose not to have one. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..718565266257 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void) * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_task(tsk)) dirty += dirty / 4; return dirty; @@ -505,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; @@ -759,7 +757,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1567,7 +1565,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; @@ -1587,14 +1585,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); @@ -1653,8 +1644,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv = dirty_poll_interval(dirty, thresh); - unsigned long m_intv = ULONG_MAX; + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; @@ -1673,9 +1668,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(gdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); @@ -1689,9 +1695,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(mdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. + */ + goto free_running; + } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); @@ -1938,8 +1955,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) @@ -2164,7 +2180,6 @@ int write_cache_pages(struct address_space *mapping, int error; struct pagevec pvec; int nr_pages; - pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -2173,8 +2188,7 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec); if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; + index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbf030160205..ca864102bebe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5319,7 +5319,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5332,7 +5332,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_SLAB_RECLAIMABLE), global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), @@ -5365,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5387,7 +5385,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -8253,7 +8250,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* diff --git a/mm/percpu.c b/mm/percpu.c index 7da7d7737dab..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -482,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** diff --git a/mm/ptdump.c b/mm/ptdump.c index 26208d0d03b7..f4ce916f5602 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + if (pgd_leaf(val)) st->note_page(st, addr, 0, pgd_val(val)); @@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + if (p4d_leaf(val)) st->note_page(st, addr, 1, p4d_val(val)); @@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + if (pud_leaf(val)) st->note_page(st, addr, 2, pud_val(val)); @@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); if (pmd_leaf(val)) st->note_page(st, addr, 3, pmd_val(val)); @@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; + pte_t val = READ_ONCE(*pte); - st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); + + st->note_page(st, addr, 4, pte_val(val)); return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index 2fe72cd29b47..3c9a8dd7c56c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" @@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); -static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned int nr_pages, gfp_t gfp) +static void read_pages(struct readahead_control *rac, struct list_head *pages, + bool skip_page) { + const struct address_space_operations *aops = rac->mapping->a_ops; + struct page *page; struct blk_plug plug; - unsigned page_idx; - int ret; + + if (!readahead_count(rac)) + goto out; blk_start_plug(&plug); - if (mapping->a_ops->readpages) { - ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { + aops->readpages(rac->file, rac->mapping, pages, + readahead_count(rac)); /* Clean up the remaining pages */ put_pages_list(pages); - goto out; + rac->_index += rac->_nr_pages; + rac->_nr_pages = 0; + } else { + while ((page = readahead_page(rac))) { + aops->readpage(rac->file, page); + put_page(page); + } } - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) - mapping->a_ops->readpage(filp, page); - put_page(page); - } - ret = 0; - -out: blk_finish_plug(&plug); - return ret; + BUG_ON(!list_empty(pages)); + BUG_ON(readahead_count(rac)); + +out: + if (skip_page) + rac->_index++; } -/* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates - * the pages first, then submits them for I/O. This avoids the very bad - * behaviour which would occur if page allocations are causing VM writeback. - * We really don't want to intermingle reads and writes like that. +/** + * page_cache_readahead_unbounded - Start unchecked readahead. + * @mapping: File address space. + * @file: This instance of the open file; used for authentication. + * @index: First page index to read. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. * - * Returns the number of pages requested, or the maximum amount of I/O allowed. + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. + * May sleep, but will not reenter filesystem to reclaim memory. */ -unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void page_cache_readahead_unbounded(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; - struct page *page; - unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); - int page_idx; - unsigned int nr_pages = 0; - loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); + struct readahead_control rac = { + .mapping = mapping, + .file = file, + ._index = index, + }; + unsigned long i; - if (isize == 0) - goto out; - - end_index = ((isize - 1) >> PAGE_SHIFT); + /* + * Partway through the readahead operation, we will have added + * locked pages to the page cache, but will not yet have submitted + * them for I/O. Adding another page may need to allocate memory, + * which can trigger memory reclaim. Telling the VM we're in + * the middle of a filesystem operation will cause it to not + * touch file-backed pages, preventing a deadlock. Most (all?) + * filesystems already specify __GFP_NOFS in their mapping's + * gfp_mask, but let's be explicit here. + */ + unsigned int nofs = memalloc_nofs_save(); /* * Preallocate as many pages as we will need. */ - for (page_idx = 0; page_idx < nr_to_read; page_idx++) { - pgoff_t page_offset = offset + page_idx; + for (i = 0; i < nr_to_read; i++) { + struct page *page = xa_load(&mapping->i_pages, index + i); - if (page_offset > end_index) - break; + BUG_ON(index + i != rac._index + rac._nr_pages); - page = xa_load(&mapping->i_pages, page_offset); if (page && !xa_is_value(page)) { /* - * Page already present? Kick off the current batch of - * contiguous pages before continuing with the next - * batch. + * Page already present? Kick off the current batch + * of contiguous pages before continuing with the + * next batch. This page may be the one we would + * have intended to mark as Readahead, but we don't + * have a stable reference to this page, and it's + * not worth getting one just for that. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, - gfp_mask); - nr_pages = 0; + read_pages(&rac, &page_pool, true); continue; } page = __page_cache_alloc(gfp_mask); if (!page) break; - page->index = page_offset; - list_add(&page->lru, &page_pool); - if (page_idx == nr_to_read - lookahead_size) + if (mapping->a_ops->readpages) { + page->index = index + i; + list_add(&page->lru, &page_pool); + } else if (add_to_page_cache_lru(page, mapping, index + i, + gfp_mask) < 0) { + put_page(page); + read_pages(&rac, &page_pool, true); + continue; + } + if (i == nr_to_read - lookahead_size) SetPageReadahead(page); - nr_pages++; + rac._nr_pages++; } /* @@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); - BUG_ON(!list_empty(&page_pool)); -out: - return nr_pages; + read_pages(&rac, &page_pool, false); + memalloc_nofs_restore(nofs); +} +EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); + +/* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +void __do_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_readahead_unbounded(mapping, file, index, nr_to_read, + lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) +void force_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t index, unsigned long nr_to_read) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) - return -EINVAL; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) + return; /* * If the request exceeds the readahead window, allow the read to @@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); + __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); - offset += this_chunk; + index += this_chunk; nr_to_read -= this_chunk; } - return 0; } /* @@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* - * Count contiguously cached pages from @offset-1 to @offset-@max, + * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - pgoff_t offset, unsigned long max) + pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); - head = page_cache_prev_miss(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); - return offset - 1 - head; + return index - 1 - head; } /* @@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping, */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, - pgoff_t offset, + pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; - size = count_history_pages(mapping, offset, max); + size = count_history_pages(mapping, index, max); /* * not enough history pages: @@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping, * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ - if (size >= offset) + if (size >= index) size *= 2; - ra->start = offset; + ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; @@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm for trivial sequential/random reads. */ -static unsigned long -ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t offset, - unsigned long req_size) +static void ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t index, + unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - pgoff_t prev_offset; + pgoff_t prev_index; /* * If the request exceeds the readahead window, allow the read to @@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping, /* * start of file */ - if (!offset) + if (!index) goto initial_readahead; /* - * It's the expected callback offset, assume sequential access. + * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if ((offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((index == (ra->start + ra->size - ra->async_size) || + index == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, index + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max_pages) - return 0; + if (!start || start - index > max_pages) + return; ra->start = start; - ra->size = start - offset; /* old async_size */ + ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss - * trivial case: (offset - prev_offset) == 1 - * unaligned reads: (offset - prev_offset) == 0 + * trivial case: (index - prev_index) == 1 + * unaligned reads: (index - prev_index) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; - if (offset - prev_offset <= 1UL) + prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; + if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) + if (try_context_readahead(mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ - return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + __do_page_cache_readahead(mapping, filp, index, req_size, 0); + return; initial_readahead: - ra->start = offset; + ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; @@ -478,7 +537,7 @@ readit: * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. */ - if (offset == ra->start && ra->size == ra->async_size) { + if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; @@ -489,7 +548,7 @@ readit: } } - return ra_submit(ra, mapping, filp); + ra_submit(ra, mapping, filp); } /** @@ -497,9 +556,8 @@ readit: * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more @@ -508,7 +566,7 @@ readit: */ void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - pgoff_t offset, unsigned long req_size) + pgoff_t index, unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping, /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, offset, req_size); + force_page_cache_readahead(mapping, filp, index, req_count); return; } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, offset, req_size); + ondemand_readahead(mapping, ra, filp, false, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_readahead); @@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @page: the page at @offset which has the PG_readahead flag set - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which - * has the PG_readahead flag; this is a marker to suggest that the application + * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t offset, - unsigned long req_size) + struct page *page, pgoff_t index, + unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping, return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, offset, req_size); + ondemand_readahead(mapping, ra, filp, true, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/slab_common.c b/mm/slab_common.c index 23c7500eea7d..9e72ba224175 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( kmalloc_info[i].name[KMALLOC_DMA], kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, 0); + SLAB_CACHE_DMA | flags, 0, + kmalloc_info[i].size); } } #endif diff --git a/mm/slub.c b/mm/slub.c index b762450fc9f0..2c56cc9e4ff2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -679,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) va_end(args); } +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, page, nextfree)) { + object_err(s, page, freelist, "Freechain corrupt"); + freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } + + return false; +} + static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ @@ -1410,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + return false; +} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -2093,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *prior; unsigned long counters; + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist' is already corrupted. So isolate all objects + * starting at 'freelist'. + */ + if (freelist_corrupted(s, page, freelist, nextfree)) + break; + do { prior = page->freelist; counters = page->counters; @@ -3739,12 +3766,14 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) + const char *text, unsigned long *map) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map; + + if (!map) + return; slab_err(s, page, text, s->name); slab_lock(page); @@ -3757,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } - put_map(map); - slab_unlock(page); #endif } @@ -3772,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; + unsigned long *map = NULL; + +#ifdef CONFIG_SLUB_DEBUG + map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); +#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3781,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()"); + "Objects remaining in %s on __kmem_cache_shutdown()", + map); } } spin_unlock_irq(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + bitmap_free(map); +#endif + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -5654,7 +5691,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && + !IS_ENABLED(CONFIG_SLUB_STATS)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5688,19 +5726,6 @@ static struct kobj_type slab_ktype = { .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &slab_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) @@ -5768,7 +5793,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); out: kobject_put(&s->kobj); } @@ -5826,7 +5850,6 @@ static int sysfs_slab_add(struct kmem_cache *s) } #endif - kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -5907,7 +5930,7 @@ static int __init slab_sysfs_init(void) mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); diff --git a/mm/swap_state.c b/mm/swap_state.c index ebed37bbf7a3..8238954ae781 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -509,10 +509,11 @@ static unsigned long swapin_nr_pages(unsigned long offset) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); - pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, + max_pages, atomic_read(&last_readahead_pages)); if (!hits) - prev_offset = offset; + WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; diff --git a/mm/swapfile.c b/mm/swapfile.c index 5871a2aa86a5..63ac67208453 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -601,7 +601,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, { struct percpu_cluster *cluster; struct swap_cluster_info *ci; - bool found_free; unsigned long tmp, max; new_cluster: @@ -614,17 +613,17 @@ new_cluster: } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); - *scan_base = *offset = si->cluster_next; + *scan_base = this_cpu_read(*si->cluster_next_cpu); + *offset = *scan_base; goto new_cluster; } else return false; } - found_free = false; - /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster @@ -632,27 +631,23 @@ new_cluster: tmp = cluster->next; max = min_t(unsigned long, si->max, (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); - if (tmp >= max) { - cluster_set_null(&cluster->index); - goto new_cluster; - } - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (!si->swap_map[tmp]) { - found_free = true; - break; + if (tmp < max) { + ci = lock_cluster(si, tmp); + while (tmp < max) { + if (!si->swap_map[tmp]) + break; + tmp++; } - tmp++; + unlock_cluster(ci); } - unlock_cluster(ci); - if (!found_free) { + if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; } cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; - return found_free; + return true; } static void __del_from_avail_list(struct swap_info_struct *p) @@ -729,6 +724,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, } } +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +{ + unsigned long prev; + + if (!(si->flags & SWP_SOLIDSTATE)) { + si->cluster_next = next; + return; + } + + prev = this_cpu_read(*si->cluster_next_cpu); + /* + * Cross the swap address space size aligned trunk, choose + * another trunk randomly to avoid lock contention on swap + * address space if possible. + */ + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { + /* No free swap slots available */ + if (si->highest_bit <= si->lowest_bit) + return; + next = si->lowest_bit + + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); + next = max_t(unsigned int, next, si->lowest_bit); + } + this_cpu_write(*si->cluster_next_cpu, next); +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -739,9 +762,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int n_ret = 0; - - if (nr > SWAP_BATCH) - nr = SWAP_BATCH; + bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -755,17 +776,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si, */ si->flags += SWP_SCANNING; - scan_base = offset = si->cluster_next; + /* + * Use percpu scan base for SSD to reduce lock contention on + * cluster and swap cache. For HDD, sequential access is more + * important. + */ + if (si->flags & SWP_SOLIDSTATE) + scan_base = this_cpu_read(*si->cluster_next_cpu); + else + scan_base = si->cluster_next; + offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) - goto checks; - else + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; - } - - if (unlikely(!si->cluster_nr--)) { + } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -848,7 +874,6 @@ checks: unlock_cluster(ci); swap_range_alloc(si, offset, 1); - si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -871,19 +896,33 @@ checks: if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto checks; - else - goto done; - } - /* non-ssd case */ - ++offset; - - /* non-ssd case, still more slots in cluster? */ - if (si->cluster_nr && !si->swap_map[offset]) { + } else if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } + /* + * Even if there's no free clusters available (fragmented), + * try to scan a little more quickly with lock held unless we + * have scanned too many slots already. + */ + if (!scanned_many) { + unsigned long scan_limit; + + if (offset < scan_base) + scan_limit = scan_base; + else + scan_limit = si->highest_bit; + for (; offset <= scan_limit && --latency_ration > 0; + offset++) { + if (!si->swap_map[offset]) + goto checks; + } + } + done: + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; @@ -901,6 +940,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } } offset = si->lowest_bit; @@ -916,6 +956,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } offset++; } @@ -1004,11 +1045,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) if (avail_pgs <= 0) goto noswap; - if (n_goal > SWAP_BATCH) - n_goal = SWAP_BATCH; - - if (n_goal > avail_pgs) - n_goal = avail_pgs; + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); @@ -1275,13 +1312,14 @@ unlock_out: } static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, usage); + usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); @@ -1316,7 +1354,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) - __swap_entry_free(p, entry, 1); + __swap_entry_free(p, entry); } /* @@ -1739,7 +1777,7 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { - count = __swap_entry_free(p, entry, 1); + count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), @@ -1937,10 +1975,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); swap_map = &si->swap_map[offset]; - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + page = lookup_swap_cache(entry, vma, addr); + if (!page) { + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + &vmf); + } if (!page) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; @@ -2650,6 +2692,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); @@ -2757,20 +2801,24 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; + unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } + bytes = si->pages << (PAGE_SHIFT - 10); + inuse = si->inuse_pages << (PAGE_SHIFT - 10); + file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", - si->pages << (PAGE_SHIFT - 10), - si->inuse_pages << (PAGE_SHIFT - 10), + bytes, bytes < 10000000 ? "\t" : "", + inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } @@ -3202,11 +3250,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } /* * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), @@ -3322,6 +3378,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -3654,7 +3712,7 @@ static bool swap_count_continued(struct swap_info_struct *si, spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; - page = list_entry(head->lru.next, struct page, lru); + page = list_next_entry(head, lru); map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ @@ -3666,13 +3724,13 @@ static bool swap_count_continued(struct swap_info_struct *si, */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; @@ -3682,12 +3740,10 @@ init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = true; /* incremented */ @@ -3698,7 +3754,7 @@ init_map: *map = 0; /* we didn't zero the page */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } @@ -3707,13 +3763,11 @@ init_map: *map = 0; /* we didn't zero the page */ if (*map == 0) count = 0; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = count == COUNT_CONTINUED; } diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..6d5868adbe18 100644 --- a/mm/util.c +++ b/mm/util.c @@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a8227afa073..1e94497b7388 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -69,7 +69,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -78,73 +79,118 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } -static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @start: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - pgd_t *pgd; + unsigned long end = start + size; unsigned long next; + pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); + start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -153,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. */ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -166,94 +212,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/* - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array. +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long start = addr; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); - return nr; + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return 0; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; } @@ -1222,14 +1291,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); -/* - * Clear the pagetable entries of a given vmap_area - */ -static void unmap_vmap_area(struct vmap_area *va) -{ - vunmap_page_range(va->va_start, va->va_end); -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -1292,12 +1353,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) if (unlikely(valist == NULL)) return false; - /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_unmappings(); - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. @@ -1391,7 +1446,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_vmap_area(va); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1665,7 +1720,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) return vaddr; } -static void vb_free(const void *addr, unsigned long size) +static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned long vb_idx; @@ -1675,24 +1730,22 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + flush_cache_vunmap(addr, addr + size); order = get_order(size); - offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); - offset >>= PAGE_SHIFT; + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; - vb_idx = addr_to_vb_idx((unsigned long)addr); + vb_idx = addr_to_vb_idx(addr); rcu_read_lock(); vb = radix_tree_lookup(&vmap_block_tree, vb_idx); rcu_read_unlock(); BUG_ON(!vb); - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + unmap_kernel_range_noflush(addr, size); if (debug_pagealloc_enabled_static()) - flush_tlb_kernel_range((unsigned long)addr, - (unsigned long)addr + size); + flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); @@ -1792,7 +1845,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); - vb_free(mem, size); + vb_free(addr, size); return; } @@ -1819,7 +1872,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1843,7 +1896,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -1987,51 +2040,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. - */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); - /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap @@ -2045,22 +2053,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size; flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); } -EXPORT_SYMBOL_GPL(unmap_kernel_range); - -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); - int err; - - err = vmap_page_range(addr, end, prot, pages); - - return err > 0 ? 0 : err; -} -EXPORT_SYMBOL_GPL(map_vm_area); static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) @@ -2128,14 +2123,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) @@ -2441,7 +2428,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2450,9 +2438,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2470,7 +2455,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - PAGE_KERNEL, node, area->caller); + node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2504,8 +2489,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: @@ -2573,27 +2560,16 @@ fail: return NULL; } -/* - * This is only for performance analysis of vmalloc and stress purpose. - * It is required by vmalloc test module, therefore do not use it other - * than that. - */ -#ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node_range); -#endif - /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level allocator with + * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported @@ -2603,35 +2579,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, 0, node, caller); + gfp_mask, PAGE_KERNEL, 0, node, caller); } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node); +#endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - - -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -2646,8 +2615,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2666,8 +2635,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2704,8 +2673,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -2718,38 +2687,15 @@ EXPORT_SYMBOL(vmalloc_node); * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * - * For tight control over page level allocator and protection flags - * use __vmalloc_node() instead. - * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size @@ -2793,8 +2739,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -3137,21 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose - * not to have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; diff --git a/mm/vmscan.c b/mm/vmscan.c index a37c87b5aee2..b2f5deb3603c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1878,13 +1878,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, /* * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ static int current_may_throttle(void) { - return !(current->flags & PF_LESS_THROTTLE) || + return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || bdi_write_congested(current->backing_dev_info); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 2435d2c24657..5e241434cab2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1165,7 +1165,6 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1726,6 +1725,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2f836a2b993f..f6dc0673e62c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,7 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING struct vm_struct *vm; /* vm area for mapping object that span pages */ #else char *vm_buf; /* copy buffer for objects that span pages */ @@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area, unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING */ +#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -1233,7 +1235,7 @@ out: pagefault_enable(); } -#endif /* CONFIG_PGTABLE_MAPPING */ +#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static int zs_cpu_prepare(unsigned int cpu) { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 78db58c7aec2..7e869284e052 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1095,16 +1095,14 @@ static int do_replace(struct net *net, const void __user *user, tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; - newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..66f22e8aa529 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,8 +190,7 @@ EXPORT_SYMBOL(ceph_compare_options); * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are * compatible with (a superset of) GFP_KERNEL. This is because while the * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take - * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * are always allocated with GFP_KERNEL. * * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. */ diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index a83553fbedf0..bea46ed157a6 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -143,7 +143,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, break; case SNDRV_DMA_TYPE_VMALLOC: gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM); - dmab->area = __vmalloc(size, gfp, PAGE_KERNEL); + dmab->area = __vmalloc(size, gfp); dmab->addr = 0; break; #ifdef CONFIG_HAS_DMA diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c index fcab37ea6641..860935e3aea4 100644 --- a/sound/core/pcm_memory.c +++ b/sound/core/pcm_memory.c @@ -460,7 +460,7 @@ int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream, return 0; /* already large enough */ vfree(runtime->dma_area); } - runtime->dma_area = __vmalloc(size, gfp_flags, PAGE_KERNEL); + runtime->dma_area = __vmalloc(size, gfp_flags); if (!runtime->dma_area) return -ENOMEM; runtime->dma_bytes = size;