diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 16afe3f59cbd..6fc2c2efe8ab 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -100,6 +100,17 @@ Description: This attribute indicates the mode that the irq vector named by the file is in (msi vs. msix) +What: /sys/bus/pci/devices/.../irq +Date: August 2021 +Contact: Linux PCI developers +Description: + If a driver has enabled MSI (not MSI-X), "irq" contains the + IRQ of the first MSI vector. Otherwise "irq" contains the + IRQ of the legacy INTx interrupt. + + "irq" being set to 0 indicates that the device isn't + capable of generating legacy INTx interrupts. + What: /sys/bus/pci/devices/.../remove Date: January 2009 Contact: Linux PCI developers diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 700329d25f57..3e11926a4df9 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -328,6 +328,14 @@ as idle:: From now on, any pages on zram are idle pages. The idle mark will be removed until someone requests access of the block. IOW, unless there is access request, those pages are still idle pages. +Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled pages can be +marked as idle based on how long (in seconds) it's been since they were +last accessed:: + + echo 86400 > /sys/block/zramX/idle + +In this example all pages which haven't been accessed in more than 86400 +seconds (one day) will be marked idle. Admin can request writeback of those idle pages at right timing via:: diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 41191b5fb69d..faac50149a22 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -87,10 +87,8 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes set/show hard limit for kernel memory - This knob is deprecated and shouldn't be - used. It is planned that this be removed in - the foreseeable future. + memory.kmem.limit_in_bytes This knob is deprecated and writing to + it will return -ENOTSUPP. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits @@ -518,11 +516,6 @@ will be charged as a new owner of it. charged file caches. Some out-of-use page caches may keep charged until memory pressure happens. If you want to avoid that, force_empty will be useful. - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - 5.2 stat file ------------- diff --git a/Documentation/admin-guide/filesystem-monitoring.rst b/Documentation/admin-guide/filesystem-monitoring.rst new file mode 100644 index 000000000000..ab8dba76283c --- /dev/null +++ b/Documentation/admin-guide/filesystem-monitoring.rst @@ -0,0 +1,78 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +==================================== +File system Monitoring with fanotify +==================================== + +File system Error Reporting +=========================== + +Fanotify supports the FAN_FS_ERROR event type for file system-wide error +reporting. It is meant to be used by file system health monitoring +daemons, which listen for these events and take actions (notify +sysadmin, start recovery) when a file system problem is detected. + +By design, a FAN_FS_ERROR notification exposes sufficient information +for a monitoring tool to know a problem in the file system has happened. +It doesn't necessarily provide a user space application with semantics +to verify an IO operation was successfully executed. That is out of +scope for this feature. Instead, it is only meant as a framework for +early file system problem detection and reporting, for use by recovery +tools. + +When a file system operation fails, it is common for dozens of kernel +errors to cascade after the initial failure, hiding the original failure +log, which is usually the most useful debug data to troubleshoot the +problem. For this reason, FAN_FS_ERROR tries to report only the first +error that occurred for a file system since the last notification, and +it simply counts additional errors. This ensures that the most +important pieces of information are never lost. + +FAN_FS_ERROR requires the fanotify group to be set up with the +FAN_REPORT_FID flag. + +At the time of this writing, the only file system that emits FAN_FS_ERROR +notifications is Ext4. + +A FAN_FS_ERROR notification has the following format:: + + [ Notification Metadata (Mandatory) ] + [ Generic Error Record (Mandatory) ] + [ FID record (Mandatory) ] + +The order of records is not guaranteed, and new records might be added +in the future. Therefore, applications must not rely on the order and +must be prepared to skip over unknown records. Please refer to +``samples/fanotify/fs-monitor.c`` for an example parser. + +Generic error record +-------------------- + +The generic error record provides enough information for a file +system-agnostic tool to learn about a problem in the file system, without +providing any additional details about the problem. This record is +identified by ``struct fanotify_event_info_header.info_type`` being set +to FAN_EVENT_INFO_TYPE_ERROR. + + :: + + struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; + }; + +The ``error`` field identifies the type of error using errno values. +``error_count`` tracks the number of errors that occurred and were +suppressed since the last notification, in order to preserve the +original error information. + +FID record +---------- + +The FID record can be used to uniquely identify the inode that triggered +the error through the combination of fsid and file handle. A file system +specific application can use that information to attempt a recovery +procedure. Errors that are not related to an inode are reported with an +empty file handle of type FILEID_INVALID.
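+To make the record walk described above concrete, below is a minimal
+listener sketch. It is a hedged illustration, not part of the ABI: it
+assumes Linux 5.16+ UAPI headers, root privileges, and a mount point
+passed as ``argv[1]`` (``/mnt`` is a placeholder), and it abbreviates
+error handling. ``samples/fanotify/fs-monitor.c`` remains the reference
+parser. ::
+
+  /* Minimal FAN_FS_ERROR listener (sketch). */
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <string.h>
+  #include <unistd.h>
+  #include <sys/fanotify.h>
+
+  int main(int argc, char *argv[])
+  {
+          char buf[4096];
+          const char *mount = argc > 1 ? argv[1] : "/mnt";
+          int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);
+
+          if (fd < 0 || fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
+                                      FAN_FS_ERROR, AT_FDCWD, mount) < 0) {
+                  perror("fanotify");
+                  return 1;
+          }
+
+          for (;;) {
+                  ssize_t len = read(fd, buf, sizeof(buf));
+                  struct fanotify_event_metadata *event;
+
+                  if (len <= 0)
+                          break;
+
+                  for (event = (struct fanotify_event_metadata *)buf;
+                       FAN_EVENT_OK(event, len);
+                       event = FAN_EVENT_NEXT(event, len)) {
+                          /* Info records follow the fixed-size metadata. */
+                          struct fanotify_event_info_header *info =
+                                  (struct fanotify_event_info_header *)(event + 1);
+                          char *end = (char *)event + event->event_len;
+
+                          while ((char *)info < end && info->len) {
+                                  if (info->info_type == FAN_EVENT_INFO_TYPE_ERROR) {
+                                          struct fanotify_event_info_error *err =
+                                                  (struct fanotify_event_info_error *)info;
+
+                                          /* error holds an errno-style value. */
+                                          printf("fs error %d (%s), %u so far\n",
+                                                 err->error, strerror(err->error),
+                                                 err->error_count);
+                                  }
+                                  /* Unknown records (e.g. the FID record) are skipped. */
+                                  info = (struct fanotify_event_info_header *)
+                                          ((char *)info + info->len);
+                          }
+                  }
+          }
+          return 0;
+  }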
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index dc00afcabb95..1bedab498104 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -82,6 +82,7 @@ configure specific aspects of kernel behavior to your liking. edid efi-stub ext4 + filesystem-monitoring nfs/index gpio/index highuid diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 43a74e0d46b2..a433eaa1ab66 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1582,8 +1582,10 @@ registers. Default set by CONFIG_HPET_MMAP_DEFAULT. hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation - of gigantic hugepages. - Format: nn[KMGTPE] + of gigantic hugepages. Or, using the node format, the size + of a CMA area per node can be specified. + Format: nn[KMGTPE] or (node format) + <node>:nn[KMGTPE][,<node>:nn[KMGTPE]] Reserve a CMA area of given size and allocate gigantic hugepages using the CMA allocator. If enabled, the @@ -1594,9 +1596,11 @@ the number of pages of hugepagesz to be allocated. If this is the first HugeTLB parameter on the command line, it specifies the number of pages to allocate for - the default huge page size. See also - Documentation/admin-guide/mm/hugetlbpage.rst. - Format: <integer> + the default huge page size. If using the node format, the + number of pages to allocate per-node can be specified. + See also Documentation/admin-guide/mm/hugetlbpage.rst. + Format: <integer> or (node format) + <node>:<integer>[,<node>:<integer>] hugepagesz= [HW] The size of the HugeTLB pages. This is used in @@ -4988,6 +4992,18 @@ an IOTLB flush. Default is lazy flushing before reuse, which is faster. + s390_iommu_aperture= [KNL,S390] + Specifies the size of the per device DMA address space + accessible through the DMA and IOMMU APIs as a decimal + factor of the size of main memory. + The default is 1, meaning that one can concurrently use + as many DMA addresses as there is physical memory installed, + if supported by hardware, and thus map all of memory + once. With a value of 2 one can map all of memory twice + and so on. As a special case, a factor of 0 imposes no + restrictions other than those given by hardware, at the + cost of significant additional memory use for tables. + sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 8c5dde3a5754..61aff88347f3 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -13,3 +13,4 @@ optimize those. start usage + reclaim diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst new file mode 100644 index 000000000000..fb9def3a7355 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -0,0 +1,235 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +DAMON-based Reclamation +======================= + +DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module that is +intended to be used for proactive and lightweight reclamation under light +memory pressure. It doesn't aim to replace the LRU-list based page-granularity +reclamation, but to be selectively used for different levels of memory pressure +and requirements. + +Where Is Proactive Reclamation Required? +======================================== + +On general memory over-committed systems, proactively reclaiming cold pages +helps to save memory and reduce latency spikes incurred by the direct reclaim +of processes or the CPU consumption of kswapd, while incurring only minimal +performance degradation [1]_ [2]_ . + +Free Pages Reporting [3]_ based memory over-commit virtualization systems are +a good example of such systems.
In such systems, the guest VMs report their free +memory to the host, and the host reallocates the reported memory to other guests. +As a result, the memory of the system is fully utilized. However, the +guests may not be so memory-frugal, mainly because some kernel subsystems and +user-space applications are designed to use as much memory as available. +Guests may then report only a small amount of memory as free to the host, +resulting in a drop in the memory utilization of the system. Running the +proactive reclamation in guests could mitigate this problem. + +How It Works +============ + +DAMON_RECLAIM finds memory regions that haven't been accessed for a specific time +duration and pages them out. To avoid consuming too much CPU for the paging out +operation, a speed limit can be configured. Under the speed limit, it pages +out the memory regions that have gone unaccessed the longest first. System +administrators can also configure under what conditions this scheme should be +automatically activated and deactivated, using three memory pressure watermarks. + +Interface: Module Parameters +============================ + +To use this feature, you should first ensure your system is running on a kernel +that is built with ``CONFIG_DAMON_RECLAIM=y``. + +To let sysadmins enable or disable it and tune it for the given system, +DAMON_RECLAIM utilizes module parameters. That is, you can put +``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write +proper values to ``/sys/module/damon_reclaim/parameters/<parameter>`` files. + +Note that the parameter values except ``enabled`` are applied only when +DAMON_RECLAIM starts. Therefore, if you want to apply new parameter values at +runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable +it via the ``enabled`` parameter file, as shown below. The new values should be +written to the proper parameter files before the re-enablement.
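+For example, changing ``min_age`` while DAMON_RECLAIM is already running would
+look like below; a sketch assuming the default parameter paths::
+
+  # cd /sys/module/damon_reclaim/parameters
+  # echo N > enabled
+  # echo 60000000 > min_age    # 60 seconds, in microseconds
+  # echo Y > enabled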
Below are the descriptions of each parameter. + +enabled +------- + +Enable or disable DAMON_RECLAIM. + +You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``. +Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do +no real monitoring and reclamation due to the watermarks-based activation +condition. Refer to the descriptions of the watermarks parameters below for +details. + +min_age +------- + +Time threshold for the identification of cold memory regions, in microseconds. + +If a memory region is not accessed for this amount of time or longer, +DAMON_RECLAIM identifies the region as cold, and reclaims it. + +120 seconds by default. + +quota_ms +-------- + +Limit of time for the reclamation in milliseconds. + +DAMON_RECLAIM tries to use only up to this time within a time window +(quota_reset_interval_ms) for trying reclamation of cold pages. This can be +used for limiting the CPU consumption of DAMON_RECLAIM. If the value is zero, +the limit is disabled. + +10 ms by default. + +quota_sz +-------- + +Limit of the size of memory for the reclamation in bytes. + +DAMON_RECLAIM charges the amount of memory which it tried to reclaim within a +time window (quota_reset_interval_ms) and ensures that no more than this limit +is tried. This can be used for limiting consumption of CPU and IO. If this +value is zero, the limit is disabled. + +128 MiB by default. + +quota_reset_interval_ms +----------------------- + +The time/size quota charge reset interval in milliseconds. + +The charge reset interval for the quota of time (quota_ms) and size +(quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than +quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms +milliseconds. + +1 second by default. + +wmarks_interval +--------------- + +Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is +enabled but inactive due to its watermarks rule. + +wmarks_high +----------- + +Free memory rate (per thousand) for the high watermark. + +If free memory of the system in bytes per thousand bytes is higher than this, +DAMON_RECLAIM becomes inactive, so it does nothing but periodically check +the watermarks. + +wmarks_mid +---------- + +Free memory rate (per thousand) for the middle watermark. + +If free memory of the system in bytes per thousand bytes is between this and +the low watermark, DAMON_RECLAIM becomes active, so it starts the monitoring and +the reclaiming. + +wmarks_low +---------- + +Free memory rate (per thousand) for the low watermark. + +If free memory of the system in bytes per thousand bytes is lower than this, +DAMON_RECLAIM becomes inactive, so it does nothing but periodically check the +watermarks. In that case, the system falls back to the LRU-list based +page-granularity reclamation logic. + +sample_interval +--------------- + +Sampling interval for the monitoring in microseconds. + +The sampling interval of DAMON for the cold memory monitoring. Please refer to +the DAMON documentation (:doc:`usage`) for more detail. + +aggr_interval +------------- + +Aggregation interval for the monitoring in microseconds. + +The aggregation interval of DAMON for the cold memory monitoring. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. + +min_nr_regions +-------------- + +Minimum number of monitoring regions. + +The minimal number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set a lower bound on the monitoring quality. +But setting this too high could result in increased monitoring overhead. +Please refer to the DAMON documentation (:doc:`usage`) for more detail. + +max_nr_regions +-------------- + +Maximum number of monitoring regions. + +The maximum number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set an upper bound on the monitoring overhead. +However, setting this too low could result in bad monitoring quality. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. + +monitor_region_start +-------------------- + +Start of the target memory region in physical address. + +The start physical address of the memory region that DAMON_RECLAIM will do work +against. That is, DAMON_RECLAIM will find cold memory regions in this region +and reclaim them. By default, the biggest System RAM region is used. + +monitor_region_end +------------------ + +End of the target memory region in physical address. + +The end physical address of the memory region that DAMON_RECLAIM will do work +against. That is, DAMON_RECLAIM will find cold memory regions in this region +and reclaim them. By default, the biggest System RAM region is used. + +kdamond_pid +----------- + +PID of the DAMON thread. + +If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, +-1 (a quick check is shown below).
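+As a quick check that the module is operating after enabling it — a sketch; the
+PID shown is illustrative, and ``-1`` would mean the worker thread is not
+running::
+
+  # cat /sys/module/damon_reclaim/parameters/enabled
+  Y
+  # cat /sys/module/damon_reclaim/parameters/kdamond_pid
+  4242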
Example +======= + +The below example commands make DAMON_RECLAIM find memory regions that have +not been accessed for 30 seconds or more and page them out. The reclamation is +limited to be done only up to 1 GiB per second to avoid DAMON_RECLAIM consuming +too much CPU time for the paging out operation. They also ask DAMON_RECLAIM to do +nothing if the system's free memory rate is more than 50%, but to start the real +work if it becomes lower than 40%. If DAMON_RECLAIM doesn't make progress and +therefore the free memory rate becomes lower than 20%, they ask DAMON_RECLAIM to +do nothing again, so that the system can fall back to the LRU-list based +page-granularity reclamation. :: + + # cd /sys/module/damon_reclaim/parameters + # echo 30000000 > min_age + # echo $((1 * 1024 * 1024 * 1024)) > quota_sz + # echo 1000 > quota_reset_interval_ms + # echo 500 > wmarks_high + # echo 400 > wmarks_mid + # echo 200 > wmarks_low + # echo Y > enabled + +.. [1] https://research.google/pubs/pub48551/ +.. [2] https://lwn.net/Articles/787611/ +.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index d5eb89a8fc38..4d5ca2c46288 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -6,39 +6,9 @@ Getting Started This document briefly describes how you can use DAMON by demonstrating its default user space tool. Please note that this document describes only a part -of its features for brevity. Please refer to :doc:`usage` for more details. - - -TL; DR -====== - -Follow the commands below to monitor and visualize the memory access pattern of -your workload. :: - - # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot - # mount -t debugfs none /sys/kernel/debug/ - # git clone https://github.com/awslabs/damo - # ./damo/damo record $(pidof <your workload name>) - # ./damo/damo report heat --plot_ascii - -The final command draws the access heatmap of ``<the workload name>``. The heatmap -shows which memory region (x-axis) is accessed when (y-axis) and how frequently -(number; the higher the more accesses have been observed). :: - - 111111111111111111111111111111111111111111111111111111110000 - 111121111111111111111111111111211111111111111111111111110000 - 000000000000000000000000000000000000000000000000001555552000 - 000000000000000000000000000000000000000000000222223555552000 - 000000000000000000000000000000000000000011111677775000000000 - 000000000000000000000000000000000000000488888000000000000000 - 000000000000000000000000000000000177888400000000000000000000 - 000000000000000000000000000046666522222100000000000000000000 - 000000000000000000000014444344444300000000000000000000000000 - 000000000000000002222245555510000000000000000000000000000000 - # access_frequency: 0 1 2 3 4 5 6 7 8 9 - # x-axis: space (140286319947776-140286426374096: 101.496 MiB) - # y-axis: time (605442256436361-605479951866441: 37.695430s) - # resolution: 60x10 (1.692 MiB and 3.770s for each character) +of its features for brevity. Please refer to the usage `doc +`_ of the tool for more +details. Prerequisites @@ -91,24 +61,74 @@ pattern in the ``damon.data`` file. Visualizing Recorded Patterns ============================= -The following three commands visualize the recorded access patterns and save -the results as separate image files. 
:: +You can visualize the pattern in a heatmap, showing which memory region +(x-axis) got accessed when (y-axis) and how frequently (number).:: - $ damo report heats --heatmap access_pattern_heatmap.png - $ damo report wss --range 0 101 1 --plot wss_dist.png - $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png + $ sudo damo report heats --heatmap stdout + 22222222222222222222222222222222222222211111111111111111111111111111111111111100 + 44444444444444444444444444444444444444434444444444444444444444444444444444443200 + 44444444444444444444444444444444444444433444444444444444444444444444444444444200 + 33333333333333333333333333333333333333344555555555555555555555555555555555555200 + 33333333333333333333333333333333333344444444444444444444444444444444444444444200 + 22222222222222222222222222222222222223355555555555555555555555555555555555555200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + 33333333333333333333333333333333333333355555555555555555555555555555555555555200 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 88888888888888888888888888888888888888600000000000000000000000000000000000000000 + 33333333333333333333333333333333333333444444444444444444444444444444444444443200 + 00000000000000000000000000000000000000288888888888888888888888888888888888888400 + [...] + # access_frequency: 0 1 2 3 4 5 6 7 8 9 + # x-axis: space (139728247021568-139728453431248: 196.848 MiB) + # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s) + # resolution: 80x40 (2.461 MiB and 1.758 s for each character) -- ``access_pattern_heatmap.png`` will visualize the data access pattern in a - heatmap, showing which memory region (y-axis) got accessed when (x-axis) - and how frequently (color). -- ``wss_dist.png`` will show the distribution of the working set size. -- ``wss_chron_change.png`` will show how the working set size has - chronologically changed. +You can also visualize the distribution of the working set size, sorted by the +size.:: -You can view the visualizations of this example workload at [1]_. -Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_. + $ sudo damo report wss --range 0 101 10 + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 0 B | | + 10 95.328 MiB |**************************** | + 20 95.332 MiB |**************************** | + 30 95.340 MiB |**************************** | + 40 95.387 MiB |**************************** | + 50 95.387 MiB |**************************** | + 60 95.398 MiB |**************************** | + 70 95.398 MiB |**************************** | + 80 95.504 MiB |**************************** | + 90 190.703 MiB |********************************************************* | + 100 196.875 MiB |***********************************************************| -.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns -.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html -.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html -.. 
[4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html +Using the ``--sortby`` option with the above command, you can show how the working +set size has chronologically changed.:: + + $ sudo damo report wss --range 0 101 10 --sortby time + # + # target_id 18446632103789443072 + # avr: 107.708 MiB + 0 3.051 MiB | | + 10 190.703 MiB |***********************************************************| + 20 95.336 MiB |***************************** | + 30 95.328 MiB |***************************** | + 40 95.387 MiB |***************************** | + 50 95.332 MiB |***************************** | + 60 95.320 MiB |***************************** | + 70 95.398 MiB |***************************** | + 80 95.398 MiB |***************************** | + 90 95.340 MiB |***************************** | + 100 95.398 MiB |***************************** | + + +Data Access Pattern Aware Memory Management +=========================================== + +The below three commands make every memory region of size >=4K that hasn't been +accessed for >=60 seconds in your workload be swapped out. :: + + $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme + $ echo "4K max 0 0 60s max pageout" >> test_scheme + $ damo schemes -c test_scheme diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index a72cda374aba..ed96bbf0daff 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -10,15 +10,16 @@ DAMON provides below three interfaces for different users. This is for privileged people such as system administrators who want a just-working human-friendly interface. Using this, users can use the DAMON’s major features in a human-friendly way. It may not be highly tuned for - special cases, though. It supports only virtual address spaces monitoring. + special cases, though. It supports monitoring of both virtual and physical + address spaces. - *debugfs interface.* This is for privileged user space programmers who want more optimized use of DAMON. Using this, users can use DAMON’s major features by reading from and writing to special debugfs files. Therefore, you can write and use your personalized DAMON debugfs wrapper programs that reads/writes the debugfs files instead of you. The DAMON user space tool is also a reference - implementation of such programs. It supports only virtual address spaces - monitoring. + implementation of such programs. It supports monitoring of both virtual and + physical address spaces. - *Kernel Space Programming Interface.* This is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space @@ -34,8 +35,9 @@ the reason, this document describes only the debugfs interface debugfs Interface ================= -DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under -its debugfs directory, ``<debugfs>/damon/``. +DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes`` and ``monitor_on`` under its debugfs directory, +``<debugfs>/damon/``. Attributes @@ -71,9 +73,106 @@ check it again:: # cat target_ids 42 4242 +Users can also monitor the physical memory address space of the system by +writing a special keyword, "``paddr\n``" to the file.
Because physical address +space monitoring doesn't support multiple targets, reading the file will show a +fake value, ``42``, as below:: + + # cd <debugfs>/damon + # echo paddr > target_ids + # cat target_ids + 42 + Note that setting the target ids doesn't start the monitoring. +Initial Monitoring Target Regions +--------------------------------- + +In the case of virtual address space monitoring, DAMON automatically sets and +updates the monitoring target regions so that the entire memory mappings of target +processes can be covered. However, users may want to limit the monitoring +region to specific address ranges, such as the heap, the stack, or specific +file-mapped areas. Or, some users may know the initial access pattern of their +workloads and therefore want to set optimal initial regions for the 'adaptive +regions adjustment'. + +In contrast, DAMON does not automatically set and update the monitoring target +regions in the case of physical memory monitoring. Therefore, users should set the +monitoring target regions by themselves. + +In such cases, users can explicitly set the initial monitoring target regions +as they want, by writing proper values to the ``init_regions`` file. Each line +of the input should represent one region in the below form.:: + + <target id> <start address> <end address> + +The ``target id`` should already be in the ``target_ids`` file, and the regions +should be passed in address order. For example, the below commands will set a +couple of address ranges, ``1-100`` and ``100-200`` as the initial monitoring +target regions of process 42, and another couple of address ranges, ``20-40`` and +``50-100`` as that of process 4242.:: + + # cd <debugfs>/damon + # echo "42 1 100 + 42 100 200 + 4242 20 40 + 4242 50 100" > init_regions + +Note that this sets the initial monitoring target regions only. In the case of +virtual memory monitoring, DAMON will automatically update the boundaries of the +regions after one ``regions update interval``. Therefore, users should set the +``regions update interval`` large enough in this case, if they don't want the +update. + + +Schemes +------- + +For usual DAMON-based data access aware memory management optimizations, users +would simply want the system to apply a memory management action to a memory +region of a specific size having a specific access frequency for a specific +time. DAMON receives such formalized operation schemes from the user and +applies those to the target processes. It also counts the total number and +size of regions that each scheme is applied to. These statistics can be used for +online analysis or tuning of the schemes. + +Users can get and set the schemes by reading from and writing to the ``schemes`` +debugfs file. Reading the file also shows the statistics of each scheme. To +the file, each scheme should be represented on its own line in the below form: + + min-size max-size min-acc max-acc min-age max-age action + +Note that the ranges are closed intervals. Bytes for the size of regions +(``min-size`` and ``max-size``), number of monitored accesses per aggregate +interval for access frequency (``min-acc`` and ``max-acc``), number of +aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a +predefined integer for memory management actions should be used. The supported +numbers and their meanings are as below; a worked example of composing a scheme +line follows the list. + + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` + - 1: Call ``madvise()`` for the region with ``MADV_COLD`` + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 5: Do nothing but count the statistics
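+To make the unit conversions concrete, here is a hedged sketch of composing a
+scheme line from wall-clock values; the ``attrs`` values shown are illustrative,
+not guaranteed, so check your own setup::
+
+  # cd <debugfs>/damon
+  # cat attrs
+  5000 100000 1000000 10 1000
+  # # The aggregation interval above is 100000 us (100 ms), so an age
+  # # threshold of 60 s is 60000 ms / 100 ms = 600 aggregation intervals.
+  # # Page out (action 2) 4 KiB - 16 KiB regions with no accesses that
+  # # have kept that pattern for 600 - 700 aggregation intervals:
+  # echo "4096 16384 0 0 600 700 2" > schemes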
+ +You can disable schemes by simply writing an empty string to the file. For +example, the below commands apply a scheme saying "If a memory region of size in +[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for an age in +[10, 20] aggregate intervals, page out the region", check the entered scheme again, and +finally remove the scheme. :: + + # cd <debugfs>/damon + # echo "4096 8192 0 5 10 20 2" > schemes + # cat schemes + 4096 8192 0 5 10 20 2 0 0 + # echo > schemes + +The last two integers in the 4th line of the above example are the total number and +the total size of the regions that the scheme was applied to. + + Turning On/Off -------------- diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 8abaeb144e44..0166f9de3428 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -128,7 +128,9 @@ hugepages implicitly specifies the number of huge pages of default size to allocate. If the number of huge pages of default size is implicitly specified, it can not be overwritten by a hugepagesz,hugepages - parameter pair for the default size. + parameter pair for the default size. This parameter also has a + node format. The node format specifies the number of huge pages + to allocate on specific nodes. For example, on an architecture with 2M default huge page size:: @@ -138,6 +140,14 @@ hugepages indicating that the hugepages=512 parameter is ignored. If a hugepages parameter is preceded by an invalid hugepagesz parameter, it will be ignored. + + Node format example:: + + hugepagesz=2M hugepages=0:1,1:2 + + This will allocate one 2M hugepage on node0 and two 2M hugepages + on node1. If the node number is invalid, the parameter will be + ignored. + default_hugepagesz Specify the default huge page size. This parameter can only be specified once on the command line. default_hugepagesz can @@ -234,8 +244,12 @@ will exist, of the form:: hugepages-${size}kB -Inside each of these directories, the same set of files will exist:: +Inside each of these directories, the set of files contained in ``/proc`` +will exist. In addition, two additional interfaces for demoting huge +pages may exist:: + demote + demote_size nr_hugepages nr_hugepages_mempolicy nr_overcommit_hugepages @@ -243,7 +257,29 @@ Inside each of these directories, the same set of files will exist:: resv_hugepages surplus_hugepages -which function as described above for the default huge page-sized case. +The demote interfaces provide the ability to split a huge page into +smaller huge pages. For example, the x86 architecture supports both +1GB and 2MB huge page sizes. A 1GB huge page can be split into 512 +2MB huge pages. Demote interfaces are not available for the smallest +huge page size. The demote interfaces are: + +demote_size + is the size of demoted pages. When a page is demoted, a corresponding + number of huge pages of demote_size will be created. By default, + demote_size is set to the next smaller huge page size. If there are + multiple smaller huge page sizes, demote_size can be set to any of + these smaller sizes. Only huge page sizes less than the current huge + page size are allowed. + +demote + is used to demote a number of huge pages. A user with root privileges + can write to this file. It may not be possible to demote the + requested number of huge pages. To determine how many pages were + actually demoted, compare the value of nr_hugepages before and after + writing to the demote interface (see the example below). demote is a + write-only interface.
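+For example, demoting one 1GB huge page into 2MB huge pages on x86 might look
+like the below transcript — a sketch; the counts shown are hypothetical::
+
+  # cd /sys/kernel/mm/hugepages
+  # cat hugepages-1048576kB/nr_hugepages
+  2
+  # cat hugepages-1048576kB/demote_size
+  2048kB
+  # echo 1 > hugepages-1048576kB/demote
+  # cat hugepages-1048576kB/nr_hugepages
+  1
+  # cat hugepages-2048kB/nr_hugepages
+  512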
+ +The interfaces which are the same as in ``/proc`` (all except demote and +demote_size) function as described above for the default huge page-sized case. .. _mem_policy_and_hp_alloc: diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index cbd19d5e625f..c21b5823f126 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -37,5 +37,7 @@ the Linux memory management. numaperf pagemap soft-dirty + swap_numa transhuge userfaultfd + zswap diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 03dfbc925252..0f56ecd8ac05 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -165,9 +165,8 @@ Or alternatively:: % echo 1 > /sys/devices/system/memory/memoryXXX/online -The kernel will select the target zone automatically, usually defaulting to -``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel -command line or if the memory block would intersect the ZONE_MOVABLE already. +The kernel will select the target zone automatically, depending on the +configured ``online_policy``. One can explicitly request to associate an offline memory block with ZONE_MOVABLE by:: @@ -198,6 +197,9 @@ Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or % echo online > /sys/devices/system/memory/auto_online_blocks +Similarly to manual onlining, with ``online`` the kernel will select the +target zone automatically, depending on the configured ``online_policy``. + Modifying the auto-online behavior will only affect all subsequently added memory blocks only. @@ -393,11 +395,16 @@ command line parameters are relevant: ======================== ======================================================= ``memhp_default_state`` configure auto-onlining by essentially setting ``/sys/devices/system/memory/auto_online_blocks``. -``movablecore`` configure automatic zone selection of the kernel. When - set, the kernel will default to ZONE_MOVABLE, unless - other zones can be kept contiguous. +``movable_node`` configure automatic zone selection in the kernel when + using the ``contig-zones`` online policy. When + set, the kernel will default to ZONE_MOVABLE when + onlining a memory block, unless other zones can be kept + contiguous. ======================== ======================================================= +See Documentation/admin-guide/kernel-parameters.txt for a more generic +description of these command line parameters. + Module Parameters ------------------ @@ -410,24 +417,118 @@ them with ``memory_hotplug.`` such as:: and they can be observed (and some even modified at runtime) via:: - /sys/modules/memory_hotplug/parameters/ + /sys/module/memory_hotplug/parameters/ The following module parameters are currently defined: -======================== ======================================================= -``memmap_on_memory`` read-write: Allocate memory for the memmap from the - added memory block itself. 
Even if enabled, actual - support depends on various other system properties and - should only be regarded as a hint whether the behavior - would be desired. +================================ =============================================== +``memmap_on_memory`` read-write: Allocate memory for the memmap from + the added memory block itself. Even if enabled, + actual support depends on various other system + properties and should only be regarded as a + hint whether the behavior would be desired. - While allocating the memmap from the memory block - itself makes memory hotplug less likely to fail and - keeps the memmap on the same NUMA node in any case, it - can fragment physical memory in a way that huge pages - in bigger granularity cannot be formed on hotplugged - memory. -======================== ======================================================= + While allocating the memmap from the memory + block itself makes memory hotplug less likely + to fail and keeps the memmap on the same NUMA + node in any case, it can fragment physical + memory in a way that huge pages in bigger + granularity cannot be formed on hotplugged + memory. +``online_policy`` read-write: Set the basic policy used for + automatic zone selection when onlining memory + blocks without specifying a target zone. + ``contig-zones`` has been the kernel default + before this parameter was added. After an + online policy was configured and memory was + online, the policy should not be changed + anymore. + + When set to ``contig-zones``, the kernel will + try keeping zones contiguous. If a memory block + intersects multiple zones or no zone, the + behavior depends on the ``movable_node`` kernel + command line parameter: default to ZONE_MOVABLE + if set, default to the applicable kernel zone + (usually ZONE_NORMAL) if not set. + + When set to ``auto-movable``, the kernel will + try onlining memory blocks to ZONE_MOVABLE if + possible according to the configuration and + memory device details. With this policy, one + can avoid zone imbalances when eventually + hotplugging a lot of memory later and still + wanting to be able to hotunplug as much as + possible reliably, very desirable in + virtualized environments. This policy ignores + the ``movable_node`` kernel command line + parameter and isn't really applicable in + environments that require it (e.g., bare metal + with hotunpluggable nodes) where hotplugged + memory might be exposed via the + firmware-provided memory map early during boot + to the system instead of getting detected, + added and onlined later during boot (such as + done by virtio-mem or by some hypervisors + implementing emulated DIMMs). As one example, a + hotplugged DIMM will be onlined either + completely to ZONE_MOVABLE or completely to + ZONE_NORMAL, not a mixture. + As another example, as many memory blocks + belonging to a virtio-mem device will be + onlined to ZONE_MOVABLE as possible, + special-casing units of memory blocks that can + only get hotunplugged together. *This policy + does not protect from setups that are + problematic with ZONE_MOVABLE and does not + change the zone of memory blocks dynamically + after they were onlined.* +``auto_movable_ratio`` read-write: Set the maximum MOVABLE:KERNEL + memory ratio in % for the ``auto-movable`` + online policy. Whether the ratio applies only + for the system across all NUMA nodes or also + per NUMA nodes depends on the + ``auto_movable_numa_aware`` configuration. 
+ + All accounting is based on present memory pages + in the zones combined with accounting per + memory device. Memory dedicated to the CMA + allocator is accounted as MOVABLE, although + residing on one of the kernel zones. The + possible ratio depends on the actual workload. + The kernel default is "301" %, for example, + allowing for hotplugging 24 GiB to a 8 GiB VM + and automatically onlining all hotplugged + memory to ZONE_MOVABLE in many setups. The + additional 1% deals with some pages being not + present, for example, because of some firmware + allocations. + + Note that ZONE_NORMAL memory provided by one + memory device does not allow for more + ZONE_MOVABLE memory for a different memory + device. As one example, onlining memory of a + hotplugged DIMM to ZONE_NORMAL will not allow + for another hotplugged DIMM to get onlined to + ZONE_MOVABLE automatically. In contrast, memory + hotplugged by a virtio-mem device that got + onlined to ZONE_NORMAL will allow for more + ZONE_MOVABLE memory within *the same* + virtio-mem device. +``auto_movable_numa_aware`` read-write: Configure whether the + ``auto_movable_ratio`` in the ``auto-movable`` + online policy also applies per NUMA + node in addition to the whole system across all + NUMA nodes. The kernel default is "Y". + + Disabling NUMA awareness can be helpful when + dealing with NUMA nodes that should be + completely hotunpluggable, onlining the memory + completely to ZONE_MOVABLE automatically if + possible. + + Parameter availability depends on CONFIG_NUMA. +================================ =============================================== ZONE_MOVABLE ============ diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 4581527c07ae..bfc28704856c 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -90,13 +90,14 @@ Short descriptions to the page flags ==================================== 0 - LOCKED - page is being locked for exclusive access, e.g. by undergoing read/write IO + The page is being locked for exclusive access, e.g. by undergoing read/write + IO. 7 - SLAB - page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator + The page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator. When compound page is used, SLUB/SLQB will only set this flag on the head page; SLOB will not flag it at all. 10 - BUDDY - a free memory block managed by the buddy system allocator + A free memory block managed by the buddy system allocator. The buddy system organizes free memory in blocks of various orders. An order N block has 2^N physically contiguous pages, with the BUDDY flag set for and _only_ for the first page. @@ -112,65 +113,65 @@ Short descriptions to the page flags 16 - COMPOUND_TAIL A compound page tail (see description above). 17 - HUGE - this is an integral part of a HugeTLB page + This is an integral part of a HugeTLB page. 19 - HWPOISON - hardware detected memory corruption on this page: don't touch the data! + Hardware detected memory corruption on this page: don't touch the data! 20 - NOPAGE - no page frame exists at the requested address + No page frame exists at the requested address. 21 - KSM - identical memory pages dynamically shared between one or more processes + Identical memory pages dynamically shared between one or more processes. 22 - THP - contiguous pages which construct transparent hugepages + Contiguous pages which construct transparent hugepages. 
23 - OFFLINE - page is logically offline + The page is logically offline. 24 - ZERO_PAGE - zero page for pfn_zero or huge_zero page + Zero page for pfn_zero or huge_zero page. 25 - IDLE - page has not been accessed since it was marked idle (see + The page has not been accessed since it was marked idle (see :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`). Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. 26 - PGTABLE - page is in use as a page table + The page is in use as a page table. IO related page flags --------------------- 1 - ERROR - IO error occurred + IO error occurred. 3 - UPTODATE - page has up-to-date data + The page has up-to-date data. i.e. for file backed page: (in-memory data revision >= on-disk one) 4 - DIRTY - page has been written to, hence contains new data + The page has been written to, hence contains new data. i.e. for file backed page: (in-memory data revision > on-disk one) 8 - WRITEBACK - page is being synced to disk + The page is being synced to disk. LRU related page flags ---------------------- 5 - LRU - page is in one of the LRU lists + The page is in one of the LRU lists. 6 - ACTIVE - page is in the active LRU list + The page is in the active LRU list. 18 - UNEVICTABLE - page is in the unevictable (non-)LRU list It is somehow pinned and + The page is in the unevictable (non-)LRU list. It is somehow pinned and not a candidate for LRU page reclaims, e.g. ramfs pages, - shmctl(SHM_LOCK) and mlock() memory segments + shmctl(SHM_LOCK) and mlock() memory segments. 2 - REFERENCED - page has been referenced since last LRU list enqueue/requeue + The page has been referenced since last LRU list enqueue/requeue. 9 - RECLAIM - page will be reclaimed soon after its pageout IO completed + The page will be reclaimed soon after its pageout IO completed. 11 - MMAP - a memory mapped page + A memory mapped page. 12 - ANON - a memory mapped page that is not part of a file + A memory mapped page that is not part of a file. 13 - SWAPCACHE - page is mapped to swap space, i.e. has an associated swap entry + The page is mapped to swap space, i.e. has an associated swap entry. 14 - SWAPBACKED - page is backed by swap/RAM + The page is backed by swap/RAM. The page-types tool in the tools/vm directory can be used to query the above flags. diff --git a/Documentation/vm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst similarity index 100% rename from Documentation/vm/swap_numa.rst rename to Documentation/admin-guide/mm/swap_numa.rst diff --git a/Documentation/vm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst similarity index 100% rename from Documentation/vm/zswap.rst rename to Documentation/admin-guide/mm/zswap.rst diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst index de7467e48067..682259ee633a 100644 --- a/Documentation/core-api/memory-hotplug.rst +++ b/Documentation/core-api/memory-hotplug.rst @@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify:: unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; - int status_change_nid_high; int status_change_nid; } @@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify:: - nr_pages is # of pages of online/offline memory. - status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask is (will be) set/clear, if this is -1, then nodemask status is not changed. 
-- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask - is (will be) set/clear, if this is -1, then nodemask status is not changed. - status_change_nid is set node id when N_MEMORY of nodemask is (will be) set/clear. It means a new(memoryless) node gets new memory by online and a node loses all memory. If this is -1, then nodemask status is not changed. diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 0fbe3308bf37..ac6b89d1a8c3 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -231,10 +231,14 @@ Guarded allocations are set up based on the sample interval. After expiration of the sample interval, the next allocation through the main allocator (SLAB or SLUB) returns a guarded allocation from the KFENCE object pool (allocation sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and -the next allocation is set up after the expiration of the interval. To "gate" a -KFENCE allocation through the main allocator's fast-path without overhead, -KFENCE relies on static branches via the static keys infrastructure. The static -branch is toggled to redirect the allocation to KFENCE. +the next allocation is set up after the expiration of the interval. + +When using ``CONFIG_KFENCE_STATIC_KEYS=y``, KFENCE allocations are "gated" +through the main allocator's fast-path by relying on static branches via the +static keys infrastructure. The static branch is toggled to redirect the +allocation to KFENCE. Depending on sample interval, target workloads, and +system architecture, this may perform better than the simple dynamic branch. +Careful benchmarking is recommended. KFENCE objects each reside on a dedicated page, at either the left or right page boundaries selected at random. The pages to the left and right of the @@ -269,6 +273,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused first, and the chances of detecting use-after-frees of recently freed objects is increased. +If pool utilization reaches 75% (default) or above, to reduce the risk of the +pool eventually being fully occupied by allocated objects yet ensure diverse +coverage of allocations, KFENCE limits currently covered allocations of the +same source from further filling up the pool. The "source" of an allocation is +based on its partial allocation stack trace. A side-effect is that this also +limits frequent long-lived allocations (e.g. pagecache) of the same source +filling up the pool permanently, which is the most common risk for the pool +becoming full and the sampled allocation rate dropping to zero. The threshold +at which to start limiting currently covered allocations can be configured via +the boot parameter ``kfence.skip_covered_thresh`` (pool usage%). + Interface --------- diff --git a/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml b/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml new file mode 100644 index 000000000000..044fa967bc8b --- /dev/null +++ b/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/mediatek,mt7621-pcie.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT7621 PCIe controller + +maintainers: + - Sergio Paracuellos + +description: |+ + MediaTek MT7621 PCIe subsys supports a single Root Complex (RC) + with 3 Root Ports. 
Each Root Port supports a Gen1 1-lane Link + +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + +properties: + compatible: + const: mediatek,mt7621-pci + + reg: + items: + - description: host-pci bridge registers + - description: pcie port 0 RC control registers + - description: pcie port 1 RC control registers + - description: pcie port 2 RC control registers + + ranges: + maxItems: 2 + +patternProperties: + 'pcie@[0-2],0': + type: object + $ref: /schemas/pci/pci-bus.yaml# + + properties: + resets: + maxItems: 1 + + clocks: + maxItems: 1 + + phys: + maxItems: 1 + + required: + - "#interrupt-cells" + - interrupt-map-mask + - interrupt-map + - resets + - clocks + - phys + - phy-names + - ranges + + unevaluatedProperties: false + +required: + - compatible + - reg + - ranges + - "#interrupt-cells" + - interrupt-map-mask + - interrupt-map + - reset-gpios + +unevaluatedProperties: false + +examples: + - | + #include + #include + + pcie: pcie@1e140000 { + compatible = "mediatek,mt7621-pci"; + reg = <0x1e140000 0x100>, + <0x1e142000 0x100>, + <0x1e143000 0x100>, + <0x1e144000 0x100>; + + #address-cells = <3>; + #size-cells = <2>; + pinctrl-names = "default"; + pinctrl-0 = <&pcie_pins>; + device_type = "pci"; + ranges = <0x02000000 0 0x60000000 0x60000000 0 0x10000000>, /* pci memory */ + <0x01000000 0 0x1e160000 0x1e160000 0 0x00010000>; /* io space */ + #interrupt-cells = <1>; + interrupt-map-mask = <0xF800 0 0 0>; + interrupt-map = <0x0000 0 0 0 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>, + <0x0800 0 0 0 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>, + <0x1000 0 0 0 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>; + reset-gpios = <&gpio 19 GPIO_ACTIVE_LOW>; + + pcie@0,0 { + reg = <0x0000 0 0 0 0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>; + resets = <&rstctrl 24>; + clocks = <&clkctrl 24>; + phys = <&pcie0_phy 1>; + phy-names = "pcie-phy0"; + ranges; + }; + + pcie@1,0 { + reg = <0x0800 0 0 0 0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>; + resets = <&rstctrl 25>; + clocks = <&clkctrl 25>; + phys = <&pcie0_phy 1>; + phy-names = "pcie-phy1"; + ranges; + }; + + pcie@2,0 { + reg = <0x1000 0 0 0 0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>; + resets = <&rstctrl 26>; + clocks = <&clkctrl 26>; + phys = <&pcie2_phy 0>; + phy-names = "pcie-phy2"; + ranges; + }; + }; +... 
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml new file mode 100644 index 000000000000..3d23599e5e91 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/qcom,pcie-ep.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm PCIe Endpoint Controller binding + +maintainers: + - Manivannan Sadhasivam + +allOf: + - $ref: "pci-ep.yaml#" + +properties: + compatible: + const: qcom,sdx55-pcie-ep + + reg: + items: + - description: Qualcomm-specific PARF configuration registers + - description: DesignWare PCIe registers + - description: External local bus interface registers + - description: Address Translation Unit (ATU) registers + - description: Memory region used to map remote RC address space + - description: BAR memory region + + reg-names: + items: + - const: parf + - const: dbi + - const: elbi + - const: atu + - const: addr_space + - const: mmio + + clocks: + items: + - description: PCIe Auxiliary clock + - description: PCIe CFG AHB clock + - description: PCIe Master AXI clock + - description: PCIe Slave AXI clock + - description: PCIe Slave Q2A AXI clock + - description: PCIe Sleep clock + - description: PCIe Reference clock + + clock-names: + items: + - const: aux + - const: cfg + - const: bus_master + - const: bus_slave + - const: slave_q2a + - const: sleep + - const: ref + + qcom,perst-regs: + description: Reference to a syscon representing TCSR followed by the two + offsets within syscon for Perst enable and Perst separation + enable registers + $ref: "/schemas/types.yaml#/definitions/phandle-array" + items: + minItems: 3 + maxItems: 3 + + interrupts: + items: + - description: PCIe Global interrupt + - description: PCIe Doorbell interrupt + + interrupt-names: + items: + - const: global + - const: doorbell + + reset-gpios: + description: GPIO used as PERST# input signal + maxItems: 1 + + wake-gpios: + description: GPIO used as WAKE# output signal + maxItems: 1 + + resets: + maxItems: 1 + + reset-names: + const: core + + power-domains: + maxItems: 1 + + phys: + maxItems: 1 + + phy-names: + const: pciephy + + num-lanes: + default: 2 + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - qcom,perst-regs + - interrupts + - interrupt-names + - reset-gpios + - resets + - reset-names + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + #include + pcie_ep: pcie-ep@40000000 { + compatible = "qcom,sdx55-pcie-ep"; + reg = <0x01c00000 0x3000>, + <0x40000000 0xf1d>, + <0x40000f20 0xc8>, + <0x40001000 0x1000>, + <0x40002000 0x1000>, + <0x01c03000 0x3000>; + reg-names = "parf", "dbi", "elbi", "atu", "addr_space", + "mmio"; + + clocks = <&gcc GCC_PCIE_AUX_CLK>, + <&gcc GCC_PCIE_CFG_AHB_CLK>, + <&gcc GCC_PCIE_MSTR_AXI_CLK>, + <&gcc GCC_PCIE_SLV_AXI_CLK>, + <&gcc GCC_PCIE_SLV_Q2A_AXI_CLK>, + <&gcc GCC_PCIE_SLEEP_CLK>, + <&gcc GCC_PCIE_0_CLKREF_CLK>; + clock-names = "aux", "cfg", "bus_master", "bus_slave", + "slave_q2a", "sleep", "ref"; + + qcom,perst-regs = <&tcsr 0xb258 0xb270>; + + interrupts = , + ; + interrupt-names = "global", "doorbell"; + reset-gpios = <&tlmm 57 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 53 GPIO_ACTIVE_LOW>; + resets = <&gcc GCC_PCIE_BCR>; + reset-names = "core"; + power-domains = <&gcc PCIE_GDSC>; + phys = <&pcie0_lane>; + phy-names = "pciephy"; + max-link-speed = 
<3>; + num-lanes = <2>; + }; diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt index 3f646875f8c2..a0ae024c2d0c 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt @@ -12,6 +12,7 @@ - "qcom,pcie-ipq4019" for ipq4019 - "qcom,pcie-ipq8074" for ipq8074 - "qcom,pcie-qcs404" for qcs404 + - "qcom,pcie-sc8180x" for sc8180x - "qcom,pcie-sdm845" for sdm845 - "qcom,pcie-sm8250" for sm8250 - "qcom,pcie-ipq6018" for ipq6018 @@ -156,7 +157,7 @@ - "pipe" PIPE clock - clock-names: - Usage: required for sm8250 + Usage: required for sc8180x and sm8250 Value type: Definition: Should contain the following entries - "aux" Auxiliary clock @@ -245,7 +246,7 @@ - "ahb" AHB reset - reset-names: - Usage: required for sdm845 and sm8250 + Usage: required for sc8180x, sdm845 and sm8250 Value type: Definition: Should contain the following entries - "pci" PCIe core reset diff --git a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml new file mode 100644 index 000000000000..142bbe577763 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/rockchip-dw-pcie.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: DesignWare based PCIe controller on Rockchip SoCs + +maintainers: + - Shawn Lin + - Simon Xue + - Heiko Stuebner + +description: |+ + RK3568 SoC PCIe host controller is based on the Synopsys DesignWare + PCIe IP and thus inherits all the common properties defined in + designware-pcie.txt. + +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + +# We need a select here so we don't match all nodes with 'snps,dw-pcie' +select: + properties: + compatible: + contains: + const: rockchip,rk3568-pcie + required: + - compatible + +properties: + compatible: + items: + - const: rockchip,rk3568-pcie + - const: snps,dw-pcie + + reg: + items: + - description: Data Bus Interface (DBI) registers + - description: Rockchip designed configuration registers + - description: Config registers + + reg-names: + items: + - const: dbi + - const: apb + - const: config + + clocks: + items: + - description: AHB clock for PCIe master + - description: AHB clock for PCIe slave + - description: AHB clock for PCIe dbi + - description: APB clock for PCIe + - description: Auxiliary clock for PCIe + + clock-names: + items: + - const: aclk_mst + - const: aclk_slv + - const: aclk_dbi + - const: pclk + - const: aux + + msi-map: true + + num-lanes: true + + phys: + maxItems: 1 + + phy-names: + const: pcie-phy + + power-domains: + maxItems: 1 + + ranges: + maxItems: 2 + + resets: + maxItems: 1 + + reset-names: + const: pipe + + vpcie3v3-supply: true + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - msi-map + - num-lanes + - phys + - phy-names + - power-domains + - resets + - reset-names + +unevaluatedProperties: false + +examples: + - | + + bus { + #address-cells = <2>; + #size-cells = <2>; + + pcie3x2: pcie@fe280000 { + compatible = "rockchip,rk3568-pcie", "snps,dw-pcie"; + reg = <0x3 0xc0800000 0x0 0x390000>, + <0x0 0xfe280000 0x0 0x10000>, + <0x3 0x80000000 0x0 0x100000>; + reg-names = "dbi", "apb", "config"; + bus-range = <0x20 0x2f>; + clocks = <&cru 143>, <&cru 144>, + <&cru 145>, <&cru 146>, + <&cru 147>; + clock-names = "aclk_mst", "aclk_slv", 
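The "We need a select here" comment above is worth unpacking: the node's compatible list pairs the SoC-specific string with the generic "snps,dw-pcie" fallback, and the OF core scores matches by position in the node's compatible list, so a driver that names the specific string wins over one that only knows the fallback. Without the explicit select:, the schema itself would likewise be applied to every node carrying the fallback string. A hypothetical match table for the SoC driver (not the in-tree file) only needs the specific entry::

        #include <linux/mod_devicetable.h>
        #include <linux/module.h>

        /*
         * With compatible = "rockchip,rk3568-pcie", "snps,dw-pcie" in the
         * DT, this table matches at position 0 and is preferred over a
         * generic DesignWare driver matching only "snps,dw-pcie".
         */
        static const struct of_device_id rk_pcie_demo_of_match[] = {
                { .compatible = "rockchip,rk3568-pcie" },
                { /* sentinel */ }
        };
        MODULE_DEVICE_TABLE(of, rk_pcie_demo_of_match);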
+ "aclk_dbi", "pclk", + "aux"; + device_type = "pci"; + linux,pci-domain = <2>; + max-link-speed = <2>; + msi-map = <0x2000 &its 0x2000 0x1000>; + num-lanes = <2>; + phys = <&pcie30phy>; + phy-names = "pcie-phy"; + power-domains = <&power 15>; + ranges = <0x81000000 0x0 0x80800000 0x3 0x80800000 0x0 0x100000>, + <0x83000000 0x0 0x80900000 0x3 0x80900000 0x0 0x3f700000>; + resets = <&cru 193>; + reset-names = "pipe"; + #address-cells = <3>; + #size-cells = <2>; + }; + }; +... diff --git a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst index 0750d9442477..9b2841fb9a5f 100644 --- a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst +++ b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst @@ -63,7 +63,6 @@ memory_notify结构体的指针:: unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; - int status_change_nid_high; int status_change_nid; } @@ -74,9 +73,6 @@ memory_notify结构体的指针:: - status_change_nid_normal是当nodemask的N_NORMAL_MEMORY被设置/清除时设置节 点id,如果是-1,则nodemask状态不改变。 -- status_change_nid_high是当nodemask的N_HIGH_MEMORY被设置/清除时设置的节点 - id,如果这个值为-1,那么nodemask状态不会改变。 - - status_change_nid是当nodemask的N_MEMORY被(将)设置/清除时设置的节点id。这 意味着一个新的(没上线的)节点通过联机获得新的内存,而一个节点失去了所有的内 存。如果这个值为-1,那么nodemask的状态就不会改变。 diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst index b05159c295f4..210f0f50efd8 100644 --- a/Documentation/vm/damon/design.rst +++ b/Documentation/vm/damon/design.rst @@ -35,13 +35,17 @@ two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. -DAMON currently provides the implementation of the primitives for only the -virtual address spaces. Below two subsections describe how it works. +DAMON currently provides the implementations of the primitives for the physical +and virtual address spaces. Below two subsections describe how those work. VMA-based Target Address Range Construction ------------------------------------------- +This is only for the virtual address space primitives implementation. That for +the physical address space simply asks users to manually set the monitoring +target address ranges. + Only small parts in the super-huge virtual address space of the processes are mapped to the physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some @@ -71,15 +75,18 @@ to make a reasonable trade-off. Below shows this in detail:: PTE Accessed-bit Based Access Check ----------------------------------- -The implementation for the virtual address space uses PTE Accessed-bit for -basic access checks. It finds the relevant PTE Accessed bit from the address -by walking the page table for the target task of the address. In this way, the -implementation finds and clears the bit for next sampling target address and -checks whether the bit set again after one sampling period. This could disturb -other kernel subsystems using the Accessed bits, namely Idle page tracking and -the reclaim logic. To avoid such disturbances, DAMON makes it mutually -exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page -flags to solve the conflict with the reclaim logic, as Idle page tracking does. +Both of the implementations for physical and virtual address spaces use PTE +Accessed-bit for basic access checks. 
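Before the virtual/physical distinction is drawn below, the core cycle just described (clear the Accessed bit, wait one sampling period, test whether it came back) can be modelled with a toy user-space stand-in; real DAMON walks page tables and arbitrates with ``PG_idle``/``PG_young``, none of which is shown here::

        #include <stdbool.h>
        #include <stdio.h>

        static bool pte_accessed;               /* stands in for the PTE A-bit */

        static bool test_and_clear_accessed(void)
        {
                bool old = pte_accessed;

                pte_accessed = false;           /* clearing arms the next check */
                return old;
        }

        int main(void)
        {
                unsigned int nr_accesses = 0;

                test_and_clear_accessed();      /* arm the check */
                pte_accessed = true;            /* pretend the CPU touched the
                                                 * page during the period */
                if (test_and_clear_accessed())
                        nr_accesses++;          /* set again: page was accessed */
                printf("nr_accesses=%u\n", nr_accesses);
                return 0;
        }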
Only one difference is the way of +finding the relevant PTE Accessed bit(s) from the address. While the +implementation for the virtual address walks the page table for the target task +of the address, the implementation for the physical address walks every page +table having a mapping to the address. In this way, the implementations find +and clear the bit(s) for next sampling target address and checks whether the +bit(s) set again after one sampling period. This could disturb other kernel +subsystems using the Accessed bits, namely Idle page tracking and the reclaim +logic. To avoid such disturbances, DAMON makes it mutually exclusive with Idle +page tracking and uses ``PG_idle`` and ``PG_young`` page flags to solve the +conflict with the reclaim logic, as Idle page tracking does. Address Space Independent Core Mechanisms diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst index cb3d8b585a8b..11aea40eb328 100644 --- a/Documentation/vm/damon/faq.rst +++ b/Documentation/vm/damon/faq.rst @@ -36,10 +36,9 @@ constructions and actual access checks can be implemented and configured on the DAMON core by the users. In this way, DAMON users can monitor any address space with any access check technique. -Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based +Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based implementations of the address space dependent functions for the virtual memory -by default, for a reference and convenient use. In near future, we will -provide those for physical memory address space. +and the physical memory by default, for a reference and convenient use. Can I simply monitor page granularity? diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst index a2858baf3bf1..48c0bbff98b2 100644 --- a/Documentation/vm/damon/index.rst +++ b/Documentation/vm/damon/index.rst @@ -27,4 +27,3 @@ workloads and systems. faq design api - plans diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index b51f0d8992f8..6f5ffef4b716 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -3,27 +3,11 @@ Linux Memory Management Documentation ===================================== This is a collection of documents about the Linux memory management (mm) -subsystem. If you are looking for advice on simply allocating memory, -see the :ref:`memory_allocation`. - -User guides for MM features -=========================== - -The following documents provide guides for controlling and tuning -various features of the Linux memory management - -.. toctree:: - :maxdepth: 1 - - swap_numa - zswap - -Kernel developers MM documentation -================================== - -The below documents describe MM internals with different level of -details ranging from notes and mailing list responses to elaborate -descriptions of data structures and algorithms. +subsystem internals with different level of details ranging from notes and +mailing list responses for elaborating descriptions of data structures and +algorithms. If you are looking for advice on simply allocating memory, see the +:ref:`memory_allocation`. For controlling and tuning guides, see the +:doc:`admin guide <../admin-guide/mm/index>`. .. 
toctree:: :maxdepth: 1 diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 2175465c9bf2..9837fc8147dd 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -85,5 +85,26 @@ Usage cat /sys/kernel/debug/page_owner > page_owner_full.txt ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + The general output of ``page_owner_full.txt`` is as follows: + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + Page allocated via order XXX, ... + PFN XXX ... + // Detailed stack + + The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows + in buf, uses regexp to extract the page order value, counts the times + and pages of buf, and finally sorts them according to the times. + See the result about who allocated each page + in the ``sorted_page_owner.txt``. General output: + + XXX times, XXX pages: + Page allocated via order XXX, ... + // Detailed stack + + By default, ``page_owner_sort`` is sorted according to the times of buf. + If you want to sort by the pages nums of buf, use the ``-m`` parameter. diff --git a/MAINTAINERS b/MAINTAINERS index 74158b271cb7..edf76bbafe16 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1297,6 +1297,13 @@ S: Maintained F: Documentation/devicetree/bindings/iommu/apple,dart.yaml F: drivers/iommu/apple-dart.c +APPLE PCIE CONTROLLER DRIVER +M: Alyssa Rosenzweig +M: Marc Zyngier +L: linux-pci@vger.kernel.org +S: Maintained +F: drivers/pci/controller/pcie-apple.c + APPLE SMC DRIVER M: Henrik Rydberg L: linux-hwmon@vger.kernel.org @@ -5220,7 +5227,7 @@ F: net/ax25/ax25_timer.c F: net/ax25/sysctl_net_ax25.c DATA ACCESS MONITOR -M: SeongJae Park +M: SeongJae Park L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/damon/ @@ -12005,6 +12012,12 @@ S: Maintained F: Documentation/devicetree/bindings/i2c/i2c-mt7621.txt F: drivers/i2c/busses/i2c-mt7621.c +MEDIATEK MT7621 PCIE CONTROLLER DRIVER +M: Sergio Paracuellos +S: Maintained +F: Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml +F: drivers/pci/controller/pcie-mt7621.c + MEDIATEK MT7621 PHY PCI DRIVER M: Sergio Paracuellos S: Maintained @@ -14647,9 +14660,12 @@ M: Lorenzo Pieralisi R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org S: Supported +Q: https://patchwork.kernel.org/project/linux-pci/list/ +B: https://bugzilla.kernel.org +C: irc://irc.oftc.net/linux-pci +T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git F: Documentation/PCI/endpoint/* F: Documentation/misc-devices/pci-endpoint-test.rst -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kishon/pci-endpoint.git F: drivers/misc/pci_endpoint_test.c F: drivers/pci/endpoint/ F: tools/pci/ @@ -14695,15 +14711,21 @@ R: Rob Herring R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org S: Supported -Q: http://patchwork.ozlabs.org/project/linux-pci/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ +Q: https://patchwork.kernel.org/project/linux-pci/list/ +B: https://bugzilla.kernel.org +C: irc://irc.oftc.net/linux-pci +T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git F: drivers/pci/controller/ +F: drivers/pci/pci-bridge-emul.c +F: drivers/pci/pci-bridge-emul.h PCI SUBSYSTEM M: Bjorn Helgaas L: linux-pci@vger.kernel.org S: Supported -Q: http://patchwork.ozlabs.org/project/linux-pci/list/ +Q: https://patchwork.kernel.org/project/linux-pci/list/ +B: https://bugzilla.kernel.org +C: irc://irc.oftc.net/linux-pci T: git
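A toy version of the aggregation flow described in the page_owner hunk above (the in-tree ``tools/vm/page_owner_sort.c`` is the reference; this sketch only shows the ignore-PFN, count-duplicates, sort-by-times steps, and drops a trailing record that lacks a blank line after it)::

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        struct rec { char *txt; int times; };

        static int by_times(const void *a, const void *b)
        {
                return ((const struct rec *)b)->times -
                       ((const struct rec *)a)->times;
        }

        int main(void)
        {
                static struct rec recs[1024];
                char line[512], buf[4096] = "";
                int i, n = 0;

                while (fgets(line, sizeof(line), stdin)) {
                        if (!strncmp(line, "PFN", 3))
                                continue;               /* PFN rows are ignored */
                        if (line[0] != '\n') {          /* accumulate one record */
                                strncat(buf, line, sizeof(buf) - strlen(buf) - 1);
                                continue;
                        }
                        if (!buf[0])
                                continue;               /* blank run between records */
                        for (i = 0; i < n && strcmp(recs[i].txt, buf); i++)
                                ;
                        if (i < n)
                                recs[i].times++;        /* duplicate stack: count it */
                        else if (n < 1024) {
                                recs[n].txt = strdup(buf);
                                recs[n++].times = 1;
                        }
                        buf[0] = '\0';
                }
                qsort(recs, n, sizeof(*recs), by_times);
                for (i = 0; i < n; i++)
                        printf("%d times:\n%s\n", recs[i].times, recs[i].txt);
                return 0;
        }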
git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git F: Documentation/PCI/ F: Documentation/devicetree/bindings/pci/ @@ -14803,7 +14825,15 @@ M: Stanimir Varbanov L: linux-pci@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained -F: drivers/pci/controller/dwc/*qcom* +F: drivers/pci/controller/dwc/pcie-qcom.c + +PCIE ENDPOINT DRIVER FOR QUALCOMM +M: Manivannan Sadhasivam +L: linux-pci@vger.kernel.org +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml +F: drivers/pci/controller/dwc/pcie-qcom-ep.c PCIE DRIVER FOR ROCKCHIP M: Shawn Lin diff --git a/Makefile b/Makefile index 375d9b1611eb..2b45a326661c 100644 --- a/Makefile +++ b/Makefile @@ -1015,6 +1015,21 @@ ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += -Wno-maybe-uninitialized endif +ifdef CONFIG_CC_IS_GCC +# The allocators already balk at large sizes, so silence the compiler +# warnings for bounds checks involving those possible values. While +# -Wno-alloc-size-larger-than would normally be used here, earlier versions +# of gcc (<9.1) weirdly don't handle the option correctly when _other_ +# warnings are produced (?!). Using -Walloc-size-larger-than=SIZE_MAX +# doesn't work (as it is documented to), silently resolving to "0" prior to +# version 9.1 (and producing an error more recently). Numeric values larger +# than PTRDIFF_MAX also don't work prior to version 9.1, which are silently +# ignored, continuing to default to PTRDIFF_MAX. So, left with no other +# choice, we must perform a versioned check to disable this warning. +# https://lore.kernel.org/lkml/20210824115859.187f272f@canb.auug.org.au +KBUILD_CFLAGS += $(call cc-ifversion, -ge, 0901, -Wno-alloc-size-larger-than) +endif + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index 72af1e72d833..6b8ed12936b6 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -233,7 +233,7 @@ albacore_init_arch(void) unsigned long size; size = initrd_end - initrd_start; - memblock_free(__pa(initrd_start), PAGE_ALIGN(size)); + memblock_free((void *)initrd_start, PAGE_ALIGN(size)); if (!move_initrd(pci_mem)) printk("irongate_init_arch: initrd too big " "(%ldK)\ndisabling initrd\n", diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 699ecf119641..ce4e939a7f07 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -59,13 +59,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) low_mem_sz = size; in_use = 1; - memblock_add_node(base, size, 0); + memblock_add_node(base, size, 0, MEMBLOCK_NONE); } else { #ifdef CONFIG_HIGHMEM high_mem_start = base; high_mem_sz = size; in_use = 1; - memblock_add_node(base, size, 1); + memblock_add_node(base, size, 1, MEMBLOCK_NONE); memblock_reserve(base, size); #endif } @@ -173,7 +173,7 @@ static void __init highmem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; - memblock_free(high_mem_start, high_mem_sz); + memblock_phys_free(high_mem_start, high_mem_sz); for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++) free_highmem_page(pfn_to_page(tmp)); #endif diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c index 96a484095194..258586e31333 100644 --- a/arch/arm/mach-hisi/platmcpm.c +++ b/arch/arm/mach-hisi/platmcpm.c @@ -339,7 +339,7 @@ err_fabric: err_sysctrl: iounmap(relocation); err_reloc: - memblock_free(hip04_boot_method[0], hip04_boot_method[1]); + 
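Most of the architecture churn that follows obeys one rule from the memblock rework in this series: ``memblock_free()`` now takes a virtual pointer, while the old physical-address signature lives on as ``memblock_phys_free()``. A minimal illustration of the pairing, assuming the post-rename API::

        #include <linux/memblock.h>
        #include <linux/mm.h>

        static void __init memblock_free_demo(void)
        {
                void *virt = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
                phys_addr_t phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

                if (virt)                       /* virtual pointer in, no __pa() */
                        memblock_free(virt, PAGE_SIZE);
                if (phys)                       /* physical address in */
                        memblock_phys_free(phys, PAGE_SIZE);
        }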
memblock_phys_free(hip04_boot_method[0], hip04_boot_method[1]); err: return ret; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 6162a070a410..6d0cb0f7bc54 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -158,7 +158,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) panic("Failed to steal %pa bytes at %pS\n", &size, (void *)_RET_IP_); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); return phys; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 04a2aaf843c6..9e440657f15e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1163,6 +1163,10 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK def_bool y depends on NUMA +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + depends on NUMA + source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 61b52a92b8b6..5b996ca4d996 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -287,6 +287,22 @@ static void __init kasan_init_depth(void) init_task.kasan_depth = 0; } +#ifdef CONFIG_KASAN_VMALLOC +void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size) +{ + unsigned long shadow_start, shadow_end; + + if (!is_vmalloc_or_module_addr(start)) + return; + + shadow_start = (unsigned long)kasan_mem_to_shadow(start); + shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow(start + size); + shadow_end = ALIGN(shadow_end, PAGE_SIZE); + kasan_map_populate(shadow_start, shadow_end, NUMA_NO_NODE); +} +#endif + void __init kasan_init(void) { kasan_init_shadow(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fd85b51b9d50..d77bf06d6a6d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -738,8 +738,8 @@ void __init paging_init(void) cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); init_mm.pgd = swapper_pg_dir; - memblock_free(__pa_symbol(init_pg_dir), - __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); + memblock_phys_free(__pa_symbol(init_pg_dir), + __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); memblock_allow_resize(); } diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 42e025cfbd08..24901d809301 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -153,7 +153,7 @@ find_memory (void) efi_memmap_walk(find_max_min_low_pfn, NULL); max_pfn = max_low_pfn; - memblock_add_node(0, PFN_PHYS(max_low_pfn), 0); + memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE); find_initrd(); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 5c6da8d83c1a..5d165607bf35 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -378,7 +378,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid) #endif if (start < end) - memblock_add_node(__pa(start), end - start, nid); + memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE); return 0; } diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index eac9dde65193..6f1f25125294 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -174,7 +174,8 @@ void __init cf_bootmem_alloc(void) m68k_memory[0].addr = _rambase; m68k_memory[0].size = _ramend - _rambase; - memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0, + MEMBLOCK_NONE); /* compute total pages in system */ num_pages = PFN_DOWN(_ramend - _rambase); diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 
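The other recurring conversion is the new fourth argument of ``memblock_add_node()``: callers now pass region flags explicitly, ``MEMBLOCK_NONE`` for ordinary RAM. A sketch of a hypothetical caller making use of the flags::

        #include <linux/init.h>
        #include <linux/memblock.h>

        static void __init register_ram_demo(phys_addr_t base, phys_addr_t size,
                                             int nid, bool hotpluggable)
        {
                /*
                 * MEMBLOCK_NONE for plain RAM; a range that must remain
                 * offlinable can be tagged MEMBLOCK_HOTPLUG instead.
                 */
                memblock_add_node(base, size, nid,
                                  hotpluggable ? MEMBLOCK_HOTPLUG : MEMBLOCK_NONE);
        }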
9f3f77785aa7..2b05bb2bac00 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -410,7 +410,8 @@ void __init paging_init(void) min_addr = m68k_memory[0].addr; max_addr = min_addr + m68k_memory[0].size; - memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0); + memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0, + MEMBLOCK_NONE); for (i = 1; i < m68k_num_memory;) { if (m68k_memory[i].addr < min_addr) { printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n", @@ -421,7 +422,8 @@ void __init paging_init(void) (m68k_num_memory - i) * sizeof(struct m68k_mem_info)); continue; } - memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i); + memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i, + MEMBLOCK_NONE); addr = m68k_memory[i].addr + m68k_memory[i].size; if (addr > max_addr) max_addr = addr; diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 557585f1be41..622a4867f9e9 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -587,13 +587,12 @@ static void pcibios_fixup_resources(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_resources); -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { dev->irq = of_irq_parse_and_map_pci(dev, 0, 0); return 0; } -EXPORT_SYMBOL(pcibios_add_device); /* * Reparent resource children of pr that conflict with res diff --git a/arch/mips/loongson64/init.c b/arch/mips/loongson64/init.c index f03fc52ed1d6..ee8de1735b7c 100644 --- a/arch/mips/loongson64/init.c +++ b/arch/mips/loongson64/init.c @@ -77,7 +77,9 @@ void __init szmem(unsigned int node) (u32)node_id, mem_type, mem_start, mem_size); pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", start_pfn, end_pfn, num_physpages); - memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(node_psize), node); + memblock_add_node(PFN_PHYS(start_pfn), + PFN_PHYS(node_psize), node, + MEMBLOCK_NONE); break; case SYSTEM_RAM_RESERVED: pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n", diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 19347dc6bbf8..325e1552cbea 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(ptr, size); } void __init setup_per_cpu_areas(void) diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig index c800bf5559b5..120adad51d6a 100644 --- a/arch/mips/ralink/Kconfig +++ b/arch/mips/ralink/Kconfig @@ -51,7 +51,8 @@ choice select SYS_SUPPORTS_HIGHMEM select MIPS_GIC select CLKSRC_MIPS_GIC - select HAVE_PCI if PCI_MT7621 + select HAVE_PCI + select PCI_DRIVERS_GENERIC select SOC_BUS endchoice diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 6173684b5aaa..adc2faeecf7c 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -341,7 +341,8 @@ static void __init szmem(void) continue; } memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)), - PFN_PHYS(slot_psize), node); + PFN_PHYS(slot_psize), node, + MEMBLOCK_NONE); } } } diff --git a/arch/mips/sgi-ip30/ip30-setup.c b/arch/mips/sgi-ip30/ip30-setup.c index 44b1607e964d..75a34684e704 100644 --- a/arch/mips/sgi-ip30/ip30-setup.c +++ b/arch/mips/sgi-ip30/ip30-setup.c @@ -69,10 +69,10 @@ static void __init ip30_mem_init(void) total_mem += 
size; if (addr >= IP30_REAL_MEMORY_START) - memblock_free(addr, size); + memblock_phys_free(addr, size); else if ((addr + size) > IP30_REAL_MEMORY_START) - memblock_free(IP30_REAL_MEMORY_START, - size - IP30_MAX_PROM_MEMORY); + memblock_phys_free(IP30_REAL_MEMORY_START, + size - IP30_MAX_PROM_MEMORY); } pr_info("Detected %luMB of physical memory.\n", MEM_SHIFT(total_mem)); } diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 396d508730a1..f491875700e8 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -274,7 +274,6 @@ CONFIG_NLS_UTF8=y CONFIG_ENCRYPTED_KEYS=y CONFIG_SECURITY=y CONFIG_HARDENED_USERCOPY=y -# CONFIG_HARDENED_USERCOPY_FALLBACK is not set CONFIG_HARDENED_USERCOPY_PAGESPAN=y CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_LOCKDOWN_LSM=y diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index a68311077d32..9c3c9f04129f 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -31,7 +31,7 @@ struct machdep_calls { #ifdef CONFIG_PM void (*iommu_restore)(void); #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG unsigned long (*memory_block_size)(void); #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 2b9edbf6e929..f6cf0159024e 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -55,11 +55,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); void eeh_sysfs_add_device(struct pci_dev *pdev); void eeh_sysfs_remove_device(struct pci_dev *pdev); -static inline const char *eeh_driver_name(struct pci_dev *pdev) -{ - return (pdev && pdev->driver) ? pdev->driver->name : ""; -} - #endif /* CONFIG_EEH */ #define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 6e4af4492a14..79cb7a25a5fb 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -6,21 +6,8 @@ #include #include -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include -extern bool init_mem_is_free; - -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (!init_mem_is_free) - return 0; - - return addr >= (unsigned long)__init_begin && - addr < (unsigned long)__init_end; -} - extern char __head_end[]; #ifdef __powerpc64__ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 358aee7c2d79..ba527fb52993 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char cpufeatures_setup_finished(); - memblock_free(__pa(dt_cpu_features), - sizeof(struct dt_cpu_feature)*nr_dt_cpu_features); + memblock_free(dt_cpu_features, + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); return 0; } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 91e0f4cf1db3..28bb1e7263a6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -399,6 +399,14 @@ out: return ret; } +static inline const char *eeh_driver_name(struct pci_dev *pdev) +{ + if (pdev) + return dev_driver_string(&pdev->dev); + + return ""; +} + /** * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze * @edev: eeh device diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 
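The rewritten ``eeh_driver_name()`` above leans on ``dev_driver_string()``, which tolerates an unbound device: it falls back to the bus or class name instead of dereferencing a driver pointer. A hypothetical logging helper built on the same idea::

        #include <linux/device.h>
        #include <linux/pci.h>

        static void log_pci_event(struct pci_dev *pdev, const char *what)
        {
                if (!pdev)
                        return;
                /* never touches pdev->driver, safe even while unbound */
                pci_info(pdev, "%s (driver: %s)\n", what,
                         dev_driver_string(&pdev->dev));
        }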
3eff6a4888e7..350dab18e137 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -104,13 +104,13 @@ static bool eeh_edev_actionable(struct eeh_dev *edev) */ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return NULL; - if (!try_module_get(pdev->driver->driver.owner)) + if (!try_module_get(pdev->dev.driver->owner)) return NULL; - return pdev->driver; + return to_pci_driver(pdev->dev.driver); } /** @@ -122,10 +122,10 @@ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) */ static inline void eeh_pcid_put(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return; - module_put(pdev->driver->driver.owner); + module_put(pdev->dev.driver->owner); } /** diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 9bd30cac852b..4208b4044d12 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -322,8 +322,8 @@ void __init free_unused_pacas(void) new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; if (new_ptrs_size < paca_ptrs_size) - memblock_free(__pa(paca_ptrs) + new_ptrs_size, - paca_ptrs_size - new_ptrs_size); + memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size, + paca_ptrs_size - new_ptrs_size); paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = new_ptrs_size; @@ -331,8 +331,8 @@ void __init free_unused_pacas(void) #ifdef CONFIG_PPC_BOOK3S_64 if (early_radix_enabled()) { /* Ugly fixup, see new_slb_shadow() */ - memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), - sizeof(struct slb_shadow)); + memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), + sizeof(struct slb_shadow)); paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL; } #endif diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index c3573430919d..6749905932f4 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1059,7 +1059,7 @@ void pcibios_bus_add_device(struct pci_dev *dev) ppc_md.pcibios_bus_add_device(dev); } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct irq_domain *d; diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 0b7894eed58d..4f1322b65760 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -822,7 +822,7 @@ static void __init smp_setup_pacas(void) set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]); } - memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32)); + memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32)); cpu_to_phys_id = NULL; } #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index eaa79a0996d1..6052f5d5ded3 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free(ptr, size); } static int pcpu_cpu_distance(unsigned int from, unsigned int to) @@ -912,7 +912,7 @@ void __init setup_per_cpu_areas(void) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG unsigned long memory_block_size_bytes(void) { if (ppc_md.memory_block_size) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9a75ba078e1b..82d8b368ca6d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ 
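``eeh_pcid_get()`` above shows the replacement pattern for the disappearing ``pdev->driver`` pointer: go through ``pdev->dev.driver``, pin the owning module, and convert with ``to_pci_driver()``. A hypothetical caller invoking an error handler under that pin::

        #include <linux/module.h>
        #include <linux/pci.h>

        static void demo_report_frozen(struct pci_dev *pdev)
        {
                struct pci_driver *drv;

                if (!pdev || !pdev->dev.driver)
                        return;
                if (!try_module_get(pdev->dev.driver->owner))
                        return;                 /* driver is unloading */

                drv = to_pci_driver(pdev->dev.driver);
                if (drv->err_handler && drv->err_handler->error_detected)
                        drv->err_handler->error_detected(pdev,
                                                         pci_channel_io_frozen);

                module_put(pdev->dev.driver->owner);
        }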
b/arch/powerpc/mm/hugetlbpage.c @@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) m->hstate = hstate; return 1; } + +bool __init hugetlb_node_alloc_supported(void) +{ + return false; +} #endif -int __init alloc_bootmem_huge_page(struct hstate *h) +int __init alloc_bootmem_huge_page(struct hstate *h, int nid) { #ifdef CONFIG_PPC_BOOK3S_64 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) return pseries_alloc_bootmem_huge_page(h); #endif - return __alloc_bootmem_huge_page(h); + return __alloc_bootmem_huge_page(h, nid); } #ifndef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3dd35c327d1c..004cd6a96c8a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2981,7 +2981,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (!phb->hose) { pr_err(" Can't allocate PCI controller for %pOF\n", np); - memblock_free(__pa(phb), sizeof(struct pnv_phb)); + memblock_free(phb, sizeof(struct pnv_phb)); return; } diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c index deddbb233fde..04155aaaadb1 100644 --- a/arch/powerpc/platforms/powernv/pci-sriov.c +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -51,7 +51,7 @@ * to "new_size", calculated above. Implementing this is a convoluted process * which requires several hooks in the PCI core: * - * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov(). + * 1. In pcibios_device_add() we call pnv_pci_ioda_fixup_iov(). * * At this point the device has been probed and the device's BARs are sized, * but no resource allocations have been done. The SR-IOV BARs are sized diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index a8db3f153063..ad56a54ac9c5 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -440,7 +440,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) } #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static unsigned long pnv_memory_block_size(void) { /* @@ -553,7 +553,7 @@ define_machine(powernv) { #ifdef CONFIG_KEXEC_CORE .kexec_cpu_down = pnv_kexec_cpu_down, #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG .memory_block_size = pnv_memory_block_size, #endif }; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2188054470c1..8a62af5b9c24 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -1088,7 +1088,7 @@ define_machine(pseries) { .machine_kexec = pSeries_machine_kexec, .kexec_cpu_down = pseries_kexec_cpu_down, #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG .memory_block_size = pseries_memory_block_size, #endif }; diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index c083ecbbae4d..c5228f4969eb 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -57,8 +57,7 @@ void __init svm_swiotlb_init(void) return; - memblock_free_early(__pa(vstart), - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index b9620e5f00ba..b42bfdc67482 100644 --- 
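The new ``nid`` parameter supports placing boot-time gigantic pages on a chosen node (the wider series extends the ``hugepages=`` parameter with per-node counts, e.g. ``hugepages=0:1,1:2``); powerpc opts out by returning false from ``hugetlb_node_alloc_supported()`` because its LPAR path cannot honour a node. A hypothetical generic loop, not the in-tree allocator::

        #include <linux/hugetlb.h>
        #include <linux/init.h>
        #include <linux/numa.h>

        static void __init demo_alloc_boot_gigantic(struct hstate *h, int nr_nodes)
        {
                int nid;

                for (nid = 0; nid < nr_nodes; nid++) {
                        if (hugetlb_node_alloc_supported())
                                alloc_bootmem_huge_page(h, nid);          /* this node */
                        else
                                alloc_bootmem_huge_page(h, NUMA_NO_NODE); /* any node */
                }
        }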
a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -230,13 +230,13 @@ static void __init init_resources(void) /* Clean-up any unused pre-allocated resources */ if (res_idx >= 0) - memblock_free(__pa(mem_res), (res_idx + 1) * sizeof(*mem_res)); + memblock_free(mem_res, (res_idx + 1) * sizeof(*mem_res)); return; error: /* Better an empty resource tree than an inconsistent one */ release_child_resources(&iomem_resource); - memblock_free(__pa(mem_res), mem_res_sz); + memblock_free(mem_res, mem_res_sz); } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b86de61b8caa..8857ec3b97eb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -153,12 +153,15 @@ config S390 select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_ARGS + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER @@ -190,6 +193,7 @@ config S390 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING @@ -434,6 +438,14 @@ endchoice config 64BIT def_bool y +config COMMAND_LINE_SIZE + int "Maximum size of kernel command line" + default 4096 + range 896 1048576 + help + This allows you to specify the maximum length of the kernel command + line. + config COMPAT def_bool y prompt "Kernel support for 31 bit emulation" @@ -938,6 +950,8 @@ menu "Selftests" config S390_UNWIND_SELFTEST def_tristate n + depends on KUNIT + default KUNIT_ALL_TESTS prompt "Test unwind functions" help This option enables s390 specific stack unwinder testing kernel @@ -946,4 +960,16 @@ config S390_UNWIND_SELFTEST Say N if you are unsure. +config S390_KPROBES_SANITY_TEST + def_tristate n + prompt "Enable s390 specific kprobes tests" + depends on KPROBES + depends on KUNIT + help + This option enables an s390 specific kprobes test module. This option + is not useful for distributions or general kernels, but only for kernel + developers working on architecture code. + + Say N if you are unsure. + endmenu diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h index a59f75c5b049..f75cc31a77dd 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/compressed/decompressor.h @@ -24,6 +24,7 @@ struct vmlinux_info { unsigned long dynsym_start; unsigned long rela_dyn_start; unsigned long rela_dyn_end; + unsigned long amode31_size; }; /* Symbols defined by linker scripts */ diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index 40f4cff538b8..3a252d140c55 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -184,35 +184,23 @@ iplstart: bas %r14,.Lloader # load parameter file ltr %r2,%r2 # got anything ? bz .Lnopf - chi %r2,895 - bnh .Lnotrunc - la %r2,895 + l %r3,MAX_COMMAND_LINE_SIZE+ARCH_OFFSET-PARMAREA(%r12) + ahi %r3,-1 + clr %r2,%r3 + bl .Lnotrunc + lr %r2,%r3 .Lnotrunc: l %r4,.Linitrd clc 0(3,%r4),.L_hdr # if it is HDRx bz .Lagain1 # skip dataset header clc 0(3,%r4),.L_eof # if it is EOFx bz .Lagain1 # skip dateset trailer - la %r5,0(%r4,%r2) - lr %r3,%r2 - la %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. 
of command line - mvc 0(256,%r3),0(%r4) - mvc 256(256,%r3),256(%r4) - mvc 512(256,%r3),512(%r4) - mvc 768(122,%r3),768(%r4) - slr %r0,%r0 - b .Lcntlp -.Ldelspc: - ic %r0,0(%r2,%r3) - chi %r0,0x20 # is it a space ? - be .Lcntlp - ahi %r2,1 - b .Leolp -.Lcntlp: - brct %r2,.Ldelspc -.Leolp: - slr %r0,%r0 - stc %r0,0(%r2,%r3) # terminate buffer + + lr %r5,%r2 + la %r6,COMMAND_LINE-PARMAREA(%r12) + lr %r7,%r2 + ahi %r7,1 + mvcl %r6,%r4 .Lnopf: # @@ -317,6 +305,7 @@ SYM_CODE_START_LOCAL(startup_normal) xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 + lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) @@ -335,6 +324,22 @@ SYM_CODE_END(startup_normal) .quad 0x0000000180000000,startup_pgm_check_handler .Lio_new_psw: .quad 0x0002000180000000,0x1f0 # disabled wait +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad 0 # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad 0 # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0x0000000000008000 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad 0 # cr15: linkage stack operations #include "head_kdump.S" @@ -377,11 +382,10 @@ SYM_DATA_START(parmarea) .quad 0 # OLDMEM_BASE .quad 0 # OLDMEM_SIZE .quad kernel_version # points to kernel version string + .quad COMMAND_LINE_SIZE .org COMMAND_LINE .byte "root=/dev/ram0 ro" .byte 0 .org PARMAREA+__PARMAREA_SIZE SYM_DATA_END(parmarea) - - .org HEAD_END diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 0f84c072625e..9ed7e29c81d9 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -170,10 +170,10 @@ static inline int has_ebcdic_char(const char *str) void setup_boot_command_line(void) { - parmarea.command_line[ARCH_COMMAND_LINE_SIZE - 1] = 0; + parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0; /* convert arch command line to ascii if necessary */ if (has_ebcdic_char(parmarea.command_line)) - EBCASC(parmarea.command_line, ARCH_COMMAND_LINE_SIZE); + EBCASC(parmarea.command_line, COMMAND_LINE_SIZE); /* copy arch command line */ strcpy(early_command_line, strim(parmarea.command_line)); diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 75bcbfa27941..c2a1defc79da 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -175,6 +175,6 @@ void print_pgm_check_info(void) gpregs[12], gpregs[13], gpregs[14], gpregs[15]); print_stacktrace(); decompressor_printk("Last Breaking-Event-Address:\n"); - decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.breaking_event_addr, - (void *)S390_lowcore.breaking_event_addr); + decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break, + (void *)S390_lowcore.pgm_last_break); } diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 6dc8d0a53864..7571dee72a0c 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -15,6 +15,7 @@ #include "uv.h" unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata(__amode31_base); unsigned long __bootdata_preserved(VMALLOC_START); 
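In C terms, the head.S rework above replaces the fixed 895-byte copy-and-trim loop with a copy bounded by the ``max_command_line_size`` field that the parm area now carries, roughly::

        #include <string.h>

        /*
         * C rendering of the new head.S sequence: clamp to max - 1 (the
         * ahi/clr pair), copy, and let the one extra mvcl pad byte act
         * as the NUL terminator.
         */
        static void copy_command_line(char *dst, const char *src,
                                      unsigned long len, unsigned long max)
        {
                if (len > max - 1)
                        len = max - 1;
                memcpy(dst, src, len);
                dst[len] = '\0';
        }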
unsigned long __bootdata_preserved(VMALLOC_END); struct page *__bootdata_preserved(vmemmap); @@ -259,6 +260,12 @@ static void offset_vmlinux_info(unsigned long offset) vmlinux.dynsym_start += offset; } +static unsigned long reserve_amode31(unsigned long safe_addr) +{ + __amode31_base = PAGE_ALIGN(safe_addr); + return safe_addr + vmlinux.amode31_size; +} + void startup_kernel(void) { unsigned long random_lma; @@ -273,6 +280,7 @@ void startup_kernel(void) setup_lpp(); store_ipl_parmblock(); safe_addr = mem_safe_offset(); + safe_addr = reserve_amode31(safe_addr); safe_addr = read_ipl_report(safe_addr); uv_query_info(); rescue_initrd(safe_addr); diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6aad18ee131d..fd825097cf04 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -61,7 +61,8 @@ CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m -CONFIG_S390_UNWIND_SELFTEST=y +CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y @@ -776,7 +777,6 @@ CONFIG_CRC8=m CONFIG_RANDOM32_SELFTEST=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 -CONFIG_DMA_API_DEBUG=y CONFIG_PRINTK_TIME=y CONFIG_DYNAMIC_DEBUG=y CONFIG_DEBUG_INFO=y @@ -839,8 +839,13 @@ CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y CONFIG_FTRACE_STARTUP_TEST=y # CONFIG_EVENT_TRACE_STARTUP_TEST is not set +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m CONFIG_DEBUG_ENTRY=y CONFIG_CIO_INJECT=y +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index f08b161c9446..c9c3cedff2d8 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -60,6 +60,7 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_S390_UNWIND_SELFTEST=m +CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y # CONFIG_GCC_PLUGINS is not set @@ -788,6 +789,11 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_TRACE_PRINTK=m +CONFIG_SAMPLE_FTRACE_DIRECT=m +CONFIG_KUNIT=m +CONFIG_KUNIT_DEBUGFS=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index f9eddbca79d2..2c057e1f3200 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -16,20 +16,24 @@ #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES /* Fast-BCR without checkpoint synchronization */ -#define __ASM_BARRIER "bcr 14,0\n" +#define __ASM_BCR_SERIALIZE "bcr 14,0\n" #else -#define __ASM_BARRIER "bcr 15,0\n" +#define __ASM_BCR_SERIALIZE "bcr 15,0\n" #endif -#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0) +static __always_inline void bcr_serialize(void) +{ + asm volatile(__ASM_BCR_SERIALIZE : : : "memory"); +} -#define rmb() barrier() -#define wmb() barrier() -#define dma_rmb() mb() -#define dma_wmb() mb() -#define __smp_mb() mb() -#define __smp_rmb() rmb() -#define __smp_wmb() wmb() +#define mb() bcr_serialize() +#define rmb() barrier() +#define wmb() barrier() +#define dma_rmb() mb() +#define dma_wmb() mb() +#define __smp_mb() mb() +#define __smp_rmb() rmb() +#define __smp_wmb() wmb() #define __smp_store_release(p, v) \ do { \ diff --git a/arch/s390/include/asm/bitops.h 
b/arch/s390/include/asm/bitops.h index fd149480b6e2..5a530c552c23 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -188,7 +188,7 @@ static inline bool arch_test_and_set_bit_lock(unsigned long nr, volatile unsigned long *ptr) { if (arch_test_bit(nr, ptr)) - return 1; + return true; return arch_test_and_set_bit(nr, ptr); } diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index 62228a884e06..26c710cd3485 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLY__ #include +#include struct cpuid { @@ -21,5 +22,7 @@ struct cpuid unsigned int unused : 16; } __attribute__ ((packed, aligned(8))); +DECLARE_STATIC_KEY_FALSE(cpu_has_bear); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_CPU_H */ diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 19a55e1e3a0c..77f24262c25c 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -462,7 +462,7 @@ arch_initcall(VNAME(var, reg)) * * @var: Name of debug_info_t variable * @name: Name of debug log (e.g. used for debugfs entry) - * @pages_per_area: Number of pages per area + * @pages: Number of pages per area * @nr_areas: Number of debug areas * @buf_size: Size of data area in each debug entry * @view: Pointer to debug view struct diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index e8b460f39c58..267f70f4393f 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -17,7 +17,6 @@ void ftrace_caller(void); -extern char ftrace_graph_caller_end; extern void *ftrace_func; struct dyn_arch_ftrace { }; @@ -42,6 +41,35 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &fregs->regs; +} + +static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs, + unsigned long ip) +{ + struct pt_regs *regs = arch_ftrace_get_regs(fregs); + + regs->psw.addr = ip; +} + +/* + * When an ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, + * place the direct caller in the ORIG_GPR2 part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + regs->orig_gpr2 = addr; +} + /* * Even though the system call numbers are identical for s390/s390x a * different system call table is used for compat tasks. 
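``cpu_has_bear`` above is a static key: a consumer of a ``DECLARE_STATIC_KEY_FALSE`` key compiles to a nop until boot code flips it with ``static_branch_enable()`` on machines that have the BEAR-enhancement facility. A hypothetical consumer, not the in-tree user::

        #include <linux/jump_label.h>
        #include <asm/cpu.h>
        #include <asm/ptrace.h>

        static unsigned long demo_last_break(struct pt_regs *regs)
        {
                /* patched to a straight-line branch once the key is enabled */
                if (static_branch_likely(&cpu_has_bear))
                        return regs->last_break;        /* saved via STBEAR/LBEAR */
                return 0;
        }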
This may lead @@ -68,4 +96,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym, } #endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */ + +#ifndef CC_USING_HOTPATCH + +#define FTRACE_GEN_MCOUNT_RECORD(name) \ + .section __mcount_loc, "a", @progbits; \ + .quad name; \ + .previous; + +#else /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_MCOUNT_RECORD(name) + +#endif /* !CC_USING_HOTPATCH */ + +#define FTRACE_GEN_NOP_ASM(name) \ + FTRACE_GEN_MCOUNT_RECORD(name) \ + FTRACE_NOP_INSN + +#else /* CONFIG_FUNCTION_TRACER */ + +#define FTRACE_GEN_NOP_ASM(name) + +#endif /* CONFIG_FUNCTION_TRACER */ + #endif /* _ASM_S390_FTRACE_H */ diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index dcb1bba4f406..916cfcb36d8a 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -2,6 +2,8 @@ #ifndef _ASM_S390_JUMP_LABEL_H #define _ASM_S390_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #ifndef __ASSEMBLY__ #include diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index d578a8c76676..5209f223331a 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -16,9 +16,7 @@ static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - struct pt_regs *regs = ftrace_get_regs(fregs); - - regs->psw.addr = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 11213c8bfca5..1262f5003acf 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -65,7 +65,7 @@ struct lowcore { __u32 external_damage_code; /* 0x00f4 */ __u64 failing_storage_address; /* 0x00f8 */ __u8 pad_0x0100[0x0110-0x0100]; /* 0x0100 */ - __u64 breaking_event_addr; /* 0x0110 */ + __u64 pgm_last_break; /* 0x0110 */ __u8 pad_0x0118[0x0120-0x0118]; /* 0x0118 */ psw_t restart_old_psw; /* 0x0120 */ psw_t external_old_psw; /* 0x0130 */ @@ -93,9 +93,10 @@ struct lowcore { psw_t return_psw; /* 0x0290 */ psw_t return_mcck_psw; /* 0x02a0 */ + __u64 last_break; /* 0x02b0 */ + /* CPU accounting and timing values. 
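With ``HAVE_DYNAMIC_FTRACE_WITH_ARGS`` in place, tracing callbacks receive a ``struct ftrace_regs`` and can retarget execution the way the livepatch hunk above does through ``ftrace_instruction_pointer_set()``. A hypothetical redirecting callback, registered via an ftrace_ops carrying ``FTRACE_OPS_FL_IPMODIFY`` (filter setup and registration not shown)::

        #include <linux/ftrace.h>

        static void my_replacement(void)        /* hypothetical new function */
        {
        }

        static void redirect_cb(unsigned long ip, unsigned long parent_ip,
                                struct ftrace_ops *op, struct ftrace_regs *fregs)
        {
                /*
                 * Rewrite the return PSW address so the traced function
                 * resumes in my_replacement(), the core livepatch trick.
                 */
                ftrace_instruction_pointer_set(fregs,
                                               (unsigned long)my_replacement);
        }

        static struct ftrace_ops redirect_ops = {
                .func   = redirect_cb,
                .flags  = FTRACE_OPS_FL_IPMODIFY,
        };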
*/ - __u64 sys_enter_timer; /* 0x02b0 */ - __u8 pad_0x02b8[0x02c0-0x02b8]; /* 0x02b8 */ + __u64 sys_enter_timer; /* 0x02b8 */ __u64 mcck_enter_timer; /* 0x02c0 */ __u64 exit_timer; /* 0x02c8 */ __u64 user_timer; /* 0x02d0 */ @@ -188,7 +189,7 @@ struct lowcore { __u32 tod_progreg_save_area; /* 0x1324 */ __u32 cpu_timer_save_area[2]; /* 0x1328 */ __u32 clock_comp_save_area[2]; /* 0x1330 */ - __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */ + __u64 last_break_save_area; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ __u64 cregs_save_area[16]; /* 0x1380 */ __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index b4bd8c41e9d3..82725cf783c7 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -12,6 +12,11 @@ void nospec_init_branches(void); void nospec_auto_detect(void); void nospec_revert(s32 *start, s32 *end); +static inline bool nospec_uses_trampoline(void) +{ + return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index e43416950245..008a6c856fa4 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -583,11 +583,11 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new #define CRDTE_DTT_REGION1 0x1cUL static inline void crdte(unsigned long old, unsigned long new, - unsigned long table, unsigned long dtt, + unsigned long *table, unsigned long dtt, unsigned long address, unsigned long asce) { union register_pair r1 = { .even = old, .odd = new, }; - union register_pair r2 = { .even = table | dtt, .odd = address, }; + union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, }; asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0" : [r1] "+&d" (r1.pair) @@ -1001,7 +1001,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, unsigned long opt, unsigned long asce, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); if (__builtin_constant_p(opt) && opt == 0) { /* Invalidation + TLB flush for the pte */ @@ -1023,7 +1023,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, static __always_inline void __ptep_ipte_range(unsigned long address, int nr, pte_t *ptep, int local) { - unsigned long pto = (unsigned long) ptep; + unsigned long pto = __pa(ptep); /* Invalidate a range of ptes + TLB flush of the ptes */ do { @@ -1487,7 +1487,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t); if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ asm volatile( @@ -1513,7 +1513,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp, { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); + r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; if (__builtin_constant_p(opt) && opt == 0) { /* flush without guest asce */ diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 61b22aa990e7..4ffa8e7f0ed3 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -76,8 +76,7 @@ enum { * The pt_regs struct defines the way the registers are stored on * the stack 
during a system call. */ -struct pt_regs -{ +struct pt_regs { union { user_pt_regs user_regs; struct { @@ -97,6 +96,7 @@ struct pt_regs }; unsigned long flags; unsigned long cr1; + unsigned long last_break; }; /* @@ -197,6 +197,25 @@ const char *regs_query_register_name(unsigned int offset); unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_kernel_argument() returns @n th argument of the function call. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ + unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long); + +#define NR_REG_ARGUMENTS 5 + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, 2 + n); + n -= NR_REG_ARGUMENTS; + return regs_get_kernel_stack_nth(regs, argoffset + n); +} + static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->gprs[15]; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index e3ae937bef1c..c68ea35de498 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -117,6 +117,7 @@ struct zpci_report_error_header { extern char *sclp_early_sccb; +void sclp_early_adjust_va(void); void sclp_early_set_buffer(void *sccb); int sclp_early_read_info(void); int sclp_early_read_storage_info(void); diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 85881dd48022..3fecaa4e8b74 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,20 +2,8 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include -extern bool initmem_freed; - -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (!initmem_freed) - return 0; - return addr >= (unsigned long)__init_begin && - addr < (unsigned long)__init_end; -} - /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. 
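``regs_get_kernel_argument()`` is what backs ``HAVE_FUNCTION_ARG_ACCESS_API`` on s390: arguments 0-4 come from %r2-%r6, the rest from the stack frame. A hypothetical kprobe pre-handler using it::

        #include <linux/kprobes.h>

        static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
        {
                unsigned long a0 = regs_get_kernel_argument(regs, 0); /* %r2 */
                unsigned long a5 = regs_get_kernel_argument(regs, 5); /* stack */

                pr_info("%s: arg0=%lx arg5=%lx\n", p->symbol_name, a0, a5);
                return 0;
        }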
The decompressor will store values in them, and diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index b6606ffd85d8..77e6506898f5 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -11,8 +11,8 @@ #include #define PARMAREA 0x10400 -#define HEAD_END 0x11000 +#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE /* * Machine features detected in early.c */ @@ -43,6 +43,8 @@ #define STARTUP_NORMAL_OFFSET 0x10000 #define STARTUP_KDUMP_OFFSET 0x10010 +#define LEGACY_COMMAND_LINE_SIZE 896 + #ifndef __ASSEMBLY__ #include @@ -55,8 +57,9 @@ struct parmarea { unsigned long oldmem_base; /* 0x10418 */ unsigned long oldmem_size; /* 0x10420 */ unsigned long kernel_version; /* 0x10428 */ - char pad1[0x10480 - 0x10430]; /* 0x10430 - 0x10480 */ - char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ + unsigned long max_command_line_size; /* 0x10430 */ + char pad1[0x10480-0x10438]; /* 0x10438 - 0x10480 */ + char command_line[COMMAND_LINE_SIZE]; /* 0x10480 */ }; extern struct parmarea parmarea; diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 4fd66c5e8934..3fae93ddb322 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -31,22 +31,18 @@ void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_STRCMP /* arch function */ #define __HAVE_ARCH_STRCPY /* inline & arch function */ #define __HAVE_ARCH_STRLCAT /* arch function */ -#define __HAVE_ARCH_STRLCPY /* arch function */ #define __HAVE_ARCH_STRLEN /* inline & arch function */ #define __HAVE_ARCH_STRNCAT /* arch function */ #define __HAVE_ARCH_STRNCPY /* arch function */ #define __HAVE_ARCH_STRNLEN /* inline & arch function */ -#define __HAVE_ARCH_STRRCHR /* arch function */ #define __HAVE_ARCH_STRSTR /* arch function */ /* Prototypes for non-inlined arch strings functions. */ int memcmp(const void *s1, const void *s2, size_t n); int strcmp(const char *s1, const char *s2); size_t strlcat(char *dest, const char *src, size_t n); -size_t strlcpy(char *dest, const char *src, size_t size); char *strncat(char *dest, const char *src, size_t n); char *strncpy(char *dest, const char *src, size_t n); -char *strrchr(const char *s, int c); char *strstr(const char *s1, const char *s2); #endif /* !CONFIG_KASAN */ diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h new file mode 100644 index 000000000000..b219056a8817 --- /dev/null +++ b/arch/s390/include/asm/text-patching.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_S390_TEXT_PATCHING_H +#define _ASM_S390_TEXT_PATCHING_H + +#include + +static __always_inline void sync_core(void) +{ + bcr_serialize(); +} + +void text_poke_sync(void); +void text_poke_sync_lock(void); + +#endif /* _ASM_S390_TEXT_PATCHING_H */ diff --git a/arch/s390/include/uapi/asm/setup.h b/arch/s390/include/uapi/asm/setup.h index 1f8803a31079..598d769e76df 100644 --- a/arch/s390/include/uapi/asm/setup.h +++ b/arch/s390/include/uapi/asm/setup.h @@ -1,14 +1 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * Copyright IBM Corp. 
1999, 2010 - */ - -#ifndef _UAPI_ASM_S390_SETUP_H -#define _UAPI_ASM_S390_SETUP_H - -#define COMMAND_LINE_SIZE 4096 - -#define ARCH_COMMAND_LINE_SIZE 896 - -#endif /* _UAPI_ASM_S390_SETUP_H */ diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index c22ea1c3ef84..cce0ddee2d02 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include +#include #include #include #include @@ -110,3 +113,20 @@ void __init apply_alternative_instructions(void) { apply_alternatives(__alt_instructions, __alt_instructions_end); } + +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b57da9338588..8e00bb228662 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -35,6 +35,7 @@ int main(void) OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); OFFSET(__PT_FLAGS, pt_regs, flags); OFFSET(__PT_CR1, pt_regs, cr1); + OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); /* stack_frame offsets */ @@ -45,6 +46,7 @@ int main(void) OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[2]); OFFSET(__SF_SIE_REASON, stack_frame, empty1[3]); OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[4]); + DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); /* idle data offsets */ OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); @@ -77,7 +79,7 @@ int main(void) OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); - OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); + OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break); OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); @@ -126,6 +128,7 @@ int main(void) OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); + OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); /* hardware defined lowcore locations 0x1000 - 0x18ff */ @@ -139,6 +142,7 @@ int main(void) OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area); OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); @@ -160,5 +164,6 @@ int main(void) DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base)); DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); + DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); return 0; } diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 54efc279f54e..72e106cfd8c7 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -29,7 +29,7 @@ 
static int diag8_noresponse(int cmdlen) asm volatile( " diag %[rx],%[ry],0x8\n" : [ry] "+&d" (cmdlen) - : [rx] "d" ((addr_t) cpcmd_buf) + : [rx] "d" (__pa(cpcmd_buf)) : "cc"); return cmdlen; } @@ -39,8 +39,8 @@ static int diag8_response(int cmdlen, char *response, int *rlen) union register_pair rx, ry; int cc; - rx.even = (addr_t) cpcmd_buf; - rx.odd = (addr_t) response; + rx.even = __pa(cpcmd_buf); + rx.odd = __pa(response); ry.even = cmdlen | 0x40000000L; ry.odd = *rlen; asm volatile( diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index db1bc00229ca..85f326e258df 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -152,7 +152,7 @@ void show_stack(struct task_struct *task, unsigned long *stack, static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); - printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]); + printk(" [<%016lx>] %pSR\n", regs->last_break, (void *)regs->last_break); } void show_registers(struct pt_regs *regs) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 9857cb046726..3cdf68c53614 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -280,7 +280,7 @@ char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE); + strlcpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } static void __init check_image_bootable(void) @@ -296,6 +296,7 @@ static void __init check_image_bootable(void) void __init startup_init(void) { + sclp_early_adjust_va(); reset_tod_clock(); check_image_bootable(); time_early_init(); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 4c9b967290ae..01bae1d51113 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -52,6 +52,22 @@ STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE _LPP_OFFSET = __LC_LPP + .macro STBEAR address + ALTERNATIVE "", ".insn s,0xb2010000,\address", 193 + .endm + + .macro LBEAR address + ALTERNATIVE "", ".insn s,0xb2000000,\address", 193 + .endm + + .macro LPSWEY address,lpswe + ALTERNATIVE "b \lpswe", ".insn siy,0xeb0000000071,\address,0", 193 + .endm + + .macro MBEAR reg + ALTERNATIVE "", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + .endm + .macro CHECK_STACK savearea #ifdef CONFIG_CHECK_STACK tml %r15,STACK_SIZE - CONFIG_STACK_GUARD @@ -302,6 +318,7 @@ ENTRY(system_call) BPOFF lghi %r14,0 .Lsysc_per: + STBEAR __LC_LAST_BREAK lctlg %c1,%c1,__LC_KERNEL_ASCE lg %r12,__LC_CURRENT lg %r15,__LC_KERNEL_STACK @@ -321,14 +338,16 @@ ENTRY(system_call) xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC + MBEAR %r2 lgr %r3,%r14 brasl %r14,__do_syscall lctlg %c1,%c1,__LC_USER_ASCE mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - b __LC_RETURN_LPSWE + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE ENDPROC(system_call) # @@ -340,9 +359,10 @@ ENTRY(ret_from_fork) lctlg %c1,%c1,__LC_USER_ASCE mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - b __LC_RETURN_LPSWE + LPSWEY 
__LC_RETURN_PSW,__LC_RETURN_LPSWE ENDPROC(ret_from_fork) /* @@ -382,6 +402,7 @@ ENTRY(pgm_check_handler) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK stmg %r8,%r9,__PT_PSW(%r11) # clear user controlled registers to prevent speculative use @@ -401,8 +422,9 @@ ENTRY(pgm_check_handler) stpt __LC_EXIT_TIMER .Lpgm_exit_kernel: mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - b __LC_RETURN_LPSWE + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # # single stepped system call @@ -412,7 +434,8 @@ ENTRY(pgm_check_handler) larl %r14,.Lsysc_per stg %r14,__LC_RETURN_PSW+8 lghi %r14,1 - lpswe __LC_RETURN_PSW # branch to .Lsysc_per + LBEAR __LC_PGM_LAST_BREAK + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per ENDPROC(pgm_check_handler) /* @@ -422,6 +445,7 @@ ENDPROC(pgm_check_handler) ENTRY(\name) STCK __LC_INT_CLOCK stpt __LC_SYS_ENTER_TIMER + STBEAR __LC_LAST_BREAK BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT @@ -453,6 +477,7 @@ ENTRY(\name) xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + MBEAR %r11 stmg %r8,%r9,__PT_PSW(%r11) tm %r8,0x0001 # coming from user space? jno 1f @@ -465,8 +490,9 @@ ENTRY(\name) lctlg %c1,%c1,__LC_USER_ASCE BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -2: lmg %r0,%r15,__PT_R0(%r11) - b __LC_RETURN_LPSWE +2: LBEAR __PT_LAST_BREAK(%r11) + lmg %r0,%r15,__PT_R0(%r11) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE ENDPROC(\name) .endm @@ -505,6 +531,7 @@ ENTRY(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer + LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT lmg %r8,%r9,__LC_MCK_OLD_PSW @@ -591,8 +618,10 @@ ENTRY(mcck_int_handler) jno 0f BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -0: lmg %r11,%r15,__PT_R11(%r11) - b __LC_RETURN_MCCK_LPSWE +0: ALTERNATIVE "", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + LBEAR 0(%r12) + lmg %r11,%r15,__PT_R11(%r11) + LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE .Lmcck_panic: /* diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 7f2696e8d511..6083090be1f4 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -70,5 +70,6 @@ extern struct exception_table_entry _stop_amode31_ex_table[]; #define __amode31_data __section(".amode31.data") #define __amode31_ref __section(".amode31.refs") extern long _start_amode31_refs[], _end_amode31_refs[]; +extern unsigned long __amode31_base; #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5165bf344f95..5510c7d10ddc 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -80,17 +81,6 @@ asm( #ifdef CONFIG_MODULES static char *ftrace_plt; - -asm( - " .data\n" - "ftrace_plt_template:\n" - " basr %r1,%r0\n" - " lg %r1,0f-.(%r1)\n" - " br %r1\n" - "0: .quad ftrace_caller\n" - "ftrace_plt_template_end:\n" - " .previous\n" -); #endif /* CONFIG_MODULES */ static const char *ftrace_shared_hotpatch_trampoline(const char **end) @@ -116,7 +106,7 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) bool ftrace_need_init_nop(void) { - 
return ftrace_shared_hotpatch_trampoline(NULL); + return true; } int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) @@ -175,28 +165,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return 0; } -static void ftrace_generate_nop_insn(struct ftrace_insn *insn) -{ - /* brcl 0,0 */ - insn->opc = 0xc004; - insn->disp = 0; -} - -static void ftrace_generate_call_insn(struct ftrace_insn *insn, - unsigned long ip) -{ - unsigned long target; - - /* brasl r0,ftrace_caller */ - target = FTRACE_ADDR; -#ifdef CONFIG_MODULES - if (is_module_addr((void *)ip)) - target = (unsigned long)ftrace_plt; -#endif /* CONFIG_MODULES */ - insn->opc = 0xc005; - insn->disp = (target - ip) / 2; -} - static void brcl_disable(void *brcl) { u8 op = 0x04; /* set mask field to zero */ @@ -207,23 +175,7 @@ static void brcl_disable(void *brcl) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - struct ftrace_insn orig, new, old; - - if (ftrace_shared_hotpatch_trampoline(NULL)) { - brcl_disable((void *)rec->ip); - return 0; - } - - if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - /* Replace ftrace call with a nop. */ - ftrace_generate_call_insn(&orig, rec->ip); - ftrace_generate_nop_insn(&new); - - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) - return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + brcl_disable((void *)rec->ip); return 0; } @@ -236,23 +188,7 @@ static void brcl_enable(void *brcl) int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - struct ftrace_insn orig, new, old; - - if (ftrace_shared_hotpatch_trampoline(NULL)) { - brcl_enable((void *)rec->ip); - return 0; - } - - if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - /* Replace nop with an ftrace call. */ - ftrace_generate_nop_insn(&orig); - ftrace_generate_call_insn(&new, rec->ip); - - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) - return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + brcl_enable((void *)rec->ip); return 0; } @@ -264,22 +200,16 @@ int ftrace_update_ftrace_func(ftrace_func_t func) void arch_ftrace_update_code(int command) { - if (ftrace_shared_hotpatch_trampoline(NULL)) - ftrace_modify_all_code(command); - else - ftrace_run_stop_machine(command); -} - -static void __ftrace_sync(void *dummy) -{ + ftrace_modify_all_code(command); } int ftrace_arch_code_modify_post_process(void) { - if (ftrace_shared_hotpatch_trampoline(NULL)) { - /* Send SIGP to the other CPUs, so they see the new code. */ - smp_call_function(__ftrace_sync, NULL, 1); - } + /* + * Flush any pre-fetched instructions on all + * CPUs to make the new code visible. 
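+ * text_poke_sync_lock() executes a serializing bcr instruction on + * each online CPU while holding cpus_read_lock(), so no CPU can + * come online with stale prefetched code in between.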
+ */ + text_poke_sync_lock(); return 0; } @@ -294,10 +224,6 @@ static int __init ftrace_plt_init(void) panic("cannot allocate ftrace plt\n"); start = ftrace_shared_hotpatch_trampoline(&end); - if (!start) { - start = ftrace_plt_template; - end = ftrace_plt_template_end; - } memcpy(ftrace_plt, start, end - start); set_memory_ro((unsigned long)ftrace_plt, 1); return 0; @@ -337,12 +263,14 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); int ftrace_enable_ftrace_graph_caller(void) { brcl_disable(ftrace_graph_caller); + text_poke_sync_lock(); return 0; } int ftrace_disable_ftrace_graph_caller(void) { brcl_enable(ftrace_graph_caller); + text_poke_sync_lock(); return 0; } diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 114b5490ad8e..42f9a325a257 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -20,8 +20,6 @@ __HEAD ENTRY(startup_continue) larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK - larl %r13,.LPG1 # get base - lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers # # Setup stack # @@ -42,19 +40,3 @@ ENTRY(startup_continue) .align 16 .LPG1: .Ldw: .quad 0x0002000180000000,0x0000000000000000 -.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space - .quad 0 # cr1: primary space segment table - .quad 0 # cr2: dispatchable unit control table - .quad 0 # cr3: instruction authorization - .quad 0xffff # cr4: instruction authorization - .quad 0 # cr5: primary-aste origin - .quad 0 # cr6: I/O interrupts - .quad 0 # cr7: secondary space segment table - .quad 0x0000000000008000 # cr8: access registers translation - .quad 0 # cr9: tracing off - .quad 0 # cr10: tracing off - .quad 0 # cr11: tracing off - .quad 0 # cr12: tracing off - .quad 0 # cr13: home space segment table - .quad 0xc0000000 # cr14: machine check handling off - .quad 0 # cr15: linkage stack operations diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 3a3145c4a3ba..0df83ecaa2e0 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -140,8 +140,11 @@ void noinstr do_io_irq(struct pt_regs *regs) irq_enter(); - if (user_mode(regs)) + if (user_mode(regs)) { update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; if (from_idle) @@ -171,8 +174,11 @@ void noinstr do_ext_irq(struct pt_regs *regs) irq_enter(); - if (user_mode(regs)) + if (user_mode(regs)) { update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; + } regs->int_code = S390_lowcore.ext_int_code_addr; regs->int_parm = S390_lowcore.ext_params; diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 9156653b56f6..6bec000c6c1c 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -6,8 +6,9 @@ * Author(s): Jan Glauber */ #include -#include #include +#include +#include #include struct insn { @@ -48,9 +49,9 @@ static struct insn orignop = { .offset = JUMP_LABEL_NOP_OFFSET >> 1, }; -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { void *code = (void *)jump_entry_code(entry); struct insn old, new; @@ -72,19 +73,28 @@ static void __jump_label_transform(struct jump_entry *entry, s390_kernel_write(code, &new, sizeof(new)); } -static void __jump_label_sync(void *dummy) -{ -} - void 
arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, 0); - smp_call_function(__jump_label_sync, NULL, 1); + jump_label_transform(entry, type, 0); + text_poke_sync(); } -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { - __jump_label_transform(entry, type, 1); + jump_label_transform(entry, type, 0); + return true; +} + +void arch_jump_label_transform_apply(void) +{ + text_poke_sync(); +} + +void __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + jump_label_transform(entry, type, 1); + text_poke_sync(); } diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index c505c0ee5f47..e27a7d3b0364 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -122,9 +122,55 @@ static void s390_free_insn_slot(struct kprobe *p) } NOKPROBE_SYMBOL(s390_free_insn_slot); +/* Check if paddr is at an instruction boundary */ +static bool can_probe(unsigned long paddr) +{ + unsigned long addr, offset = 0; + kprobe_opcode_t insn; + struct kprobe *kp; + + if (paddr & 0x01) + return false; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return false; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn))) + return false; + + if (insn >> 8 == 0) { + if (insn != BREAKPOINT_INSTRUCTION) { + /* + * Note that QEMU inserts opcode 0x0000 to implement + * software breakpoints for guests. Since the size of + * the original instruction is unknown, stop following + * instructions and prevent setting a kprobe. + */ + return false; + } + /* + * Check if the instruction has been modified by another + * kprobe, in which case the original instruction is + * decoded. 
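+ * The saved opcode also gives the instruction length needed + * to continue the scan at the next instruction boundary.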
+ */ + kp = get_kprobe((void *)addr); + if (!kp) { + /* not a kprobe */ + return false; + } + insn = kp->opcode; + } + addr += insn_length(insn >> 8); + } + return addr == paddr; +} + int arch_prepare_kprobe(struct kprobe *p) { - if ((unsigned long) p->addr & 0x01) + if (!can_probe((unsigned long)p->addr)) return -EINVAL; /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index f9e4baa64b67..528edff085d9 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -216,7 +216,9 @@ void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)) { + unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE; struct s390_load_data data = {0}; + unsigned long minsize; int ret; data.report = ipl_report_init(&ipl_block); @@ -227,10 +229,23 @@ void *kexec_file_add_components(struct kimage *image, if (ret) goto out; - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { - ret = -EINVAL; + ret = -EINVAL; + minsize = PARMAREA + offsetof(struct parmarea, command_line); + if (image->kernel_buf_len < minsize) goto out; - } + + if (data.parm->max_command_line_size) + max_command_line_size = data.parm->max_command_line_size; + + if (minsize + max_command_line_size < minsize) + goto out; + + if (image->kernel_buf_len < minsize + max_command_line_size) + goto out; + + if (image->cmdline_buf_len >= max_command_line_size) + goto out; + memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); @@ -307,17 +322,3 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, } return 0; } - -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - /* A kernel must be at least large enough to contain head.S. During - * load memory in head.S will be accessed, e.g. to register the next - * command line. If the next kernel were smaller the current kernel - * will panic at load. 
- */ - if (buf_len < HEAD_END) - return -ENOEXEC; - - return kexec_image_probe_default(image, buf, buf_len); -} diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 6b13797143a7..39bcc0e39a10 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -22,10 +22,11 @@ ENTRY(ftrace_stub) BR_EX %r14 ENDPROC(ftrace_stub) -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) +#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) #ifdef __PACK_STACK /* allocate just enough for r14, r15 and backchain */ #define TRACED_FUNC_FRAME_SIZE 24 @@ -33,13 +34,15 @@ ENDPROC(ftrace_stub) #define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD #endif -ENTRY(ftrace_caller) - .globl ftrace_regs_caller - .set ftrace_regs_caller,ftrace_caller + .macro ftrace_regs_entry, allregs=0 stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller + + .if \allregs == 1 lghi %r14,0 # save condition code ipm %r14 # don't put any instructions sllg %r14,%r14,16 # clobbering CC before this point + .endif + lgr %r1,%r15 # allocate stack frame for ftrace_caller to contain traced function aghi %r15,-TRACED_FUNC_FRAME_SIZE @@ -49,13 +52,31 @@ ENTRY(ftrace_caller) # allocate pt_regs and stack frame for ftrace_trace_function aghi %r15,-STACK_FRAME_SIZE stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) + xc STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15) + + .if \allregs == 1 stg %r14,(STACK_PTREGS_PSW)(%r15) - lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address stosm (STACK_PTREGS_PSW)(%r15),0 + .endif + + lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address aghi %r1,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) stg %r0,(STACK_PTREGS_PSW+8)(%r15) stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) + .endm + +SYM_CODE_START(ftrace_regs_caller) + ftrace_regs_entry 1 + j ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + ftrace_regs_entry 0 + j ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op @@ -74,24 +95,31 @@ ENTRY(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. 
- .globl ftrace_graph_caller -ftrace_graph_caller: - j ftrace_graph_caller_end +SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) + j .Lftrace_graph_caller_end lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) lg %r4,(STACK_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) -ftrace_graph_caller_end: - .globl ftrace_graph_caller_end +.Lftrace_graph_caller_end: #endif - lg %r1,(STACK_PTREGS_PSW+8)(%r15) - lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) + lg %r0,(STACK_PTREGS_PSW+8)(%r15) +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + ltg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + locgrz %r1,%r0 +#else + lg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + ltgr %r1,%r1 + jnz 0f + lgr %r1,%r0 +#endif +0: lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 -ENDPROC(ftrace_caller) +SYM_CODE_END(ftrace_common) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(return_to_handler) +SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 aghi %r15,-STACK_FRAME_OVERHEAD @@ -101,6 +129,6 @@ ENTRY(return_to_handler) lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 -ENDPROC(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 250e4dbf653c..60e6fec27bba 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -38,7 +38,7 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); if (__test_facility(82, alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index b4b5c8c21166..52d4353188ad 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,7 +15,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) return sprintf(buf, "Mitigation: execute trampolines\n"); if (__test_facility(82, alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 4a99154fe651..6f431fa9e4d7 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -773,22 +773,46 @@ static int __init cpumf_pmu_init(void) * counter set via normal file operations. */ -static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. 
access */ +static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Access count */ static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ unsigned int sets; /* Counter set bit mask */ atomic_t cpus_ack; /* # CPUs successfully executed func */ }; -static struct cfset_request { /* CPUs and counter set bit mask */ +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +struct cfset_request { /* CPUs and counter set bit mask */ unsigned long ctrset; /* Bit mask of counter set to read */ cpumask_t mask; /* CPU mask to read from */ -} cfset_request; + struct list_head node; /* Chain to cfset_session.head */ +}; -static void cfset_ctrset_clear(void) +static void cfset_session_init(void) { - cpumask_clear(&cfset_request.mask); - cfset_request.ctrset = 0; + INIT_LIST_HEAD(&cfset_session.head); +} + +/* Remove current request from global bookkeeping. Maintain a counter set bit + * mask on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_del(struct cfset_request *p) +{ + list_del(&p->node); +} + +/* Add current request to global bookkeeping. Maintain a counter set bit mask + * on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_add(struct cfset_request *p) +{ + list_add(&p->node, &cfset_session.head); } /* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access @@ -827,15 +851,23 @@ static void cfset_ioctl_off(void *parm) struct cfset_call_on_cpu_parm *p = parm; int rc; - cpuhw->dev_state = 0; + /* Check if any counter set used by /dev/hwctr */ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) - if ((p->sets & cpumf_ctr_ctl[rc])) - atomic_dec(&cpuhw->ctr_set[rc]); - rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if ((p->sets & cpumf_ctr_ctl[rc])) { + if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { + ctr_set_disable(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + ctr_set_stop(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + } + } + /* Keep perf_event_open counter sets */ + rc = lcctl(cpuhw->dev_state | cpuhw->state); if (rc) pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", cpuhw->state, S390_HWCTR_DEVICE, rc); - cpuhw->flags &= ~PMU_F_IN_USE; + if (!cpuhw->dev_state) + cpuhw->flags &= ~PMU_F_IN_USE; debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", __func__, rc, cpuhw->state, cpuhw->dev_state); } @@ -870,11 +902,26 @@ static void cfset_release_cpu(void *p) debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", __func__, cpuhw->state, cpuhw->dev_state); + cpuhw->dev_state = 0; rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ if (rc) pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", cpuhw->state, S390_HWCTR_DEVICE, rc); - cpuhw->dev_state = 0; +} + +/* This modifies the process CPU mask to adapt it to the currently online + * CPUs. Offline CPUs cannot be addressed. This call terminates the access + * and is usually followed by close() or a new ioctl(..., START, ...) which + * creates a new request structure.
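+ * Called with cfset_ctrset_mutex held.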
+ */ +static void cfset_all_stop(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + }; + + cpumask_and(&req->mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); } /* Release function is also called when application gets terminated without @@ -882,10 +929,19 @@ static void cfset_release_cpu(void *p) */ static int cfset_release(struct inode *inode, struct file *file) { - on_each_cpu(cfset_release_cpu, NULL, 1); + mutex_lock(&cfset_ctrset_mutex); + /* Open followed by close/exit has no private_data */ + if (file->private_data) { + cfset_all_stop(file->private_data); + cfset_session_del(file->private_data); + kfree(file->private_data); + file->private_data = NULL; + } + if (!atomic_dec_return(&cfset_opencnt)) + on_each_cpu(cfset_release_cpu, NULL, 1); + mutex_unlock(&cfset_ctrset_mutex); + hw_perf_event_destroy(NULL); - cfset_ctrset_clear(); - atomic_set(&cfset_opencnt, 0); return 0; } @@ -893,9 +949,10 @@ static int cfset_open(struct inode *inode, struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* Only one user space program can open /dev/hwctr */ - if (atomic_xchg(&cfset_opencnt, 1)) - return -EBUSY; + mutex_lock(&cfset_ctrset_mutex); + if (atomic_inc_return(&cfset_opencnt) == 1) + cfset_session_init(); + mutex_unlock(&cfset_ctrset_mutex); cpumf_hw_inuse(); file->private_data = NULL; @@ -903,25 +960,10 @@ static int cfset_open(struct inode *inode, struct file *file) return nonseekable_open(inode, file); } -static int cfset_all_stop(void) +static int cfset_all_start(struct cfset_request *req) { struct cfset_call_on_cpu_parm p = { - .sets = cfset_request.ctrset, - }; - cpumask_var_t mask; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); - on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); - free_cpumask_var(mask); - return 0; -} - -static int cfset_all_start(void) -{ - struct cfset_call_on_cpu_parm p = { - .sets = cfset_request.ctrset, + .sets = req->ctrset, .cpus_ack = ATOMIC_INIT(0), }; cpumask_var_t mask; @@ -929,7 +971,7 @@ static int cfset_all_start(void) if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + cpumask_and(mask, &req->mask, cpu_online_mask); on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); @@ -1045,7 +1087,7 @@ static void cfset_cpu_read(void *parm) cpuhw->sets, cpuhw->used); } -static int cfset_all_read(unsigned long arg) +static int cfset_all_read(unsigned long arg, struct cfset_request *req) { struct cfset_call_on_cpu_parm p; cpumask_var_t mask; @@ -1054,46 +1096,53 @@ static int cfset_all_read(unsigned long arg) if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - p.sets = cfset_request.ctrset; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + p.sets = req->ctrset; + cpumask_and(mask, &req->mask, cpu_online_mask); on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); rc = cfset_all_copy(arg, mask); free_cpumask_var(mask); return rc; } -static long cfset_ioctl_read(unsigned long arg) +static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) { struct s390_ctrset_read read; - int ret = 0; + int ret = -ENODATA; - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; - ret = cfset_all_read(arg); - return ret; -} - -static long cfset_ioctl_stop(void) -{ - int ret = ENXIO; - - if 
(cfset_request.ctrset) { - ret = cfset_all_stop(); - cfset_ctrset_clear(); + if (req && req->ctrset) { + if (copy_from_user(&read, (char __user *)arg, sizeof(read))) + return -EFAULT; + ret = cfset_all_read(arg, req); } return ret; } -static long cfset_ioctl_start(unsigned long arg) +static long cfset_ioctl_stop(struct file *file) +{ + struct cfset_request *req = file->private_data; + int ret = -ENXIO; + + if (req) { + cfset_all_stop(req); + cfset_session_del(req); + kfree(req); + file->private_data = NULL; + ret = 0; + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg, struct file *file) { struct s390_ctrset_start __user *ustart; struct s390_ctrset_start start; + struct cfset_request *preq; void __user *umask; unsigned int len; int ret = 0; size_t need; - if (cfset_request.ctrset) + if (file->private_data) return -EBUSY; ustart = (struct s390_ctrset_start __user *)arg; if (copy_from_user(&start, ustart, sizeof(start))) @@ -1108,25 +1157,36 @@ static long cfset_ioctl_start(unsigned long arg) return -EINVAL; /* Invalid counter set */ if (!start.counter_sets) return -EINVAL; /* No counter set at all? */ - cpumask_clear(&cfset_request.mask); + + preq = kzalloc(sizeof(*preq), GFP_KERNEL); + if (!preq) + return -ENOMEM; + cpumask_clear(&preq->mask); len = min_t(u64, start.cpumask_len, cpumask_size()); umask = (void __user *)start.cpumask; - if (copy_from_user(&cfset_request.mask, umask, len)) + if (copy_from_user(&preq->mask, umask, len)) { + kfree(preq); return -EFAULT; - if (cpumask_empty(&cfset_request.mask)) + } + if (cpumask_empty(&preq->mask)) { + kfree(preq); return -EINVAL; + } need = cfset_needspace(start.counter_sets); - if (put_user(need, &ustart->data_bytes)) - ret = -EFAULT; - if (ret) - goto out; - cfset_request.ctrset = start.counter_sets; - ret = cfset_all_start(); -out: - if (ret) - cfset_ctrset_clear(); - debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n", - __func__, cfset_request.ctrset, need, ret); + if (put_user(need, &ustart->data_bytes)) { + kfree(preq); + return -EFAULT; + } + preq->ctrset = start.counter_sets; + ret = cfset_all_start(preq); + if (!ret) { + cfset_session_add(preq); + file->private_data = preq; + debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n", + __func__, preq->ctrset, need, ret); + } else { + kfree(preq); + } return ret; } @@ -1136,7 +1196,7 @@ out: * counter set keeps running until explicitly stopped. Returns the number * of bytes needed to store the counter values. If another S390_HWCTR_START * ioctl subcommand is called without a previous S390_HWCTR_STOP stop - * command, -EBUSY is returned. + * command on the same file descriptor, -EBUSY is returned. * S390_HWCTR_READ: Read the counter set values from specified CPU list given * with the S390_HWCTR_START command. * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the @@ -1150,13 +1210,13 @@ static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) mutex_lock(&cfset_ctrset_mutex); switch (cmd) { case S390_HWCTR_START: - ret = cfset_ioctl_start(arg); + ret = cfset_ioctl_start(arg, file); break; case S390_HWCTR_STOP: - ret = cfset_ioctl_stop(); + ret = cfset_ioctl_stop(file); break; case S390_HWCTR_READ: - ret = cfset_ioctl_read(arg); + ret = cfset_ioctl_read(arg, file->private_data); break; default: ret = -ENOTTY; @@ -1182,29 +1242,41 @@ static struct miscdevice cfset_dev = { .fops = &cfset_fops, }; +/* Hotplug add of a CPU. 
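+ * Runs with cfset_ctrset_mutex taken to serialize against the + * START/STOP ioctls and file release.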
Scan through all active processes and add + * that CPU to the list of CPUs supplied with ioctl(..., START, ...). + */ int cfset_online_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; mutex_lock(&cfset_ctrset_mutex); - if (cfset_request.ctrset) { - p.sets = cfset_request.ctrset; - cfset_ioctl_on(&p); - cpumask_set_cpu(cpu, &cfset_request.mask); + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &rp->mask); + } } mutex_unlock(&cfset_ctrset_mutex); return 0; } +/* Hotplug remove of a CPU. Scan through all active processes and clear + * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + */ int cfset_offline_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; mutex_lock(&cfset_ctrset_mutex); - if (cfset_request.ctrset) { - p.sets = cfset_request.ctrset; - cfset_ioctl_off(&p); - cpumask_clear_cpu(cpu, &cfset_request.mask); + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &rp->mask); + } } mutex_unlock(&cfset_ctrset_mutex); return 0; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e5dd46b1bff8..e8858b2de24b 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -141,7 +141,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.gprs[10] = arg; frame->childregs.gprs[11] = (unsigned long)do_exit; frame->childregs.orig_gpr2 = -1; - + frame->childregs.last_break = 1; return 0; } frame->childregs = *current_pt_regs(); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 0d58c1f07176..a6f5f1cae2fe 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -96,10 +96,10 @@ EXPORT_SYMBOL(console_irq); * relocated above 2 GB, because it has to use 31 bit addresses. * Such code and data is part of the .amode31 section. */ -unsigned long __amode31_ref __samode31 = __pa(&_samode31); -unsigned long __amode31_ref __eamode31 = __pa(&_eamode31); -unsigned long __amode31_ref __stext_amode31 = __pa(&_stext_amode31); -unsigned long __amode31_ref __etext_amode31 = __pa(&_etext_amode31); +unsigned long __amode31_ref __samode31 = (unsigned long)&_samode31; +unsigned long __amode31_ref __eamode31 = (unsigned long)&_eamode31; +unsigned long __amode31_ref __stext_amode31 = (unsigned long)&_stext_amode31; +unsigned long __amode31_ref __etext_amode31 = (unsigned long)&_etext_amode31; struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; @@ -150,6 +150,7 @@ struct mem_detect_info __bootdata(mem_detect); struct initrd_data __bootdata(initrd_data); unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata(__amode31_base); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); u64 __bootdata_preserved(stfle_fac_list[16]); @@ -174,6 +175,8 @@ unsigned long MODULES_END; struct lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); +DEFINE_STATIC_KEY_FALSE(cpu_has_bear); + /* * The Write Back bit position in the physaddr is given by the SLPC PCI. * Leaving the mask zero always uses write through which is safe @@ -594,7 +597,8 @@ static void __init setup_resources(void) * part of the System RAM resource. 
*/ if (crashk_res.end) { - memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), + 0, MEMBLOCK_NONE); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } @@ -694,7 +698,7 @@ static void __init reserve_crashkernel(void) } if (register_memory_notifier(&kdump_mem_nb)) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } @@ -719,7 +723,7 @@ static void __init reserve_initrd(void) #ifdef CONFIG_BLK_DEV_INITRD if (!initrd_data.start || !initrd_data.size) return; - initrd_start = initrd_data.start; + initrd_start = (unsigned long)__va(initrd_data.start); initrd_end = initrd_start + initrd_data.size; memblock_reserve(initrd_data.start, initrd_data.size); #endif @@ -749,7 +753,7 @@ static void __init free_mem_detect_info(void) get_mem_detect_reserved(&start, &size); if (size) - memblock_free(start, size); + memblock_phys_free(start, size); } static const char * __init get_mem_info_source(void) @@ -794,7 +798,7 @@ static void __init check_initrd(void) if (initrd_data.start && initrd_data.size && !memblock_is_region_memory(initrd_data.start, initrd_data.size)) { pr_err("The initial RAM disk does not fit into the memory\n"); - memblock_free(initrd_data.start, initrd_data.size); + memblock_phys_free(initrd_data.start, initrd_data.size); initrd_start = initrd_end = 0; } #endif @@ -805,12 +809,10 @@ static void __init check_initrd(void) */ static void __init reserve_kernel(void) { - unsigned long start_pfn = PFN_UP(__pa(_end)); - memblock_reserve(0, STARTUP_NORMAL_OFFSET); - memblock_reserve((unsigned long)sclp_early_sccb, EXT_SCCB_READ_SCP); - memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - - (unsigned long)_stext); + memblock_reserve(__amode31_base, __eamode31 - __samode31); + memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); + memblock_reserve(__pa(_stext), _end - _stext); } static void __init setup_memory(void) @@ -832,20 +834,14 @@ static void __init setup_memory(void) static void __init relocate_amode31_section(void) { - unsigned long amode31_addr, amode31_size; - long amode31_offset; + unsigned long amode31_size = __eamode31 - __samode31; + long amode31_offset = __amode31_base - __samode31; long *ptr; - /* Allocate a new AMODE31 capable memory region */ - amode31_size = __eamode31 - __samode31; pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); - amode31_addr = (unsigned long)memblock_alloc_low(amode31_size, PAGE_SIZE); - if (!amode31_addr) - panic("Failed to allocate memory for AMODE31 section\n"); - amode31_offset = amode31_addr - __samode31; /* Move original AMODE31 section to the new one */ - memmove((void *)amode31_addr, (void *)__samode31, amode31_size); + memmove((void *)__amode31_base, (void *)__samode31, amode31_size); /* Zero out the old AMODE31 section to catch invalid accesses within it */ memset((void *)__samode31, 0, amode31_size); @@ -884,14 +880,12 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE, - PAGE_SIZE); + vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); if (!vmms) panic("Failed to allocate memory for sysinfo structure\n"); - if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); - memblock_free((unsigned long) vmms, PAGE_SIZE); + memblock_free(vmms, PAGE_SIZE); } /* @@ -1051,6 +1045,9 @@ void __init 
setup_arch(char **cmdline_p) smp_detect_cpus(); topology_init_early(); + if (test_facility(193)) + static_branch_enable(&cpu_has_bear); + /* * Create kernel page tables and switch to virtual addressing. */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1a04e5bdf655..78a8ea6fd582 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void) /* Get the CPU registers */ smp_save_cpu_regs(sa, addr, is_boot_cpu, page); } - memblock_free(page, PAGE_SIZE); + memblock_phys_free(page, PAGE_SIZE); diag_amode31_ops.diag308_reset(); pcpu_set_smt(0); } @@ -880,7 +880,7 @@ void __init smp_detect_cpus(void) /* Add CPUs present at boot */ __smp_rescan_cpus(info, true); - memblock_free_early((unsigned long)info, sizeof(*info)); + memblock_phys_free((unsigned long)info, sizeof(*info)); } /* diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 8fe2d23b64f4..dc2355c623d6 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -154,6 +154,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) regs->psw = S390_lowcore.svc_old_psw; regs->int_code = S390_lowcore.svc_int_code; update_timer_sys(); + if (static_branch_likely(&cpu_has_bear)) + current->thread.last_break = regs->last_break; local_irq_enable(); regs->orig_gpr2 = regs->gprs[2]; diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index bcefc2173de4..6c6f7dcce1a5 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -300,7 +300,6 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { - unsigned long last_break = S390_lowcore.breaking_event_addr; unsigned int trapnr; irqentry_state_t state; @@ -311,10 +310,11 @@ void noinstr __do_pgm_check(struct pt_regs *regs) if (user_mode(regs)) { update_timer_sys(); - if (last_break < 4096) - last_break = 1; - current->thread.last_break = last_break; - regs->args[0] = last_break; + if (!static_branch_likely(&cpu_has_bear)) { + if (regs->last_break < 4096) + regs->last_break = 1; + } + current->thread.last_break = regs->last_break; } if (S390_lowcore.pgm_code & 0x0200) { diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 8b0e62507d62..386d4e42b8d3 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -64,7 +64,7 @@ void __init setup_uv(void) } if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) { - memblock_free(uv_stor_base, uv_info.uv_base_stor_len); + memblock_phys_free(uv_stor_base, uv_info.uv_base_stor_len); goto fail; } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 63bdb9e1bfc1..42c43521878f 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -212,6 +212,7 @@ SECTIONS QUAD(__dynsym_start) /* dynsym_start */ QUAD(__rela_dyn_start) /* rela_dyn_start */ QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(_eamode31 - _samode31) /* amode31_size */ } :NONE /* Debugging sections. 
*/ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2245f4b8d362..c3bd993fdd0c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -960,7 +960,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) /* bit 1+2 of the target are the ilc, so we can directly use ilen */ rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, - (u64 *) __LC_LAST_BREAK); + (u64 *) __LC_PGM_LAST_BREAK); rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_INT_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 678333936f78..707cd4622c13 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -7,6 +7,8 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o obj-y += mem.o xor.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o +obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o +test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o # Instrumenting memory accesses to __user data (in different address space) # produce false positives diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 9b2dab5a69f9..692dc84cd19c 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -26,7 +26,7 @@ static int __init spin_retry_init(void) } early_initcall(spin_retry_init); -/** +/* * spin_retry= parameter */ static int __init spin_retry_setup(char *str) diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index 47080560e0d8..7d8741818239 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -100,32 +100,6 @@ char *strcpy(char *dest, const char *src) EXPORT_SYMBOL(strcpy); #endif -/** - * strlcpy - Copy a %NUL terminated string into a sized buffer - * @dest: Where to copy the string to - * @src: Where to copy the string from - * @size: size of destination buffer - * - * Compatible with *BSD: the result is always a valid - * NUL-terminated string that fits in the buffer (unless, - * of course, the buffer size is zero). It does not pad - * out the result like strncpy() does. - */ -#ifdef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *dest, const char *src, size_t size) -{ - size_t ret = __strend(src) - src; - - if (size) { - size_t len = (ret >= size) ? 
size-1 : ret; - dest[len] = '\0'; - memcpy(dest, src, len); - } - return ret; -} -EXPORT_SYMBOL(strlcpy); -#endif - /** * strncpy - Copy a length-limited, %NUL-terminated string * @dest: Where to copy the string to @@ -254,25 +228,6 @@ int strcmp(const char *s1, const char *s2) EXPORT_SYMBOL(strcmp); #endif -/** - * strrchr - Find the last occurrence of a character in a string - * @s: The string to be searched - * @c: The character to search for - */ -#ifdef __HAVE_ARCH_STRRCHR -char *strrchr(const char *s, int c) -{ - ssize_t len = __strend(s) - s; - - do { - if (s[len] == (char)c) - return (char *)s + len; - } while (--len >= 0); - return NULL; -} -EXPORT_SYMBOL(strrchr); -#endif - static inline int clcle(const char *s1, unsigned long l1, const char *s2, unsigned long l2) { diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c new file mode 100644 index 000000000000..9e62d62812e5 --- /dev/null +++ b/arch/s390/lib/test_kprobes.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include "test_kprobes.h" + +static struct kprobe kp; + +static void setup_kprobe(struct kunit *test, struct kprobe *kp, + const char *symbol, int offset) +{ + kp->offset = offset; + kp->addr = NULL; + kp->symbol_name = symbol; +} + +static void test_kprobe_offset(struct kunit *test, struct kprobe *kp, + const char *target, int offset) +{ + int ret; + + setup_kprobe(test, kp, target, 0); + ret = register_kprobe(kp); + if (!ret) + unregister_kprobe(kp); + KUNIT_EXPECT_EQ(test, 0, ret); + setup_kprobe(test, kp, target, offset); + ret = register_kprobe(kp); + KUNIT_EXPECT_EQ(test, -EINVAL, ret); + if (!ret) + unregister_kprobe(kp); +} + +static void test_kprobe_odd(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_odd", + kprobes_target_odd_offs); +} + +static void test_kprobe_in_insn4(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn4", + kprobes_target_in_insn4_offs); +} + +static void test_kprobe_in_insn6_lo(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo", + kprobes_target_in_insn6_lo_offs); +} + +static void test_kprobe_in_insn6_hi(struct kunit *test) +{ + test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi", + kprobes_target_in_insn6_hi_offs); +} + +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe_odd), + KUNIT_CASE(test_kprobe_in_insn4), + KUNIT_CASE(test_kprobe_in_insn6_lo), + KUNIT_CASE(test_kprobe_in_insn6_hi), + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test_s390", + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h new file mode 100644 index 000000000000..2b4c9bc337f1 --- /dev/null +++ b/arch/s390/lib/test_kprobes.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef TEST_KPROBES_H +#define TEST_KPROBES_H + +extern unsigned long kprobes_target_odd_offs; +extern unsigned long kprobes_target_in_insn4_offs; +extern unsigned long kprobes_target_in_insn6_lo_offs; +extern unsigned long kprobes_target_in_insn6_hi_offs; + +#endif diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S new file mode 100644 index 000000000000..ade7a3042334 --- /dev/null +++ b/arch/s390/lib/test_kprobes_asm.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#include +#include + +#define KPROBES_TARGET_START(name) \ + SYM_FUNC_START(name); \ 
+ FTRACE_GEN_NOP_ASM(name) + +#define KPROBES_TARGET_END(name) \ + SYM_FUNC_END(name); \ + SYM_DATA(name##_offs, .quad 1b - name) + +KPROBES_TARGET_START(kprobes_target_in_insn4) + .word 0x4700 // bc 0,0 +1: .word 0x0000 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn4) + +KPROBES_TARGET_START(kprobes_target_in_insn6_lo) + .word 0xe310 // ly 1,0 +1: .word 0x0000 + .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_lo) + +KPROBES_TARGET_START(kprobes_target_in_insn6_hi) + .word 0xe310 // ly 1,0 + .word 0x0000 +1: .word 0x0058 + br %r14 +KPROBES_TARGET_END(kprobes_target_in_insn6_hi) + +KPROBES_TARGET_START(kprobes_target_bp) + nop + .word 0x0000 + nop +1: br %r14 +KPROBES_TARGET_END(kprobes_target_bp) + +KPROBES_TARGET_START(kprobes_target_odd) + .byte 0x07 +1: .byte 0x07 + br %r14 +KPROBES_TARGET_END(kprobes_target_odd) diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index ecf327d743a0..cfc5f5557c06 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -3,7 +3,7 @@ * Test module for unwind_for_each_frame */ -#define pr_fmt(fmt) "test_unwind: " fmt +#include #include #include #include @@ -16,6 +16,8 @@ #include #include +struct kunit *current_test; + #define BT_BUF_SIZE (PAGE_SIZE * 4) /* @@ -29,7 +31,7 @@ static void print_backtrace(char *bt) p = strsep(&bt, "\n"); if (!p) break; - pr_err("%s\n", p); + kunit_err(current_test, "%s\n", p); } } @@ -49,7 +51,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC); if (!bt) { - pr_err("failed to allocate backtrace buffer\n"); + kunit_err(current_test, "failed to allocate backtrace buffer\n"); return -ENOMEM; } /* Unwind. */ @@ -63,7 +65,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, if (frame_count++ == max_frames) break; if (state.reliable && !addr) { - pr_err("unwind state reliable but addr is 0\n"); + kunit_err(current_test, "unwind state reliable but addr is 0\n"); ret = -EINVAL; break; } @@ -75,7 +77,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, stack_type_name(state.stack_info.type), (void *)state.sp, (void *)state.ip); if (bt_pos >= BT_BUF_SIZE) - pr_err("backtrace buffer is too small\n"); + kunit_err(current_test, "backtrace buffer is too small\n"); } frame_count += 1; if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1")) @@ -85,15 +87,15 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs, /* Check the results. */ if (unwind_error(&state)) { - pr_err("unwind error\n"); + kunit_err(current_test, "unwind error\n"); ret = -EINVAL; } if (!seen_func2_func1) { - pr_err("unwindme_func2 and unwindme_func1 not found\n"); + kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n"); ret = -EINVAL; } if (frame_count == max_frames) { - pr_err("Maximum number of frames exceeded\n"); + kunit_err(current_test, "Maximum number of frames exceeded\n"); ret = -EINVAL; } if (ret) @@ -166,7 +168,7 @@ static noinline int unwindme_func4(struct unwindme *u) kp.pre_handler = pgm_pre_handler; ret = register_kprobe(&kp); if (ret < 0) { - pr_err("register_kprobe failed %d\n", ret); + kunit_err(current_test, "register_kprobe failed %d\n", ret); return -EINVAL; } @@ -252,7 +254,7 @@ static int test_unwind_irq(struct unwindme *u) } /* Spawns a task and passes it to test_unwind(). 
*/ -static int test_unwind_task(struct unwindme *u) +static int test_unwind_task(struct kunit *test, struct unwindme *u) { struct task_struct *task; int ret; @@ -267,7 +269,7 @@ static int test_unwind_task(struct unwindme *u) */ task = kthread_run(unwindme_func1, u, "%s", __func__); if (IS_ERR(task)) { - pr_err("kthread_run() failed\n"); + kunit_err(test, "kthread_run() failed\n"); return PTR_ERR(task); } /* @@ -282,77 +284,98 @@ static int test_unwind_task(struct unwindme *u) return ret; } -static int test_unwind_flags(int flags) +struct test_params { + int flags; + char *name; +}; + +/* + * Create required parameter list for tests + */ +static const struct test_params param_list[] = { + {.flags = UWM_DEFAULT, .name = "UWM_DEFAULT"}, + {.flags = UWM_SP, .name = "UWM_SP"}, + {.flags = UWM_REGS, .name = "UWM_REGS"}, + {.flags = UWM_SWITCH_STACK, + .name = "UWM_SWITCH_STACK"}, + {.flags = UWM_SP | UWM_REGS, + .name = "UWM_SP | UWM_REGS"}, + {.flags = UWM_CALLER | UWM_SP, + .name = "UWM_CALLER | UWM_SP"}, + {.flags = UWM_CALLER | UWM_SP | UWM_REGS, + .name = "UWM_CALLER | UWM_SP | UWM_REGS"}, + {.flags = UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK, + .name = "UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK"}, + {.flags = UWM_THREAD, .name = "UWM_THREAD"}, + {.flags = UWM_THREAD | UWM_SP, + .name = "UWM_THREAD | UWM_SP"}, + {.flags = UWM_THREAD | UWM_CALLER | UWM_SP, + .name = "UWM_THREAD | UWM_CALLER | UWM_SP"}, + {.flags = UWM_IRQ, .name = "UWM_IRQ"}, + {.flags = UWM_IRQ | UWM_SWITCH_STACK, + .name = "UWM_IRQ | UWM_SWITCH_STACK"}, + {.flags = UWM_IRQ | UWM_SP, + .name = "UWM_IRQ | UWM_SP"}, + {.flags = UWM_IRQ | UWM_REGS, + .name = "UWM_IRQ | UWM_REGS"}, + {.flags = UWM_IRQ | UWM_SP | UWM_REGS, + .name = "UWM_IRQ | UWM_SP | UWM_REGS"}, + {.flags = UWM_IRQ | UWM_CALLER | UWM_SP, + .name = "UWM_IRQ | UWM_CALLER | UWM_SP"}, + {.flags = UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS, + .name = "UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS"}, + {.flags = UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK, + .name = "UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK"}, #ifdef CONFIG_KPROBES + {.flags = UWM_PGM, .name = "UWM_PGM"}, + {.flags = UWM_PGM | UWM_SP, + .name = "UWM_PGM | UWM_SP"}, + {.flags = UWM_PGM | UWM_REGS, + .name = "UWM_PGM | UWM_REGS"}, + {.flags = UWM_PGM | UWM_SP | UWM_REGS, + .name = "UWM_PGM | UWM_SP | UWM_REGS"}, + #endif +}; + +/* + * Parameter description generator: required for KUNIT_ARRAY_PARAM() + */ +static void get_desc(const struct test_params *params, char *desc) +{ + strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE); +} + +/* + * Create test_unwind_gen_params + */ +KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc); + +static void test_unwind_flags(struct kunit *test) { struct unwindme u; + const struct test_params *params; - u.flags = flags; + current_test = test; + params = (const struct test_params *)test->param_value; + u.flags = params->flags; if (u.flags & UWM_THREAD) - return test_unwind_task(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_task(test, &u)); else if (u.flags & UWM_IRQ) - return test_unwind_irq(&u); + KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u)); else - return unwindme_func1(&u); + KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u)); } -static int test_unwind_init(void) -{ - int failed = 0; - int total = 0; +static struct kunit_case unwind_test_cases[] = { + KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params), + {} +}; -#define TEST(flags) \ -do { \ - pr_info("[ RUN ] " #flags "\n"); \ - total++; \ - if
(!test_unwind_flags((flags))) { \ - pr_info("[ OK ] " #flags "\n"); \ - } else { \ - pr_err("[ FAILED ] " #flags "\n"); \ - failed++; \ - } \ -} while (0) +static struct kunit_suite test_unwind_suite = { + .name = "test_unwind", + .test_cases = unwind_test_cases, +}; - pr_info("running stack unwinder tests"); - TEST(UWM_DEFAULT); - TEST(UWM_SP); - TEST(UWM_REGS); - TEST(UWM_SWITCH_STACK); - TEST(UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP); - TEST(UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); - TEST(UWM_THREAD); - TEST(UWM_THREAD | UWM_SP); - TEST(UWM_THREAD | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ); - TEST(UWM_IRQ | UWM_SWITCH_STACK); - TEST(UWM_IRQ | UWM_SP); - TEST(UWM_IRQ | UWM_REGS); - TEST(UWM_IRQ | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS); - TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK); -#ifdef CONFIG_KPROBES - TEST(UWM_PGM); - TEST(UWM_PGM | UWM_SP); - TEST(UWM_PGM | UWM_REGS); - TEST(UWM_PGM | UWM_SP | UWM_REGS); -#endif -#undef TEST - if (failed) { - pr_err("%d of %d stack unwinder tests failed", failed, total); - WARN(1, "%d of %d stack unwinder tests failed", failed, total); - } else { - pr_info("all %d stack unwinder tests passed", total); - } +kunit_test_suites(&test_unwind_suite); - return failed ? -EINVAL : 0; -} - -static void test_unwind_exit(void) -{ -} - -module_init(test_unwind_init); -module_exit(test_unwind_exit); MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 1141c8d5c0d0..2203164b39da 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -394,13 +394,10 @@ static int __init cmm_init(void) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 0b0c8c284953..9f9af5298dd6 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -8,6 +8,7 @@ #include #include #include +#include #include static unsigned long max_addr; @@ -116,8 +117,13 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) return; if (st->current_prot & _PAGE_NOEXEC) return; - /* The first lowcore page is currently still W+X. */ - if (addr == PAGE_SIZE) + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. + */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) return; WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); @@ -203,7 +209,9 @@ void ptdump_check_wx(void) if (st.wx_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); else - pr_info("Checked W+X mappings: passed, no unexpected W+X pages found\n"); + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? 
+ "unexpected " : ""); } #endif /* CONFIG_DEBUG_WX */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a04faf49001a..8c6f258a6183 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -58,8 +58,6 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -bool initmem_freed; - static void __init setup_zero_pages(void) { unsigned int order; @@ -214,7 +212,6 @@ void __init mem_init(void) void free_initmem(void) { - initmem_freed = true; __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 3e4735168019..483b9dbe0970 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -399,5 +399,5 @@ void __init kasan_copy_shadow_mapping(void) void __init kasan_free_early_identity(void) { - memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); + memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index fdc86c0e4e6c..654019181a37 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -57,7 +57,7 @@ void arch_report_meminfo(struct seq_file *m) static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long dtt) { - unsigned long table, mask; + unsigned long *table, mask; mask = 0; if (MACHINE_HAS_EDAT2) { @@ -72,7 +72,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); break; } - table = (unsigned long)old & mask; + table = (unsigned long *)((unsigned long)old & mask); crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); } else if (MACHINE_HAS_IDTE) { cspg(old, *old, new); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 2b1c6d916cf9..7d9705eeb02f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -584,8 +585,13 @@ void __init vmem_map_init(void) __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - /* we need lowcore executable for our LPSWE instructions */ - set_memory_x(0, 1); + if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) { + /* + * Lowcore must be executable for LPSWE + * and expoline trampoline branch instructions. 
+ */ + set_memory_x(0, 1); + } pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 1a374d021e25..233cc9bcd652 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -567,7 +567,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { + if (nospec_uses_trampoline()) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ if (test_facility(35)) { @@ -585,7 +585,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) /* br %r14 */ _EMIT2(0x07fe); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && + if (nospec_uses_trampoline() && (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) { jit->r1_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r1 thunk */ @@ -1332,7 +1332,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_FUNC; /* lgrl %w1,func */ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { + if (nospec_uses_trampoline()) { /* brasl %r14,__s390_indirect_jump_r1 */ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); } else { diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index b833155ce838..872d772b73d2 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -561,7 +561,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) zdev->has_resources = 0; } -int pcibios_add_device(struct pci_dev *pdev) +int pcibios_device_add(struct pci_dev *pdev) { struct zpci_dev *zdev = to_zpci(pdev); struct resource *res; diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 93223bd110c3..1f4540d6bd2d 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -18,6 +18,8 @@ static struct kmem_cache *dma_region_table_cache; static struct kmem_cache *dma_page_table_cache; static int s390_iommu_strict; +static u64 s390_iommu_aperture; +static u32 s390_iommu_aperture_factor = 1; static int zpci_refresh_global(struct zpci_dev *zdev) { @@ -565,15 +567,19 @@ int zpci_dma_init_device(struct zpci_dev *zdev) /* * Restrict the iommu bitmap size to the minimum of the following: - * - main memory size + * - s390_iommu_aperture which defaults to high_memory * - 3-level pagetable address limit minus start_dma offset * - DMA address range allowed by the hardware (clp query pci fn) * * Also set zdev->end_dma to the actual end address of the usable * range, instead of the theoretical maximum as reported by hardware. + * + * This limits the number of concurrently usable DMA mappings, since + * each DMA-mapped memory address needs its own DMA address, plus extra + * DMA addresses for any additional mappings of the same memory address.
*/ zdev->start_dma = PAGE_ALIGN(zdev->start_dma); - zdev->iommu_size = min3((u64) high_memory, + zdev->iommu_size = min3(s390_iommu_aperture, ZPCI_TABLE_SIZE_RT - zdev->start_dma, zdev->end_dma - zdev->start_dma + 1); zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1; @@ -660,6 +666,12 @@ static int __init dma_alloc_cpu_table_caches(void) int __init zpci_dma_init(void) { + s390_iommu_aperture = (u64)high_memory; + if (!s390_iommu_aperture_factor) + s390_iommu_aperture = ULONG_MAX; + else + s390_iommu_aperture *= s390_iommu_aperture_factor; + return dma_alloc_cpu_table_caches(); } @@ -692,3 +704,12 @@ static int __init s390_iommu_setup(char *str) } __setup("s390_iommu=", s390_iommu_setup); + +static int __init s390_iommu_aperture_setup(char *str) +{ + if (kstrtou32(str, 10, &s390_iommu_aperture_factor)) + s390_iommu_aperture_factor = 1; + return 1; +} + +__setup("s390_iommu_aperture=", s390_iommu_aperture_setup); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 5b8d647523f9..6a5bfa9dc1f2 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -52,6 +52,8 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; + zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("error CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -96,6 +98,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); enum zpci_state state; + zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n", + ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 335c281811c7..cae280e5c047 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -90,6 +90,14 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, if (zdev_enabled(zdev)) { ret = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error + * state, the FH may indicate an enabled device while + * disable says the device is already disabled; don't + * treat this as an error here.
+ */ + if (ret == -EINVAL) + ret = 0; if (ret) goto out; } diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index bac8a058ebd7..c77b5f00a66a 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -560,7 +560,7 @@ static void __init ap325rxa_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu_dma_membase = phys; diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 97c5703b1818..4c9522dd351f 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -1502,7 +1502,7 @@ static void __init ecovec_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU0 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; @@ -1510,7 +1510,7 @@ static void __init ecovec_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU1 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu1_dma_membase = phys; } diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index eeb5ce341efd..20f4db778ed6 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -633,7 +633,7 @@ static void __init kfr2r09_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu_dma_membase = phys; diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index 6703a2122c0d..f60061283c48 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -633,7 +633,7 @@ static void __init migor_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu_dma_membase = phys; diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index d9b31d4560c0..b60a2626e18b 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -966,7 +966,7 @@ static void __init ms7724se_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU0 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; @@ -974,7 +974,7 @@ static void __init ms7724se_mv_mem_reserve(void) if (!phys) panic("Failed to allocate CEU1 memory\n"); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remove(phys, size); ceu1_dma_membase = phys; } diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 9c2b720bfd20..31b0c1983286 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -1010,7 +1010,7 @@ void pcibios_set_master(struct pci_dev *dev) } #ifdef CONFIG_PCI_IOV -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct pci_dev *pdev; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 0224d8f19ed6..b98a7bbe6728 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1567,7 +1567,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_free(__pa(ptr), size); + memblock_free(ptr, size); } 
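The sh and sparc hunks above, like the um, x86 and xen hunks below, all apply the same memblock interface split: the old memblock_free(), which took a physical address, is now memblock_phys_free(), while the pointer-based memblock_free_ptr() was renamed to memblock_free() and takes a virtual address. A minimal sketch of the resulting calling convention; the function name and the SZ_4K sizes are made up for illustration:

#include <linux/memblock.h>
#include <linux/sizes.h>

static void __init example_free_early_buffers(void)
{
	/* memblock_phys_alloc() returns a physical address... */
	phys_addr_t phys = memblock_phys_alloc(SZ_4K, SZ_4K);
	/* ...while memblock_alloc() returns a virtual pointer. */
	void *virt = memblock_alloc(SZ_4K, SZ_4K);

	if (phys)
		memblock_phys_free(phys, SZ_4K);	/* free by physical address */
	if (virt)
		memblock_free(virt, SZ_4K);		/* free by virtual pointer */
}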
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 8e636ce02949..0039771eb01c 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -47,7 +47,7 @@ void __init mem_init(void) */ brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); - memblock_free(__pa(brk_end), uml_reserved - brk_end); + memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; /* this will put all low memory onto the freelists */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b1d4b481fcdd..95dd1ee01546 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,7 +63,7 @@ config X86 select ARCH_CLOCKSOURCE_INIT select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION - select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) + select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE @@ -192,6 +192,8 @@ config X86 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_SAMPLE_FTRACE_DIRECT if X86_64 + select HAVE_SAMPLE_FTRACE_MULTI_DIRECT if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA @@ -1627,7 +1629,7 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_MEMORY_PROBE bool "Enable sysfs memory/probe interface" - depends on X86_64 && MEMORY_HOTPLUG + depends on MEMORY_HOTPLUG help This option enables a sysfs memory/probe interface for testing. See Documentation/admin-guide/mm/memory-hotplug.rst for more information. @@ -2423,7 +2425,7 @@ endmenu config ARCH_HAS_ADD_PAGES def_bool y - depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c72e368dd164..f1ba6ab2e97e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * PCI slot and func to indicate the uncore box. 
*/ if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; + struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver); pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); if (pmu == NULL) diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 9e1def3744f2..36e84d904260 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -80,7 +80,7 @@ static struct resource video_rom_resource = { */ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) { - struct pci_driver *drv = pdev->driver; + struct pci_driver *drv = to_pci_driver(pdev->dev.driver); const struct pci_device_id *id; if (pdev->vendor == vendor && pdev->device == device) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 544a885fa9a9..69468eebf8a0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -324,7 +324,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else @@ -523,7 +523,7 @@ static void __init reserve_crashkernel(void) } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5afd98559193..7b65275544b2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_ptr(ptr, size); + memblock_free(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 23a14d82e783..1895986842b9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_free(addr, PMD_SIZE); + memblock_phys_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd90b8fe81e4..5cd7ea6d645c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -779,37 +779,6 @@ void __init mem_init(void) test_wp_bit(); } -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_params *params) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - int ret; - - /* - * The page tables were already mapped at boot so if the caller - * requests a different mapping type then we must change all the - * pages with __set_memory_prot(). 
- */ - if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) { - ret = __set_memory_prot(start, nr_pages, params->pgprot); - if (ret) - return ret; - } - - return __add_pages(nid, start_pfn, nr_pages, params); -} - -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - __remove_pages(start_pfn, nr_pages, altmap); -} -#endif - int kernel_set_to_readonly __read_mostly; static void mark_nxdata_nx(void) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index ef885370719a..e7b9b464a82f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PMD_SIZE); + memblock_free(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PUD_SIZE); + memblock_free(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1e9b93b088db..c6b1213086d6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free_ptr(numa_distance, size); + memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index e801e30089c4..1a02b791d273 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - memblock_free_ptr(phys_dist, phys_size); + memblock_free(phys_dist, phys_size); return; no_emu: diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3507f456fcd0..9e1e6b8d8876 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -632,7 +632,7 @@ static void set_dev_domain_options(struct pci_dev *pdev) pdev->hotplug_user_indicators = 1; } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct pci_setup_rom *rom; struct irq_domain *msidom; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1ce436eeda15..cdbf4822f431 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1025,7 +1025,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) make_lowmem_page_readwrite(vaddr); - memblock_free(paddr, size); + memblock_phys_free(paddr, size); } static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) @@ -1151,7 +1151,7 @@ static void __init xen_pagetable_p2m_free(void) xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(addr), size); + memblock_free((void *)addr, size); } else { xen_cleanmfnmap(addr); } @@ -1956,7 +1956,7 @@ void __init xen_relocate_p2m(void) pfn_end = p2m_pfn_end; } - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); while (pfn < 
pfn_end) { if (pfn == p2m_pfn) { pfn = p2m_pfn_end; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5e6e236977c7..58db86f7b384 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free((unsigned long)p, PAGE_SIZE); + memblock_free(p, PAGE_SIZE); return; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8bfc10330107..f387fc7e5250 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn, break; } } - memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); + memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -719,7 +719,7 @@ static void __init xen_reserve_xen_mfnlist(void) return; xen_relocate_p2m(); - memblock_free(start, size); + memblock_phys_free(start, size); } /** @@ -885,7 +885,7 @@ char * __init xen_memory_setup(void) xen_phys_memcpy(new_area, start, size); pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", start, start + size, new_area, new_area + size); - memblock_free(start, size); + memblock_phys_free(start, size); boot_params.hdr.ramdisk_image = new_area; boot_params.ext_ramdisk_image = new_area >> 32; } diff --git a/arch/xtensa/boot/boot-elf/bootstrap.S b/arch/xtensa/boot/boot-elf/bootstrap.S index 99e98c9bae41..2dd28931d699 100644 --- a/arch/xtensa/boot/boot-elf/bootstrap.S +++ b/arch/xtensa/boot/boot-elf/bootstrap.S @@ -42,12 +42,14 @@ _bootparam: .align 4 _SetupMMU: +#if XCHAL_HAVE_WINDOWED movi a0, 0 wsr a0, windowbase rsync movi a0, 1 wsr a0, windowstart rsync +#endif movi a0, 0x1F wsr a0, ps rsync diff --git a/arch/xtensa/boot/boot-redboot/bootstrap.S b/arch/xtensa/boot/boot-redboot/bootstrap.S index 48ba5a232d94..3ed94ad35000 100644 --- a/arch/xtensa/boot/boot-redboot/bootstrap.S +++ b/arch/xtensa/boot/boot-redboot/bootstrap.S @@ -3,6 +3,7 @@ #include #include #include +#include /* * RB-Data: RedBoot data/bss * P: Boot-Parameters @@ -36,7 +37,7 @@ .globl __start /* this must be the first byte of the loader! */ __start: - entry sp, 32 # we do not intend to return + abi_entry(32) # we do not intend to return _call0 _start __start_a0: .align 4 @@ -55,17 +56,19 @@ _start: movi a4, 1 wsr a4, ps rsync - +#if XCHAL_HAVE_WINDOWED rsr a5, windowbase ssl a5 sll a4, a4 wsr a4, windowstart rsync - - movi a4, 0x00040000 +#endif + movi a4, KERNEL_PS_WOE_MASK wsr a4, ps rsync +KABI_C0 mov abi_saved0, abi_arg0 + /* copy the loader to its address * Note: The loader itself is a very small piece, so we assume we * don't partially overlap. 
We also assume (even more important) @@ -168,52 +171,52 @@ _reloc: movi a3, __image_load sub a4, a3, a4 - add a8, a0, a4 + add abi_arg2, a0, a4 # a1 Stack # a8(a4) Load address of the image - movi a6, _image_start - movi a10, _image_end - movi a7, 0x1000000 - sub a11, a10, a6 - movi a9, complen - s32i a11, a9, 0 + movi abi_arg0, _image_start + movi abi_arg4, _image_end + movi abi_arg1, 0x1000000 + sub abi_tmp0, abi_arg4, abi_arg0 + movi abi_arg3, complen + s32i abi_tmp0, abi_arg3, 0 movi a0, 0 - # a6 destination - # a7 maximum size of destination - # a8 source - # a9 ptr to length + # abi_arg0 destination + # abi_arg1 maximum size of destination + # abi_arg2 source + # abi_arg3 ptr to length .extern gunzip - movi a4, gunzip - beqz a4, 1f + movi abi_tmp0, gunzip + beqz abi_tmp0, 1f - callx4 a4 + abi_callx abi_tmp0 j 2f - # a6 destination start - # a7 maximum size of destination - # a8 source start - # a9 ptr to length - # a10 destination end + # abi_arg0 destination start + # abi_arg1 maximum size of destination + # abi_arg2 source start + # abi_arg3 ptr to length + # abi_arg4 destination end 1: - l32i a9, a8, 0 - l32i a11, a8, 4 - s32i a9, a6, 0 - s32i a11, a6, 4 - l32i a9, a8, 8 - l32i a11, a8, 12 - s32i a9, a6, 8 - s32i a11, a6, 12 - addi a6, a6, 16 - addi a8, a8, 16 - blt a6, a10, 1b + l32i abi_tmp0, abi_arg2, 0 + l32i abi_tmp1, abi_arg2, 4 + s32i abi_tmp0, abi_arg0, 0 + s32i abi_tmp1, abi_arg0, 4 + l32i abi_tmp0, abi_arg2, 8 + l32i abi_tmp1, abi_arg2, 12 + s32i abi_tmp0, abi_arg0, 8 + s32i abi_tmp1, abi_arg0, 12 + addi abi_arg0, abi_arg0, 16 + addi abi_arg2, abi_arg2, 16 + blt abi_arg0, abi_arg4, 1b /* jump to the kernel */ @@ -230,6 +233,7 @@ _reloc: # a2 Boot parameter list +KABI_C0 mov abi_arg0, abi_saved0 movi a0, _image_start jx a0 diff --git a/arch/xtensa/include/asm/asmmacro.h b/arch/xtensa/include/asm/asmmacro.h index bfc89e11f469..809c507d1825 100644 --- a/arch/xtensa/include/asm/asmmacro.h +++ b/arch/xtensa/include/asm/asmmacro.h @@ -194,6 +194,12 @@ #define XTENSA_STACK_ALIGNMENT 16 #if defined(__XTENSA_WINDOWED_ABI__) + +/* Assembly instructions for windowed kernel ABI. */ +#define KABI_W +/* Assembly instructions for call0 kernel ABI (will be ignored). */ +#define KABI_C0 # + #define XTENSA_FRAME_SIZE_RESERVE 16 #define XTENSA_SPILL_STACK_RESERVE 32 @@ -206,8 +212,34 @@ #define abi_ret(frame_size) retw #define abi_ret_default retw + /* direct call */ +#define abi_call call4 + /* indirect call */ +#define abi_callx callx4 + /* outgoing call argument registers */ +#define abi_arg0 a6 +#define abi_arg1 a7 +#define abi_arg2 a8 +#define abi_arg3 a9 +#define abi_arg4 a10 +#define abi_arg5 a11 + /* return value */ +#define abi_rv a6 + /* registers preserved across call */ +#define abi_saved0 a2 +#define abi_saved1 a3 + + /* none of the above */ +#define abi_tmp0 a4 +#define abi_tmp1 a5 + #elif defined(__XTENSA_CALL0_ABI__) +/* Assembly instructions for windowed kernel ABI (will be ignored). */ +#define KABI_W # +/* Assembly instructions for call0 kernel ABI. 
*/ +#define KABI_C0 + #define XTENSA_SPILL_STACK_RESERVE 0 #define abi_entry(frame_size) __abi_entry (frame_size) @@ -233,10 +265,43 @@ #define abi_ret_default ret + /* direct call */ +#define abi_call call0 + /* indirect call */ +#define abi_callx callx0 + /* outgoing call argument registers */ +#define abi_arg0 a2 +#define abi_arg1 a3 +#define abi_arg2 a4 +#define abi_arg3 a5 +#define abi_arg4 a6 +#define abi_arg5 a7 + /* return value */ +#define abi_rv a2 + /* registers preserved across call */ +#define abi_saved0 a12 +#define abi_saved1 a13 + + /* none of the above */ +#define abi_tmp0 a8 +#define abi_tmp1 a9 + #else #error Unsupported Xtensa ABI #endif +#if defined(USER_SUPPORT_WINDOWED) +/* Assembly instructions for windowed user ABI. */ +#define UABI_W +/* Assembly instructions for call0 user ABI (will be ignored). */ +#define UABI_C0 # +#else +/* Assembly instructions for windowed user ABI (will be ignored). */ +#define UABI_W # +/* Assembly instructions for call0 user ABI. */ +#define UABI_C0 +#endif + #define __XTENSA_HANDLER .section ".exception.text", "ax" #endif /* _XTENSA_ASMMACRO_H */ diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index 4361fe4247e3..52da614f953c 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -25,15 +25,15 @@ * * Locking interrupts looks like this: * - * rsil a15, TOPLEVEL + * rsil a14, TOPLEVEL * - * wsr a15, PS + * wsr a14, PS * rsync * - * Note that a15 is used here because the register allocation + * Note that a14 is used here because the register allocation * done by the compiler is not guaranteed and a window overflow * may not occur between the rsil and wsr instructions. By using - * a15 in the rsil, the machine is guaranteed to be in a state + * a14 in the rsil, the machine is guaranteed to be in a state * where no register reference will cause an overflow. 
*/ @@ -185,15 +185,15 @@ static inline void arch_atomic_##op(int i, atomic_t * v) \ unsigned int vval; \ \ __asm__ __volatile__( \ - " rsil a15, "__stringify(TOPLEVEL)"\n" \ + " rsil a14, "__stringify(TOPLEVEL)"\n" \ " l32i %[result], %[mem]\n" \ " " #op " %[result], %[result], %[i]\n" \ " s32i %[result], %[mem]\n" \ - " wsr a15, ps\n" \ + " wsr a14, ps\n" \ " rsync\n" \ : [result] "=&a" (vval), [mem] "+m" (*v) \ : [i] "a" (i) \ - : "a15", "memory" \ + : "a14", "memory" \ ); \ } \ @@ -203,15 +203,15 @@ static inline int arch_atomic_##op##_return(int i, atomic_t * v) \ unsigned int vval; \ \ __asm__ __volatile__( \ - " rsil a15,"__stringify(TOPLEVEL)"\n" \ + " rsil a14,"__stringify(TOPLEVEL)"\n" \ " l32i %[result], %[mem]\n" \ " " #op " %[result], %[result], %[i]\n" \ " s32i %[result], %[mem]\n" \ - " wsr a15, ps\n" \ + " wsr a14, ps\n" \ " rsync\n" \ : [result] "=&a" (vval), [mem] "+m" (*v) \ : [i] "a" (i) \ - : "a15", "memory" \ + : "a14", "memory" \ ); \ \ return vval; \ @@ -223,16 +223,16 @@ static inline int arch_atomic_fetch_##op(int i, atomic_t * v) \ unsigned int tmp, vval; \ \ __asm__ __volatile__( \ - " rsil a15,"__stringify(TOPLEVEL)"\n" \ + " rsil a14,"__stringify(TOPLEVEL)"\n" \ " l32i %[result], %[mem]\n" \ " " #op " %[tmp], %[result], %[i]\n" \ " s32i %[tmp], %[mem]\n" \ - " wsr a15, ps\n" \ + " wsr a14, ps\n" \ " rsync\n" \ : [result] "=&a" (vval), [tmp] "=&a" (tmp), \ [mem] "+m" (*v) \ : [i] "a" (i) \ - : "a15", "memory" \ + : "a14", "memory" \ ); \ \ return vval; \ diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h index 3699e2818efb..eb87810357ad 100644 --- a/arch/xtensa/include/asm/cmpxchg.h +++ b/arch/xtensa/include/asm/cmpxchg.h @@ -52,16 +52,16 @@ __cmpxchg_u32(volatile int *p, int old, int new) return new; #else __asm__ __volatile__( - " rsil a15, "__stringify(TOPLEVEL)"\n" + " rsil a14, "__stringify(TOPLEVEL)"\n" " l32i %[old], %[mem]\n" " bne %[old], %[cmp], 1f\n" " s32i %[new], %[mem]\n" "1:\n" - " wsr a15, ps\n" + " wsr a14, ps\n" " rsync\n" : [old] "=&a" (old), [mem] "+m" (*p) : [cmp] "a" (old), [new] "r" (new) - : "a15", "memory"); + : "a14", "memory"); return old; #endif } @@ -116,10 +116,10 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, /* * xchg_u32 * - * Note that a15 is used here because the register allocation + * Note that a14 is used here because the register allocation * done by the compiler is not guaranteed and a window overflow * may not occur between the rsil and wsr instructions. By using - * a15 in the rsil, the machine is guaranteed to be in a state + * a14 in the rsil, the machine is guaranteed to be in a state * where no register reference will cause an overflow. 
*/ @@ -157,14 +157,14 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val) #else unsigned long tmp; __asm__ __volatile__( - " rsil a15, "__stringify(TOPLEVEL)"\n" + " rsil a14, "__stringify(TOPLEVEL)"\n" " l32i %[tmp], %[mem]\n" " s32i %[val], %[mem]\n" - " wsr a15, ps\n" + " wsr a14, ps\n" " rsync\n" : [tmp] "=&a" (tmp), [mem] "+m" (*m) : [val] "a" (val) - : "a15", "memory"); + : "a14", "memory"); return tmp; #endif } diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h index 5590b0f68837..9138077e567d 100644 --- a/arch/xtensa/include/asm/core.h +++ b/arch/xtensa/include/asm/core.h @@ -26,4 +26,15 @@ #define XCHAL_SPANNING_WAY 0 #endif +#if XCHAL_HAVE_WINDOWED +#if defined(CONFIG_USER_ABI_DEFAULT) || defined(CONFIG_USER_ABI_CALL0_PROBE) +/* Whether windowed ABI is supported in userspace. */ +#define USER_SUPPORT_WINDOWED +#endif +#if defined(__XTENSA_WINDOWED_ABI__) || defined(USER_SUPPORT_WINDOWED) +/* Whether windowed ABI is supported either in userspace or in the kernel. */ +#define SUPPORT_WINDOWED +#endif +#endif + #endif diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index ad15fbc57283..37d3e9887fe7 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -18,12 +18,6 @@ #include #include -/* Assertions. */ - -#if (XCHAL_HAVE_WINDOWED != 1) -# error Linux requires the Xtensa Windowed Registers Option. -#endif - /* Xtensa ABI requires stack alignment to be at least 16 */ #define STACK_ALIGN (XCHAL_DATA_WIDTH > 16 ? XCHAL_DATA_WIDTH : 16) @@ -105,8 +99,18 @@ #define WSBITS (XCHAL_NUM_AREGS / 4) /* width of WINDOWSTART in bits */ #define WBBITS (XCHAL_NUM_AREGS_LOG2 - 2) /* width of WINDOWBASE in bits */ +#if defined(__XTENSA_WINDOWED_ABI__) +#define KERNEL_PS_WOE_MASK PS_WOE_MASK +#elif defined(__XTENSA_CALL0_ABI__) +#define KERNEL_PS_WOE_MASK 0 +#else +#error Unsupported xtensa ABI +#endif + #ifndef __ASSEMBLY__ +#if defined(__XTENSA_WINDOWED_ABI__) + /* Build a valid return address for the specified call winsize. * winsize must be 1 (call4), 2 (call8), or 3 (call12) */ @@ -117,6 +121,22 @@ */ #define MAKE_PC_FROM_RA(ra,sp) (((ra) & 0x3fffffff) | ((sp) & 0xc0000000)) +#elif defined(__XTENSA_CALL0_ABI__) + +/* Build a valid return address for the specified call winsize. + * winsize must be 1 (call4), 2 (call8), or 3 (call12) + */ +#define MAKE_RA_FOR_CALL(ra, ws) (ra) + +/* Convert return address to a valid pc + * Note: We assume that the stack pointer is in the same 1GB range as the ra + */ +#define MAKE_PC_FROM_RA(ra, sp) (ra) + +#else +#error Unsupported Xtensa ABI +#endif + /* Spill slot location for the register reg in the spill area under the stack * pointer sp. reg must be in the range [0..4).
*/ diff --git a/arch/xtensa/include/asm/sections.h b/arch/xtensa/include/asm/sections.h new file mode 100644 index 000000000000..a8c42d08e281 --- /dev/null +++ b/arch/xtensa/include/asm/sections.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _XTENSA_SECTIONS_H +#define _XTENSA_SECTIONS_H + +#include + +#ifdef CONFIG_VECTORS_ADDR +extern char _WindowVectors_text_start[]; +extern char _WindowVectors_text_end[]; +extern char _DebugInterruptVector_text_start[]; +extern char _DebugInterruptVector_text_end[]; +extern char _KernelExceptionVector_text_start[]; +extern char _KernelExceptionVector_text_end[]; +extern char _UserExceptionVector_text_start[]; +extern char _UserExceptionVector_text_end[]; +extern char _DoubleExceptionVector_text_start[]; +extern char _DoubleExceptionVector_text_end[]; +extern char _exception_text_start[]; +extern char _exception_text_end[]; +extern char _Level2InterruptVector_text_start[]; +extern char _Level2InterruptVector_text_end[]; +extern char _Level3InterruptVector_text_start[]; +extern char _Level3InterruptVector_text_end[]; +extern char _Level4InterruptVector_text_start[]; +extern char _Level4InterruptVector_text_end[]; +extern char _Level5InterruptVector_text_start[]; +extern char _Level5InterruptVector_text_end[]; +extern char _Level6InterruptVector_text_start[]; +extern char _Level6InterruptVector_text_end[]; +#endif +#ifdef CONFIG_SMP +extern char _SecondaryResetVector_text_start[]; +extern char _SecondaryResetVector_text_end[]; +#endif +#ifdef CONFIG_XIP_KERNEL +extern char _xip_start[]; +extern char _xip_end[]; +#endif + +#endif diff --git a/arch/xtensa/include/asm/traps.h b/arch/xtensa/include/asm/traps.h index f720a57d0a5b..6fa47cd8e02d 100644 --- a/arch/xtensa/include/asm/traps.h +++ b/arch/xtensa/include/asm/traps.h @@ -56,6 +56,7 @@ void secondary_trap_init(void); static inline void spill_registers(void) { +#if defined(__XTENSA_WINDOWED_ABI__) #if XCHAL_NUM_AREGS > 16 __asm__ __volatile__ ( " call8 1f\n" @@ -96,6 +97,7 @@ static inline void spill_registers(void) " mov a12, a12\n" : : : "memory"); #endif +#endif } struct debug_table { diff --git a/arch/xtensa/kernel/align.S b/arch/xtensa/kernel/align.S index 9301452e521e..d062c732ef18 100644 --- a/arch/xtensa/kernel/align.S +++ b/arch/xtensa/kernel/align.S @@ -58,7 +58,9 @@ * BE shift left / mask 0 0 X X */ +#if XCHAL_HAVE_WINDOWED #define UNALIGNED_USER_EXCEPTION +#endif #if XCHAL_HAVE_BE diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 647b162f959b..99ab3c1a3387 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -158,6 +158,7 @@ _user_exception: /* Rotate ws so that the current windowbase is at bit0. */ /* Assume ws = xxwww1yyyy. Rotate ws right, so that a2 = yyyyxxwww1 */ +#if defined(USER_SUPPORT_WINDOWED) rsr a2, windowbase rsr a3, windowstart ssr a2 @@ -167,24 +168,33 @@ _user_exception: src a2, a3, a2 srli a2, a2, 32-WSBITS s32i a2, a1, PT_WMASK # needed for restoring registers +#else + movi a2, 0 + movi a3, 1 + s32i a2, a1, PT_WINDOWBASE + s32i a3, a1, PT_WINDOWSTART + s32i a3, a1, PT_WMASK +#endif /* Save only live registers. 
*/ - _bbsi.l a2, 1, 1f +UABI_W _bbsi.l a2, 1, 1f s32i a4, a1, PT_AREG4 s32i a5, a1, PT_AREG5 s32i a6, a1, PT_AREG6 s32i a7, a1, PT_AREG7 - _bbsi.l a2, 2, 1f +UABI_W _bbsi.l a2, 2, 1f s32i a8, a1, PT_AREG8 s32i a9, a1, PT_AREG9 s32i a10, a1, PT_AREG10 s32i a11, a1, PT_AREG11 - _bbsi.l a2, 3, 1f +UABI_W _bbsi.l a2, 3, 1f s32i a12, a1, PT_AREG12 s32i a13, a1, PT_AREG13 s32i a14, a1, PT_AREG14 s32i a15, a1, PT_AREG15 + +#if defined(USER_SUPPORT_WINDOWED) _bnei a2, 1, 1f # only one valid frame? /* Only one valid frame, skip saving regs. */ @@ -239,7 +249,7 @@ _user_exception: rsync /* We are back to the original stack pointer (a1) */ - +#endif 2: /* Now, jump to the common exception handler. */ j common_exception @@ -295,6 +305,7 @@ _kernel_exception: s32i a3, a1, PT_SAR s32i a2, a1, PT_ICOUNTLEVEL +#if defined(__XTENSA_WINDOWED_ABI__) /* Rotate ws so that the current windowbase is at bit0. */ /* Assume ws = xxwww1yyyy. Rotate ws right, so that a2 = yyyyxxwww1 */ @@ -305,27 +316,28 @@ _kernel_exception: src a2, a3, a2 srli a2, a2, 32-WSBITS s32i a2, a1, PT_WMASK # needed for kernel_exception_exit +#endif /* Save only the live window-frame */ - _bbsi.l a2, 1, 1f +KABI_W _bbsi.l a2, 1, 1f s32i a4, a1, PT_AREG4 s32i a5, a1, PT_AREG5 s32i a6, a1, PT_AREG6 s32i a7, a1, PT_AREG7 - _bbsi.l a2, 2, 1f +KABI_W _bbsi.l a2, 2, 1f s32i a8, a1, PT_AREG8 s32i a9, a1, PT_AREG9 s32i a10, a1, PT_AREG10 s32i a11, a1, PT_AREG11 - _bbsi.l a2, 3, 1f +KABI_W _bbsi.l a2, 3, 1f s32i a12, a1, PT_AREG12 s32i a13, a1, PT_AREG13 s32i a14, a1, PT_AREG14 s32i a15, a1, PT_AREG15 +#ifdef __XTENSA_WINDOWED_ABI__ _bnei a2, 1, 1f - /* Copy spill slots of a0 and a1 to imitate movsp * in order to keep exception stack continuous */ @@ -333,6 +345,7 @@ _kernel_exception: l32i a0, a1, PT_SIZE + 4 s32e a3, a1, -16 s32e a0, a1, -12 +#endif 1: l32i a0, a1, PT_AREG0 # restore saved a0 wsr a0, depc @@ -419,16 +432,16 @@ common_exception: movi a3, LOCKLEVEL .Lexception: - movi a0, PS_WOE_MASK - or a3, a3, a0 +KABI_W movi a0, PS_WOE_MASK +KABI_W or a3, a3, a0 #else addi a2, a2, -EXCCAUSE_LEVEL1_INTERRUPT movi a0, LOCKLEVEL extui a3, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH # a3 = PS.INTLEVEL moveqz a3, a0, a2 # a3 = LOCKLEVEL iff interrupt - movi a2, PS_WOE_MASK - or a3, a3, a2 +KABI_W movi a2, PS_WOE_MASK +KABI_W or a3, a3, a2 rsr a2, exccause #endif @@ -461,14 +474,14 @@ common_exception: */ rsr a4, excsave1 - mov a6, a1 # pass stack frame - mov a7, a2 # pass EXCCAUSE addx4 a4, a2, a4 l32i a4, a4, EXC_TABLE_DEFAULT # load handler + mov abi_arg1, a2 # pass EXCCAUSE + mov abi_arg0, a1 # pass stack frame /* Call the second-level handler */ - callx4 a4 + abi_callx a4 /* Jump here for exception exit */ .global common_exception_return @@ -482,15 +495,15 @@ common_exception_return: 1: irq_save a2, a3 #ifdef CONFIG_TRACE_IRQFLAGS - call4 trace_hardirqs_off + abi_call trace_hardirqs_off #endif /* Jump if we are returning from kernel exceptions. 
*/ - l32i a3, a1, PT_PS + l32i abi_saved1, a1, PT_PS GET_THREAD_INFO(a2, a1) l32i a4, a2, TI_FLAGS - _bbci.l a3, PS_UM_BIT, 6f + _bbci.l abi_saved1, PS_UM_BIT, 6f /* Specific to a user exception exit: * We need to check some flags for signal handling and rescheduling, @@ -509,20 +522,20 @@ common_exception_return: /* Call do_signal() */ #ifdef CONFIG_TRACE_IRQFLAGS - call4 trace_hardirqs_on + abi_call trace_hardirqs_on #endif rsil a2, 0 - mov a6, a1 - call4 do_notify_resume # int do_notify_resume(struct pt_regs*) + mov abi_arg0, a1 + abi_call do_notify_resume # int do_notify_resume(struct pt_regs*) j 1b 3: /* Reschedule */ #ifdef CONFIG_TRACE_IRQFLAGS - call4 trace_hardirqs_on + abi_call trace_hardirqs_on #endif rsil a2, 0 - call4 schedule # void schedule (void) + abi_call schedule # void schedule (void) j 1b #ifdef CONFIG_PREEMPTION @@ -533,33 +546,33 @@ common_exception_return: l32i a4, a2, TI_PRE_COUNT bnez a4, 4f - call4 preempt_schedule_irq + abi_call preempt_schedule_irq j 4f #endif #if XTENSA_FAKE_NMI .LNMIexit: - l32i a3, a1, PT_PS - _bbci.l a3, PS_UM_BIT, 4f + l32i abi_saved1, a1, PT_PS + _bbci.l abi_saved1, PS_UM_BIT, 4f #endif 5: #ifdef CONFIG_HAVE_HW_BREAKPOINT _bbci.l a4, TIF_DB_DISABLED, 7f - call4 restore_dbreak + abi_call restore_dbreak 7: #endif #ifdef CONFIG_DEBUG_TLB_SANITY l32i a4, a1, PT_DEPC bgeui a4, VALID_DOUBLE_EXCEPTION_ADDRESS, 4f - call4 check_tlb_sanity + abi_call check_tlb_sanity #endif 6: 4: #ifdef CONFIG_TRACE_IRQFLAGS - extui a4, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH + extui a4, abi_saved1, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH bgei a4, LOCKLEVEL, 1f - call4 trace_hardirqs_on + abi_call trace_hardirqs_on 1: #endif /* Restore optional registers. */ @@ -572,14 +585,15 @@ common_exception_return: l32i a2, a1, PT_SCOMPARE1 wsr a2, scompare1 #endif - wsr a3, ps /* disable interrupts */ + wsr abi_saved1, ps /* disable interrupts */ - _bbci.l a3, PS_UM_BIT, kernel_exception_exit + _bbci.l abi_saved1, PS_UM_BIT, kernel_exception_exit user_exception_exit: /* Restore the state of the task and return from the exception. */ +#if defined(USER_SUPPORT_WINDOWED) /* Switch to the user thread WINDOWBASE. Save SP temporarily in DEPC */ l32i a2, a1, PT_WINDOWBASE @@ -634,8 +648,10 @@ user_exception_exit: * frame where we had loaded a2), or at least the lower 4 bits * (if we have restored WSBITS-1 frames). */ - 2: +#else + movi a2, 1 +#endif #if XCHAL_HAVE_THREADPTR l32i a3, a1, PT_THREADPTR wur a3, threadptr @@ -650,6 +666,7 @@ user_exception_exit: kernel_exception_exit: +#if defined(__XTENSA_WINDOWED_ABI__) /* Check if we have to do a movsp. 
* * We only have to do a movsp if the previous window-frame has @@ -702,6 +719,9 @@ kernel_exception_exit: * * Note: We expect a2 to hold PT_WMASK */ +#else + movi a2, 1 +#endif common_exception_exit: @@ -920,14 +940,16 @@ unrecoverable_text: ENTRY(unrecoverable_exception) +#if XCHAL_HAVE_WINDOWED movi a0, 1 movi a1, 0 wsr a0, windowstart wsr a1, windowbase rsync +#endif - movi a1, PS_WOE_MASK | LOCKLEVEL + movi a1, KERNEL_PS_WOE_MASK | LOCKLEVEL wsr a1, ps rsync @@ -935,8 +957,8 @@ ENTRY(unrecoverable_exception) movi a0, 0 addi a1, a1, PT_REGS_OFFSET - movi a6, unrecoverable_text - call4 panic + movi abi_arg0, unrecoverable_text + abi_call panic 1: j 1b @@ -947,6 +969,7 @@ ENDPROC(unrecoverable_exception) __XTENSA_HANDLER .literal_position +#ifdef SUPPORT_WINDOWED /* * Fast-handler for alloca exceptions * @@ -1010,6 +1033,7 @@ ENTRY(fast_alloca) 8: j _WindowUnderflow8 4: j _WindowUnderflow4 ENDPROC(fast_alloca) +#endif #ifdef CONFIG_USER_ABI_CALL0_PROBE /* @@ -1206,7 +1230,8 @@ ENDPROC(fast_syscall_xtensa) * Note: We assume the stack pointer is EXC_TABLE_KSTK in the fixup handler. */ -#ifdef CONFIG_FAST_SYSCALL_SPILL_REGISTERS +#if defined(CONFIG_FAST_SYSCALL_SPILL_REGISTERS) && \ + defined(USER_SUPPORT_WINDOWED) ENTRY(fast_syscall_spill_registers) @@ -1403,12 +1428,12 @@ ENTRY(fast_syscall_spill_registers) rsr a3, excsave1 l32i a1, a3, EXC_TABLE_KSTK - movi a4, PS_WOE_MASK | LOCKLEVEL + movi a4, KERNEL_PS_WOE_MASK | LOCKLEVEL wsr a4, ps rsync - movi a6, SIGSEGV - call4 do_exit + movi abi_arg0, SIGSEGV + abi_call do_exit /* shouldn't return, so panic */ @@ -1887,57 +1912,77 @@ ENDPROC(fast_store_prohibited) ENTRY(system_call) +#if defined(__XTENSA_WINDOWED_ABI__) abi_entry_default +#elif defined(__XTENSA_CALL0_ABI__) + abi_entry(12) + + s32i a0, sp, 0 + s32i abi_saved0, sp, 4 + s32i abi_saved1, sp, 8 + mov abi_saved0, a2 +#else +#error Unsupported Xtensa ABI +#endif /* regs->syscall = regs->areg[2] */ - l32i a7, a2, PT_AREG2 - s32i a7, a2, PT_SYSCALL + l32i a7, abi_saved0, PT_AREG2 + s32i a7, abi_saved0, PT_SYSCALL GET_THREAD_INFO(a4, a1) - l32i a3, a4, TI_FLAGS + l32i abi_saved1, a4, TI_FLAGS movi a4, _TIF_WORK_MASK - and a3, a3, a4 - beqz a3, 1f + and abi_saved1, abi_saved1, a4 + beqz abi_saved1, 1f - mov a6, a2 - call4 do_syscall_trace_enter - beqz a6, .Lsyscall_exit - l32i a7, a2, PT_SYSCALL + mov abi_arg0, abi_saved0 + abi_call do_syscall_trace_enter + beqz abi_rv, .Lsyscall_exit + l32i a7, abi_saved0, PT_SYSCALL 1: /* syscall = sys_call_table[syscall_nr] */ movi a4, sys_call_table movi a5, __NR_syscalls - movi a6, -ENOSYS + movi abi_rv, -ENOSYS bgeu a7, a5, 1f addx4 a4, a7, a4 - l32i a4, a4, 0 + l32i abi_tmp0, a4, 0 /* Load args: arg0 - arg5 are passed via regs. 
*/ - l32i a6, a2, PT_AREG6 - l32i a7, a2, PT_AREG3 - l32i a8, a2, PT_AREG4 - l32i a9, a2, PT_AREG5 - l32i a10, a2, PT_AREG8 - l32i a11, a2, PT_AREG9 + l32i abi_arg0, abi_saved0, PT_AREG6 + l32i abi_arg1, abi_saved0, PT_AREG3 + l32i abi_arg2, abi_saved0, PT_AREG4 + l32i abi_arg3, abi_saved0, PT_AREG5 + l32i abi_arg4, abi_saved0, PT_AREG8 + l32i abi_arg5, abi_saved0, PT_AREG9 - callx4 a4 + abi_callx abi_tmp0 1: /* regs->areg[2] = return_value */ - s32i a6, a2, PT_AREG2 - bnez a3, 1f + s32i abi_rv, abi_saved0, PT_AREG2 + bnez abi_saved1, 1f .Lsyscall_exit: +#if defined(__XTENSA_WINDOWED_ABI__) abi_ret_default +#elif defined(__XTENSA_CALL0_ABI__) + l32i a0, sp, 0 + l32i abi_saved0, sp, 4 + l32i abi_saved1, sp, 8 + abi_ret(12) +#else +#error Unsupported Xtensa ABI +#endif 1: - mov a6, a2 - call4 do_syscall_trace_leave - abi_ret_default + mov abi_arg0, abi_saved0 + abi_call do_syscall_trace_leave + j .Lsyscall_exit ENDPROC(system_call) @@ -1988,8 +2033,18 @@ ENDPROC(system_call) ENTRY(_switch_to) +#if defined(__XTENSA_WINDOWED_ABI__) abi_entry(XTENSA_SPILL_STACK_RESERVE) +#elif defined(__XTENSA_CALL0_ABI__) + abi_entry(16) + s32i a12, sp, 0 + s32i a13, sp, 4 + s32i a14, sp, 8 + s32i a15, sp, 12 +#else +#error Unsupported Xtensa ABI +#endif mov a11, a3 # and 'next' (a3) l32i a4, a2, TASK_THREAD_INFO @@ -2033,7 +2088,9 @@ ENTRY(_switch_to) /* Flush register file. */ +#if defined(__XTENSA_WINDOWED_ABI__) spill_registers_kernel +#endif /* Set kernel stack (and leave critical section) * Note: It's safe to set it here. The stack will not be overwritten @@ -2055,34 +2112,43 @@ ENTRY(_switch_to) wsr a14, ps rsync +#if defined(__XTENSA_WINDOWED_ABI__) abi_ret(XTENSA_SPILL_STACK_RESERVE) +#elif defined(__XTENSA_CALL0_ABI__) + l32i a12, sp, 0 + l32i a13, sp, 4 + l32i a14, sp, 8 + l32i a15, sp, 12 + abi_ret(16) +#else +#error Unsupported Xtensa ABI +#endif ENDPROC(_switch_to) ENTRY(ret_from_fork) /* void schedule_tail (struct task_struct *prev) - * Note: prev is still in a6 (return value from fake call4 frame) + * Note: prev is still in abi_arg0 (return value from fake call frame) */ - call4 schedule_tail + abi_call schedule_tail - mov a6, a1 - call4 do_syscall_trace_leave - - j common_exception_return + mov abi_arg0, a1 + abi_call do_syscall_trace_leave + j common_exception_return ENDPROC(ret_from_fork) /* * Kernel thread creation helper - * On entry, set up by copy_thread: a2 = thread_fn, a3 = thread_fn arg - * left from _switch_to: a6 = prev + * On entry, set up by copy_thread: abi_saved0 = thread_fn, + * abi_saved1 = thread_fn arg. Left from _switch_to: abi_arg0 = prev */ ENTRY(ret_from_kernel_thread) - call4 schedule_tail - mov a6, a3 - callx4 a2 - j common_exception_return + abi_call schedule_tail + mov abi_arg0, abi_saved1 + abi_callx abi_saved0 + j common_exception_return ENDPROC(ret_from_kernel_thread) diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index b9b81e76beea..8484294bc623 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -15,6 +15,7 @@ * Kevin Chea */ +#include #include #include #include @@ -66,11 +67,13 @@ _SetupOCD: * xt-gdb to single step via DEBUG exceptions received directly * by ocd. */ +#if XCHAL_HAVE_WINDOWED movi a1, 1 movi a0, 0 wsr a1, windowstart wsr a0, windowbase rsync +#endif movi a1, LOCKLEVEL wsr a1, ps @@ -193,9 +196,10 @@ ENTRY(_startup) movi a1, start_info l32i a1, a1, 0 - movi a2, PS_WOE_MASK | LOCKLEVEL - # WOE=1, INTLEVEL=LOCKLEVEL, UM=0 - wsr a2, ps # (enable reg-windows; progmode stack) + /* Disable interrupts.
*/ + /* Enable window exceptions if kernel is built with windowed ABI. */ + movi a2, KERNEL_PS_WOE_MASK | LOCKLEVEL + wsr a2, ps rsync #ifdef CONFIG_SMP @@ -267,13 +271,13 @@ ENTRY(_startup) l32i a1, a1, 0 #endif - movi a6, 0 - xsr a6, excsave1 + movi abi_arg0, 0 + xsr abi_arg0, excsave1 /* init_arch kick-starts the linux kernel */ - call4 init_arch - call4 start_kernel + abi_call init_arch + abi_call start_kernel should_never_return: j should_never_return @@ -297,10 +301,10 @@ should_never_return: s32i a3, a2, 0 memw - movi a6, 0 - wsr a6, excsave1 + movi abi_arg0, 0 + wsr abi_arg0, excsave1 - call4 secondary_start_kernel + abi_call secondary_start_kernel j should_never_return #endif /* CONFIG_SMP */ diff --git a/arch/xtensa/kernel/mcount.S b/arch/xtensa/kernel/mcount.S index 5e4619f52858..51daaf4e0b82 100644 --- a/arch/xtensa/kernel/mcount.S +++ b/arch/xtensa/kernel/mcount.S @@ -17,11 +17,16 @@ /* * Entry condition: * - * a2: a0 of the caller + * a2: a0 of the caller in windowed ABI + * a10: a0 of the caller in call0 ABI + * + * In call0 ABI the function _mcount is called with the special ABI: + * its argument is in a10 and all the usual argument registers (a2 - a7) + * must be preserved in addition to callee-saved a12 - a15. */ ENTRY(_mcount) - +#if defined(__XTENSA_WINDOWED_ABI__) abi_entry_default movi a4, ftrace_trace_function @@ -42,7 +47,36 @@ ENTRY(_mcount) callx4 a4 abi_ret_default +#elif defined(__XTENSA_CALL0_ABI__) + abi_entry_default + movi a9, ftrace_trace_function + l32i a9, a9, 0 + movi a11, ftrace_stub + bne a9, a11, 1f + abi_ret_default + +1: abi_entry(28) + s32i a0, sp, 0 + s32i a2, sp, 4 + s32i a3, sp, 8 + s32i a4, sp, 12 + s32i a5, sp, 16 + s32i a6, sp, 20 + s32i a7, sp, 24 + addi a2, a10, -MCOUNT_INSN_SIZE + callx0 a9 + l32i a0, sp, 0 + l32i a2, sp, 4 + l32i a3, sp, 8 + l32i a4, sp, 12 + l32i a5, sp, 16 + l32i a6, sp, 20 + l32i a7, sp, 24 + abi_ret(28) +#else +#error Unsupported Xtensa ABI +#endif ENDPROC(_mcount) ENTRY(ftrace_stub) diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 47f933fed870..bd80df890b1e 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -211,11 +211,18 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, struct thread_info *ti; #endif +#if defined(__XTENSA_WINDOWED_ABI__) /* Create a call4 dummy-frame: a0 = 0, a1 = childregs. */ SPILL_SLOT(childregs, 1) = (unsigned long)childregs; SPILL_SLOT(childregs, 0) = 0; p->thread.sp = (unsigned long)childregs; +#elif defined(__XTENSA_CALL0_ABI__) + /* Reserve 16 bytes for the _switch_to stack frame. */ + p->thread.sp = (unsigned long)childregs - 16; +#else +#error Unsupported Xtensa ABI +#endif if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { struct pt_regs *regs = current_pt_regs(); @@ -272,11 +279,25 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, p->thread.ra = MAKE_RA_FOR_CALL( (unsigned long)ret_from_kernel_thread, 1); - /* pass parameters to ret_from_kernel_thread: - * a2 = thread_fn, a3 = thread_fn arg + /* pass parameters to ret_from_kernel_thread: */ +#if defined(__XTENSA_WINDOWED_ABI__) + /* + * a2 = thread_fn, a3 = thread_fn arg. + * Window underflow will load registers from the + * spill slots on the stack on return from _switch_to. */ - SPILL_SLOT(childregs, 3) = thread_fn_arg; SPILL_SLOT(childregs, 2) = usp_thread_fn; + SPILL_SLOT(childregs, 3) = thread_fn_arg; +#elif defined(__XTENSA_CALL0_ABI__) + /* + * a12 = thread_fn, a13 = thread_fn arg. 
+ * _switch_to epilogue will load registers from the stack. + */ + ((unsigned long *)p->thread.sp)[0] = usp_thread_fn; + ((unsigned long *)p->thread.sp)[1] = thread_fn_arg; +#else +#error Unsupported Xtensa ABI +#endif /* Childregs are only used when we're going to userspace * in which case start_thread will set them up. diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index ee9082a142fe..8db20cfb44ab 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -37,14 +37,15 @@ #include #include #include -#include -#include -#include #include -#include #include +#include +#include +#include +#include #include #include +#include #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) struct screen_info screen_info = { @@ -271,49 +272,6 @@ void __init init_arch(bp_tag_t *bp_start) * Initialize system. Setup memory and reserve regions. */ -extern char _end[]; -extern char _stext[]; -extern char _WindowVectors_text_start; -extern char _WindowVectors_text_end; -extern char _DebugInterruptVector_text_start; -extern char _DebugInterruptVector_text_end; -extern char _KernelExceptionVector_text_start; -extern char _KernelExceptionVector_text_end; -extern char _UserExceptionVector_text_start; -extern char _UserExceptionVector_text_end; -extern char _DoubleExceptionVector_text_start; -extern char _DoubleExceptionVector_text_end; -extern char _exception_text_start; -extern char _exception_text_end; -#if XCHAL_EXCM_LEVEL >= 2 -extern char _Level2InterruptVector_text_start; -extern char _Level2InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 3 -extern char _Level3InterruptVector_text_start; -extern char _Level3InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 4 -extern char _Level4InterruptVector_text_start; -extern char _Level4InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 5 -extern char _Level5InterruptVector_text_start; -extern char _Level5InterruptVector_text_end; -#endif -#if XCHAL_EXCM_LEVEL >= 6 -extern char _Level6InterruptVector_text_start; -extern char _Level6InterruptVector_text_end; -#endif -#ifdef CONFIG_SMP -extern char _SecondaryResetVector_text_start; -extern char _SecondaryResetVector_text_end; -#endif -#ifdef CONFIG_XIP_KERNEL -extern char _xip_start[]; -extern char _xip_end[]; -#endif - static inline int __init_memblock mem_reserve(unsigned long start, unsigned long end) { @@ -349,49 +307,51 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_VECTORS_ADDR - mem_reserve(__pa(&_WindowVectors_text_start), - __pa(&_WindowVectors_text_end)); +#ifdef SUPPORT_WINDOWED + mem_reserve(__pa(_WindowVectors_text_start), + __pa(_WindowVectors_text_end)); +#endif - mem_reserve(__pa(&_DebugInterruptVector_text_start), - __pa(&_DebugInterruptVector_text_end)); + mem_reserve(__pa(_DebugInterruptVector_text_start), + __pa(_DebugInterruptVector_text_end)); - mem_reserve(__pa(&_KernelExceptionVector_text_start), - __pa(&_KernelExceptionVector_text_end)); + mem_reserve(__pa(_KernelExceptionVector_text_start), + __pa(_KernelExceptionVector_text_end)); - mem_reserve(__pa(&_UserExceptionVector_text_start), - __pa(&_UserExceptionVector_text_end)); + mem_reserve(__pa(_UserExceptionVector_text_start), + __pa(_UserExceptionVector_text_end)); - mem_reserve(__pa(&_DoubleExceptionVector_text_start), - __pa(&_DoubleExceptionVector_text_end)); + mem_reserve(__pa(_DoubleExceptionVector_text_start), + __pa(_DoubleExceptionVector_text_end)); - mem_reserve(__pa(&_exception_text_start), - __pa(&_exception_text_end)); + 
mem_reserve(__pa(_exception_text_start), + __pa(_exception_text_end)); #if XCHAL_EXCM_LEVEL >= 2 - mem_reserve(__pa(&_Level2InterruptVector_text_start), - __pa(&_Level2InterruptVector_text_end)); + mem_reserve(__pa(_Level2InterruptVector_text_start), + __pa(_Level2InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 3 - mem_reserve(__pa(&_Level3InterruptVector_text_start), - __pa(&_Level3InterruptVector_text_end)); + mem_reserve(__pa(_Level3InterruptVector_text_start), + __pa(_Level3InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 4 - mem_reserve(__pa(&_Level4InterruptVector_text_start), - __pa(&_Level4InterruptVector_text_end)); + mem_reserve(__pa(_Level4InterruptVector_text_start), + __pa(_Level4InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 5 - mem_reserve(__pa(&_Level5InterruptVector_text_start), - __pa(&_Level5InterruptVector_text_end)); + mem_reserve(__pa(_Level5InterruptVector_text_start), + __pa(_Level5InterruptVector_text_end)); #endif #if XCHAL_EXCM_LEVEL >= 6 - mem_reserve(__pa(&_Level6InterruptVector_text_start), - __pa(&_Level6InterruptVector_text_end)); + mem_reserve(__pa(_Level6InterruptVector_text_start), + __pa(_Level6InterruptVector_text_end)); #endif #endif /* CONFIG_VECTORS_ADDR */ #ifdef CONFIG_SMP - mem_reserve(__pa(&_SecondaryResetVector_text_start), - __pa(&_SecondaryResetVector_text_end)); + mem_reserve(__pa(_SecondaryResetVector_text_start), + __pa(_SecondaryResetVector_text_end)); #endif parse_early_param(); bootmem_init(); diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index c4d77dbfb61a..f6c949895b3e 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -45,12 +45,13 @@ struct rt_sigframe unsigned int window[4]; }; -/* +#if defined(USER_SUPPORT_WINDOWED) +/* * Flush register windows stored in pt_regs to stack. * Returns 1 for errors. */ -int +static int flush_window_regs_user(struct pt_regs *regs) { const unsigned long ws = regs->windowstart; @@ -121,6 +122,13 @@ flush_window_regs_user(struct pt_regs *regs) errout: return err; } +#else +static int +flush_window_regs_user(struct pt_regs *regs) +{ + return 0; +} +#endif /* * Note: We don't copy double exception 'regs', we have to finish double exc. 
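The xtensa changes above all follow one compile-time dispatch idiom: build the windowed-ABI implementation under __XTENSA_WINDOWED_ABI__, build the call0 one under __XTENSA_CALL0_ABI__, and fail the build for anything else. The sketch below is illustrative only and not part of the patch; it compiles only with an xtensa toolchain, since both ABI macros are predefined by the compiler.

/*
 * Illustrative only: the #if/#elif/#error ABI dispatch used in the
 * copy_thread(), _mcount and signal hunks above. The xtensa compiler
 * predefines exactly one of the two ABI macros, so any other
 * configuration stops the build instead of silently miscompiling.
 */
#include <stdio.h>

#if defined(__XTENSA_WINDOWED_ABI__)
static const char *xtensa_abi(void)
{
	/* Windowed: call4/call8/call12 rotate the register window and
	 * spilled registers live in slots below the stack pointer. */
	return "windowed";
}
#elif defined(__XTENSA_CALL0_ABI__)
static const char *xtensa_abi(void)
{
	/* Call0: no register windows; a0 holds the return address and
	 * a12-a15 are callee-saved and preserved explicitly, which is
	 * why _mcount above spills a0/a2-a7 by hand. */
	return "call0";
}
#else
#error Unsupported Xtensa ABI
#endif

int main(void)
{
	printf("built for the %s ABI\n", xtensa_abi());
	return 0;
}

Refusing to pick a default is the same design choice the series makes in copy_thread() and _mcount: an unknown ABI is a build error, never a guess.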
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 874b6efc6fb3..35a7d47f28cf 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -97,7 +97,9 @@ static dispatch_init_table_t __initdata dispatch_init_table[] = { /* EXCCAUSE_INSTRUCTION_FETCH unhandled */ /* EXCCAUSE_LOAD_STORE_ERROR unhandled*/ { EXCCAUSE_LEVEL1_INTERRUPT, 0, do_interrupt }, +#ifdef SUPPORT_WINDOWED { EXCCAUSE_ALLOCA, USER|KRNL, fast_alloca }, +#endif /* EXCCAUSE_INTEGER_DIVIDE_BY_ZERO unhandled */ /* EXCCAUSE_PRIVILEGED unhandled */ #if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION @@ -462,12 +464,10 @@ void secondary_trap_init(void) void show_regs(struct pt_regs * regs) { - int i, wmask; + int i; show_regs_print_info(KERN_DEFAULT); - wmask = regs->wmask & ~1; - for (i = 0; i < 16; i++) { if ((i % 8) == 0) pr_info("a%02d:", i); diff --git a/arch/xtensa/kernel/vectors.S b/arch/xtensa/kernel/vectors.S index 1a7538ccfc5a..407ece204e7c 100644 --- a/arch/xtensa/kernel/vectors.S +++ b/arch/xtensa/kernel/vectors.S @@ -226,6 +226,7 @@ ENTRY(_DoubleExceptionVector) xsr a0, depc # get DEPC, save a0 +#ifdef SUPPORT_WINDOWED movi a2, WINDOW_VECTORS_VADDR _bltu a0, a2, .Lfixup addi a2, a2, WINDOW_VECTORS_SIZE @@ -275,6 +276,10 @@ _DoubleExceptionVector_WindowUnderflow: l32i a0, a0, EXC_TABLE_FAST_USER jx a0 +#else + j .Lfixup +#endif + /* * We only allow the ITLB miss exception if we are in kernel space. * All other exceptions are unexpected and thus unrecoverable! @@ -343,6 +348,7 @@ _DoubleExceptionVector_WindowUnderflow: l32i a0, a0, EXC_TABLE_FAST_USER jx a0 +#ifdef SUPPORT_WINDOWED /* * Restart window OVERFLOW exception. * Currently: @@ -475,9 +481,12 @@ _DoubleExceptionVector_handle_exception: rsr a0, depc rotw -3 j 1b +#endif ENDPROC(_DoubleExceptionVector) +#ifdef SUPPORT_WINDOWED + /* * Fixup handler for TLB miss in double exception handler for window owerflow. * We get here with windowbase set to the window that was being spilled and @@ -590,6 +599,8 @@ ENTRY(window_overflow_restore_a0_fixup) ENDPROC(window_overflow_restore_a0_fixup) +#endif + /* * Debug interrupt vector * @@ -650,6 +661,25 @@ ENTRY(_Level\level\()InterruptVector) irq_entry_level 5 irq_entry_level 6 +#if XCHAL_EXCM_LEVEL >= 2 + /* + * Continuation of medium priority interrupt dispatch code. + * On entry here, a0 contains PS, and EPC2 contains saved a0: + */ + __XTENSA_HANDLER + .align 4 +_SimulateUserKernelVectorException: + addi a0, a0, (1 << PS_EXCM_BIT) +#if !XTENSA_FAKE_NMI + wsr a0, ps +#endif + bbsi.l a0, PS_UM_BIT, 1f # branch if user mode + xsr a0, excsave2 # restore a0 + j _KernelExceptionVector # simulate kernel vector exception +1: xsr a0, excsave2 # restore a0 + j _UserExceptionVector # simulate user vector exception +#endif + /* Window overflow and underflow handlers. * The handlers must be 64 bytes apart, first starting with the underflow @@ -668,6 +698,8 @@ ENTRY(_Level\level\()InterruptVector) .section .WindowVectors.text, "ax" +#ifdef SUPPORT_WINDOWED + /* 4-Register Window Overflow Vector (Handler) */ ENTRY_ALIGN64(_WindowOverflow4) @@ -680,27 +712,6 @@ ENTRY_ALIGN64(_WindowOverflow4) ENDPROC(_WindowOverflow4) - -#if XCHAL_EXCM_LEVEL >= 2 - /* Not a window vector - but a convenient location - * (where we know there's space) for continuation of - * medium priority interrupt dispatch code. 
- * On entry here, a0 contains PS, and EPC2 contains saved a0: - */ - .align 4 -_SimulateUserKernelVectorException: - addi a0, a0, (1 << PS_EXCM_BIT) -#if !XTENSA_FAKE_NMI - wsr a0, ps -#endif - bbsi.l a0, PS_UM_BIT, 1f # branch if user mode - xsr a0, excsave2 # restore a0 - j _KernelExceptionVector # simulate kernel vector exception -1: xsr a0, excsave2 # restore a0 - j _UserExceptionVector # simulate user vector exception -#endif - - /* 4-Register Window Underflow Vector (Handler) */ ENTRY_ALIGN64(_WindowUnderflow4) @@ -789,4 +800,6 @@ ENTRY_ALIGN64(_WindowUnderflow12) ENDPROC(_WindowUnderflow12) +#endif + .text diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index d23a6e38f062..eee270a039a4 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -94,7 +94,9 @@ SECTIONS . = ALIGN(PAGE_SIZE); _vecbase = .; +#ifdef SUPPORT_WINDOWED SECTION_VECTOR2 (.WindowVectors.text, WINDOW_VECTORS_VADDR) +#endif #if XCHAL_EXCM_LEVEL >= 2 SECTION_VECTOR2 (.Level2InterruptVector.text, INTLEVEL2_VECTOR_VADDR) #endif @@ -166,8 +168,10 @@ SECTIONS __boot_reloc_table_start = ABSOLUTE(.); #if !MERGED_VECTORS +#ifdef SUPPORT_WINDOWED RELOCATE_ENTRY(_WindowVectors_text, .WindowVectors.text); +#endif #if XCHAL_EXCM_LEVEL >= 2 RELOCATE_ENTRY(_Level2InterruptVector_text, .Level2InterruptVector.text); @@ -229,14 +233,18 @@ SECTIONS #if !MERGED_VECTORS /* The vectors are relocated to the real position at startup time */ +#ifdef SUPPORT_WINDOWED SECTION_VECTOR4 (_WindowVectors_text, .WindowVectors.text, WINDOW_VECTORS_VADDR, - .dummy) + LAST) +#undef LAST +#define LAST .WindowVectors.text +#endif SECTION_VECTOR4 (_DebugInterruptVector_text, .DebugInterruptVector.text, DEBUG_VECTOR_VADDR, - .WindowVectors.text) + LAST) #undef LAST #define LAST .DebugInterruptVector.text #if XCHAL_EXCM_LEVEL >= 2 diff --git a/arch/xtensa/lib/strncpy_user.S b/arch/xtensa/lib/strncpy_user.S index 4faf46fe3f38..0731912227d3 100644 --- a/arch/xtensa/lib/strncpy_user.S +++ b/arch/xtensa/lib/strncpy_user.S @@ -45,7 +45,6 @@ # a9/ tmp # a10/ tmp # a11/ dst -# a12/ tmp .text ENTRY(__strncpy_user) @@ -61,7 +60,7 @@ ENTRY(__strncpy_user) bbsi.l a3, 0, .Lsrc1mod2 # if only 8-bit aligned bbsi.l a3, 1, .Lsrc2mod4 # if only 16-bit aligned .Lsrcaligned: # return here when src is word-aligned - srli a12, a4, 2 # number of loop iterations with 4B per loop + srli a10, a4, 2 # number of loop iterations with 4B per loop movi a9, 3 bnone a11, a9, .Laligned j .Ldstunaligned @@ -102,11 +101,11 @@ EX(10f) s8i a9, a11, 0 # store byte 0 .byte 0 # (0 mod 4 alignment for LBEG) .Laligned: #if XCHAL_HAVE_LOOPS - loopnez a12, .Loop1done + loopnez a10, .Loop1done #else - beqz a12, .Loop1done - slli a12, a12, 2 - add a12, a12, a11 # a12 = end of last 4B chunck + beqz a10, .Loop1done + slli a10, a10, 2 + add a10, a10, a11 # a10 = end of last 4B chunck #endif .Loop1: EX(11f) l32i a9, a3, 0 # get word from src @@ -118,7 +117,7 @@ EX(10f) s32i a9, a11, 0 # store word to dst bnone a9, a8, .Lz3 # if byte 3 is zero addi a11, a11, 4 # advance dst pointer #if !XCHAL_HAVE_LOOPS - blt a11, a12, .Loop1 + blt a11, a10, .Loop1 #endif .Loop1done: @@ -185,7 +184,7 @@ EX(10f) s8i a9, a11, 2 loopnez a4, .Lunalignedend #else beqz a4, .Lunalignedend - add a12, a11, a4 # a12 = ending address + add a10, a11, a4 # a10 = ending address #endif /* XCHAL_HAVE_LOOPS */ .Lnextbyte: EX(11f) l8ui a9, a3, 0 @@ -194,7 +193,7 @@ EX(10f) s8i a9, a11, 0 beqz a9, .Lunalignedend addi a11, a11, 1 #if !XCHAL_HAVE_LOOPS - blt a11, a12, 
.Lnextbyte + blt a11, a10, .Lnextbyte #endif .Lunalignedend: diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S index a0aa4047f94a..16128c094c62 100644 --- a/arch/xtensa/lib/usercopy.S +++ b/arch/xtensa/lib/usercopy.S @@ -60,7 +60,12 @@ .text ENTRY(__xtensa_copy_user) - abi_entry_default +#if !XCHAL_HAVE_LOOPS && defined(__XTENSA_CALL0_ABI__) +#define STACK_SIZE 4 +#else +#define STACK_SIZE 0 +#endif + abi_entry(STACK_SIZE) # a2/ dst, a3/ src, a4/ len mov a5, a2 # copy dst so that a2 is return value mov a11, a4 # preserve original len for error case @@ -75,7 +80,7 @@ ENTRY(__xtensa_copy_user) __ssa8 a3 # set shift amount from byte offset bnez a4, .Lsrcunaligned movi a2, 0 # return success for len==0 - abi_ret_default + abi_ret(STACK_SIZE) /* * Destination is unaligned @@ -127,7 +132,7 @@ EX(10f) s8i a6, a5, 0 #endif /* !XCHAL_HAVE_LOOPS */ .Lbytecopydone: movi a2, 0 # return success for len bytes copied - abi_ret_default + abi_ret(STACK_SIZE) /* * Destination and source are word-aligned. @@ -187,7 +192,7 @@ EX(10f) l8ui a6, a3, 0 EX(10f) s8i a6, a5, 0 .L5: movi a2, 0 # return success for len bytes copied - abi_ret_default + abi_ret(STACK_SIZE) /* * Destination is aligned, Source is unaligned @@ -205,8 +210,14 @@ EX(10f) l32i a6, a3, 0 # load first word loopnez a7, .Loop2done #else /* !XCHAL_HAVE_LOOPS */ beqz a7, .Loop2done +#if defined(__XTENSA_CALL0_ABI__) + s32i a10, a1, 0 + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#else slli a12, a7, 4 add a12, a12, a3 # a12 = end of last 16B source chunk +#endif #endif /* !XCHAL_HAVE_LOOPS */ .Loop2: EX(10f) l32i a7, a3, 4 @@ -224,7 +235,12 @@ EX(10f) s32i a8, a5, 8 EX(10f) s32i a9, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS +#if defined(__XTENSA_CALL0_ABI__) + blt a3, a10, .Loop2 + l32i a10, a1, 0 +#else blt a3, a12, .Loop2 +#endif #endif /* !XCHAL_HAVE_LOOPS */ .Loop2done: bbci.l a4, 3, .L12 @@ -264,7 +280,7 @@ EX(10f) l8ui a6, a3, 0 EX(10f) s8i a6, a5, 0 .L15: movi a2, 0 # return success for len bytes copied - abi_ret_default + abi_ret(STACK_SIZE) ENDPROC(__xtensa_copy_user) @@ -281,4 +297,4 @@ ENDPROC(__xtensa_copy_user) 10: sub a2, a5, a2 /* a2 <-- bytes copied */ sub a2, a11, a2 /* a2 <-- bytes not copied */ - abi_ret_default + abi_ret(STACK_SIZE) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index d7deedf3548e..ab2f7dfb0c44 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -199,33 +199,20 @@ static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root, acpi_status status; u32 result, capbuf[3]; - support &= OSC_PCI_SUPPORT_MASKS; support |= root->osc_support_set; capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = support; - if (control) { - *control &= OSC_PCI_CONTROL_MASKS; - capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set; - } else { - /* Run _OSC query only with existing controls. 
*/ - capbuf[OSC_CONTROL_DWORD] = root->osc_control_set; - } + capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set; status = acpi_pci_run_osc(root->device->handle, capbuf, &result); if (ACPI_SUCCESS(status)) { root->osc_support_set = support; - if (control) - *control = result; + *control = result; } return status; } -static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags) -{ - return acpi_pci_query_osc(root, flags, NULL); -} - struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle) { struct acpi_pci_root *root; @@ -348,8 +335,9 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_dev); * _OSC bits the BIOS has granted control of, but its contents are meaningless * on failure. **/ -static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req) +static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 support) { + u32 req = OSC_PCI_EXPRESS_CAPABILITY_CONTROL; struct acpi_pci_root *root; acpi_status status; u32 ctrl, capbuf[3]; @@ -357,22 +345,16 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r if (!mask) return AE_BAD_PARAMETER; - ctrl = *mask & OSC_PCI_CONTROL_MASKS; - if ((ctrl & req) != req) - return AE_TYPE; - root = acpi_pci_find_root(handle); if (!root) return AE_NOT_EXIST; - *mask = ctrl | root->osc_control_set; - /* No need to evaluate _OSC if the control was already granted. */ - if ((root->osc_control_set & ctrl) == ctrl) - return AE_OK; + ctrl = *mask; + *mask |= root->osc_control_set; /* Need to check the available controls bits before requesting them. */ - while (*mask) { - status = acpi_pci_query_osc(root, root->osc_support_set, mask); + do { + status = acpi_pci_query_osc(root, support, mask); if (ACPI_FAILURE(status)) return status; if (ctrl == *mask) @@ -380,7 +362,11 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r decode_osc_control(root, "platform does not support", ctrl & ~(*mask)); ctrl = *mask; - } + } while (*mask); + + /* No need to request _OSC if the control was already granted. */ + if ((root->osc_control_set & ctrl) == ctrl) + return AE_OK; if ((ctrl & req) != req) { decode_osc_control(root, "not requesting control; platform does not support", @@ -399,25 +385,9 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r return AE_OK; } -static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, - bool is_pcie) +static u32 calculate_support(void) { - u32 support, control, requested; - acpi_status status; - struct acpi_device *device = root->device; - acpi_handle handle = device->handle; - - /* - * Apple always return failure on _OSC calls when _OSI("Darwin") has - * been called successfully. 
We know the feature set supported by the - * platform, so avoid calling _OSC at all - */ - if (x86_apple_machine) { - root->osc_control_set = ~OSC_PCI_EXPRESS_PME_CONTROL; - decode_osc_control(root, "OS assumes control of", - root->osc_control_set); - return; - } + u32 support; /* * All supported architectures that use ACPI have support for @@ -434,30 +404,12 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, if (IS_ENABLED(CONFIG_PCIE_EDR)) support |= OSC_PCI_EDR_SUPPORT; - decode_osc_support(root, "OS supports", support); - status = acpi_pci_osc_support(root, support); - if (ACPI_FAILURE(status)) { - *no_aspm = 1; + return support; +} - /* _OSC is optional for PCI host bridges */ - if ((status == AE_NOT_FOUND) && !is_pcie) - return; - - dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n", - acpi_format_exception(status)); - return; - } - - if (pcie_ports_disabled) { - dev_info(&device->dev, "PCIe port services disabled; not requesting _OSC control\n"); - return; - } - - if ((support & ACPI_PCIE_REQ_SUPPORT) != ACPI_PCIE_REQ_SUPPORT) { - decode_osc_support(root, "not requesting OS control; OS requires", - ACPI_PCIE_REQ_SUPPORT); - return; - } +static u32 calculate_control(void) +{ + u32 control; control = OSC_PCI_EXPRESS_CAPABILITY_CONTROL | OSC_PCI_EXPRESS_PME_CONTROL; @@ -483,11 +435,59 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR)) control |= OSC_PCI_EXPRESS_DPC_CONTROL; - requested = control; - status = acpi_pci_osc_control_set(handle, &control, - OSC_PCI_EXPRESS_CAPABILITY_CONTROL); + return control; +} + +static bool os_control_query_checks(struct acpi_pci_root *root, u32 support) +{ + struct acpi_device *device = root->device; + + if (pcie_ports_disabled) { + dev_info(&device->dev, "PCIe port services disabled; not requesting _OSC control\n"); + return false; + } + + if ((support & ACPI_PCIE_REQ_SUPPORT) != ACPI_PCIE_REQ_SUPPORT) { + decode_osc_support(root, "not requesting OS control; OS requires", + ACPI_PCIE_REQ_SUPPORT); + return false; + } + + return true; +} + +static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, + bool is_pcie) +{ + u32 support, control = 0, requested = 0; + acpi_status status; + struct acpi_device *device = root->device; + acpi_handle handle = device->handle; + + /* + * Apple always return failure on _OSC calls when _OSI("Darwin") has + * been called successfully. 
We know the feature set supported by the + * platform, so avoid calling _OSC at all + */ + if (x86_apple_machine) { + root->osc_control_set = ~OSC_PCI_EXPRESS_PME_CONTROL; + decode_osc_control(root, "OS assumes control of", + root->osc_control_set); + return; + } + + support = calculate_support(); + + decode_osc_support(root, "OS supports", support); + + if (os_control_query_checks(root, support)) + requested = control = calculate_control(); + + status = acpi_pci_osc_control_set(handle, &control, support); if (ACPI_SUCCESS(status)) { - decode_osc_control(root, "OS now controls", control); + if (control) + decode_osc_control(root, "OS now controls", control); + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) { /* * We have ASPM control, but the FADT indicates that @@ -498,11 +498,6 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, *no_aspm = 1; } } else { - decode_osc_control(root, "OS requested", requested); - decode_osc_control(root, "platform willing to grant", control); - dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n", - acpi_format_exception(status)); - /* * We want to disable ASPM here, but aspm_disabled * needs to remain in its state from boot so that we @@ -511,6 +506,18 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm, * root scan. */ *no_aspm = 1; + + /* _OSC is optional for PCI host bridges */ + if ((status == AE_NOT_FOUND) && !is_pcie) + return; + + if (control) { + decode_osc_control(root, "OS requested", requested); + decode_osc_control(root, "platform willing to grant", control); + } + + dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n", + acpi_format_exception(status)); } } diff --git a/drivers/base/Makefile b/drivers/base/Makefile index ef8e44a7d288..02f7f1358e86 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -13,7 +13,7 @@ obj-y += power/ obj-$(CONFIG_ISA_BUS_API) += isa.o obj-y += firmware_loader/ obj-$(CONFIG_NUMA) += node.o -obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_MODULES) += module.o endif diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 00fb4120a5b3..bc1876915457 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -14,6 +14,7 @@ #include #include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -165,25 +166,86 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(ptr, size); } +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +static void __init pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", + __func__, 
PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); +} +#endif + void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; - int rc; + int rc = -EINVAL; - /* - * Always reserve area for module percpu variables. That's - * what the legacy allocator did. - */ - rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, - pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + if (rc < 0) + pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); +#endif + } + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) - panic("Failed to initialize percpu areas."); + rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + pcpu_fc_alloc, + pcpu_fc_free, + pcpu_populate_pte); +#endif + if (rc < 0) + panic("Failed to initialize percpu areas (err=%d).", rc); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) @@ -264,7 +326,7 @@ void __init numa_free_distance(void) size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - memblock_free_ptr(numa_distance, size); + memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; } @@ -275,15 +337,13 @@ void __init numa_free_distance(void) static int __init numa_alloc_distance(void) { size_t size; - u64 phys; int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); - if (WARN_ON(!phys)) + numa_distance = memblock_alloc(size, PAGE_SIZE); + if (WARN_ON(!numa_distance)) return -ENOMEM; - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; /* fill with the default distances */ diff --git a/drivers/base/node.c b/drivers/base/node.c index c56d34f8158f..b5a4ba18f9f9 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -629,7 +629,7 @@ static void node_device_release(struct device *dev) { struct node *node = to_node(dev); -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) /* * We schedule the work only when a memory section is * onlined/offlined on this node. 
When we come here, @@ -782,7 +782,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) return 0; } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static int __ref get_nid_for_pfn(unsigned long pfn) { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -958,10 +958,9 @@ static int node_memory_callback(struct notifier_block *self, return NOTIFY_OK; } #endif /* CONFIG_HUGETLBFS */ -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \ - !defined(CONFIG_HUGETLBFS) +#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) static inline int node_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index 69c10a7b7c61..960632197b05 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -162,7 +162,6 @@ static int bcma_host_pci_probe(struct pci_dev *dev, { struct bcma_bus *bus; int err = -ENOMEM; - const char *name; u32 val; /* Alloc */ @@ -175,10 +174,7 @@ static int bcma_host_pci_probe(struct pci_dev *dev, if (err) goto err_kfree_bus; - name = dev_name(&dev->dev); - if (dev->driver && dev->driver->name) - name = dev->driver->name; - err = pci_request_regions(dev, name); + err = pci_request_regions(dev, "bcma-pci-bridge"); if (err) goto err_pci_disable; pci_set_master(dev); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a68297fb51a2..c8931ba91b57 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -291,22 +291,16 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } -static ssize_t idle_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +/* + * Mark all pages which are older than or equal to cutoff as IDLE. + * Callers should hold the zram init lock in read mode + */ +static void mark_idle(struct zram *zram, ktime_t cutoff) { - struct zram *zram = dev_to_zram(dev); + int is_idle = 1; unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; int index; - if (!sysfs_streq(buf, "all")) - return -EINVAL; - - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } - for (index = 0; index < nr_pages; index++) { /* * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. @@ -314,14 +308,50 @@ static ssize_t idle_store(struct device *dev, */ zram_slot_lock(zram, index); if (zram_allocated(zram, index) && - !zram_test_flag(zram, index, ZRAM_UNDER_WB)) - zram_set_flag(zram, index, ZRAM_IDLE); + !zram_test_flag(zram, index, ZRAM_UNDER_WB)) { +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time); +#endif + if (is_idle) + zram_set_flag(zram, index, ZRAM_IDLE); + } zram_slot_unlock(zram, index); } +} +static ssize_t idle_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + ktime_t cutoff_time = 0; + ssize_t rv = -EINVAL; + + if (!sysfs_streq(buf, "all")) { + /* + * If it did not parse as 'all' try to treat it as an integer when + * we have memory tracking enabled. 
+ */ + u64 age_sec; + + if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec)) + cutoff_time = ktime_sub(ktime_get_boottime(), + ns_to_ktime(age_sec * NSEC_PER_SEC)); + else + goto out; + } + + down_read(&zram->init_lock); + if (!init_done(zram)) + goto out_unlock; + + /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + mark_idle(zram, cutoff_time); + rv = len; + +out_unlock: up_read(&zram->init_lock); - - return len; +out: + return rv; } #ifdef CONFIG_ZRAM_WRITEBACK @@ -587,7 +617,7 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, { struct bio *bio; - bio = bio_alloc(GFP_ATOMIC, 1); + bio = bio_alloc(GFP_NOIO, 1); if (!bio) return -ENOMEM; @@ -910,7 +940,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf, zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.'); - if (count < copied) { + if (count <= copied) { zram_slot_unlock(zram, index); break; } diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index fed52ae516ba..52d6cca6262e 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3118,7 +3118,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) }; int ret; - ret = strscpy(interface.name, pdev->driver->name, + ret = strscpy(interface.name, dev_driver_string(&pdev->dev), sizeof(interface.name)); if (ret < 0) return -ENAMETOOLONG; diff --git a/drivers/crypto/qat/qat_4xxx/adf_drv.c b/drivers/crypto/qat/qat_4xxx/adf_drv.c index 359fb7989dfb..71ef065914b2 100644 --- a/drivers/crypto/qat/qat_4xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_4xxx/adf_drv.c @@ -247,11 +247,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer.\n"); - ret = -EFAULT; - goto out_err; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state.\n"); @@ -304,6 +300,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; module_pci_driver(adf_driver); diff --git a/drivers/crypto/qat/qat_c3xxx/adf_drv.c b/drivers/crypto/qat/qat_c3xxx/adf_drv.c index cc6e75dc60de..2aef0bb791df 100644 --- a/drivers/crypto/qat/qat_c3xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_c3xxx/adf_drv.c @@ -33,6 +33,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev) @@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer\n"); - ret = -EFAULT; - goto out_err_free_reg; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state\n"); diff --git a/drivers/crypto/qat/qat_c62x/adf_drv.c b/drivers/crypto/qat/qat_c62x/adf_drv.c index bf251dfe74b3..56163083f161 100644 --- a/drivers/crypto/qat/qat_c62x/adf_drv.c +++ b/drivers/crypto/qat/qat_c62x/adf_drv.c @@ -33,6 +33,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev) @@ -192,11 +193,7 @@ static int 
adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer\n"); - ret = -EFAULT; - goto out_err_free_reg; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state\n"); diff --git a/drivers/crypto/qat/qat_common/adf_aer.c b/drivers/crypto/qat/qat_common/adf_aer.c index ed3e40bc56eb..fe9bb2f3536a 100644 --- a/drivers/crypto/qat/qat_common/adf_aer.c +++ b/drivers/crypto/qat/qat_common/adf_aer.c @@ -166,11 +166,12 @@ static void adf_resume(struct pci_dev *pdev) dev_info(&pdev->dev, "Device is up and running\n"); } -static const struct pci_error_handlers adf_err_handler = { +const struct pci_error_handlers adf_err_handler = { .error_detected = adf_error_detected, .slot_reset = adf_slot_reset, .resume = adf_resume, }; +EXPORT_SYMBOL_GPL(adf_err_handler); /** * adf_enable_aer() - Enable Advance Error Reporting for acceleration device @@ -179,17 +180,12 @@ static const struct pci_error_handlers adf_err_handler = { * Function enables PCI Advance Error Reporting for the * QAT acceleration device accel_dev. * To be used by QAT device specific drivers. - * - * Return: 0 on success, error code otherwise. */ -int adf_enable_aer(struct adf_accel_dev *accel_dev) +void adf_enable_aer(struct adf_accel_dev *accel_dev) { struct pci_dev *pdev = accel_to_pci_dev(accel_dev); - struct pci_driver *pdrv = pdev->driver; - pdrv->err_handler = &adf_err_handler; pci_enable_pcie_error_reporting(pdev); - return 0; } EXPORT_SYMBOL_GPL(adf_enable_aer); diff --git a/drivers/crypto/qat/qat_common/adf_common_drv.h b/drivers/crypto/qat/qat_common/adf_common_drv.h index 2cc6622833c4..de94b76a6d2c 100644 --- a/drivers/crypto/qat/qat_common/adf_common_drv.h +++ b/drivers/crypto/qat/qat_common/adf_common_drv.h @@ -94,7 +94,8 @@ void adf_ae_fw_release(struct adf_accel_dev *accel_dev); int adf_ae_start(struct adf_accel_dev *accel_dev); int adf_ae_stop(struct adf_accel_dev *accel_dev); -int adf_enable_aer(struct adf_accel_dev *accel_dev); +extern const struct pci_error_handlers adf_err_handler; +void adf_enable_aer(struct adf_accel_dev *accel_dev); void adf_disable_aer(struct adf_accel_dev *accel_dev); void adf_reset_sbr(struct adf_accel_dev *accel_dev); void adf_reset_flr(struct adf_accel_dev *accel_dev); diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c index 3976a81bd99b..acca56752aa0 100644 --- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c +++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c @@ -33,6 +33,7 @@ static struct pci_driver adf_driver = { .probe = adf_probe, .remove = adf_remove, .sriov_configure = adf_sriov_configure, + .err_handler = &adf_err_handler, }; static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev) @@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (adf_enable_aer(accel_dev)) { - dev_err(&pdev->dev, "Failed to enable aer\n"); - ret = -EFAULT; - goto out_err_free_reg; - } + adf_enable_aer(accel_dev); if (pci_save_state(pdev)) { dev_err(&pdev->dev, "Failed to save pci state\n"); diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c index 2ff1883dc788..4df55a55da84 100644 --- a/drivers/firmware/efi/memmap.c +++ b/drivers/firmware/efi/memmap.c @@ -35,7 +35,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags) if (slab_is_available()) memblock_free_late(phys, size); else - 
memblock_free(phys, size); + memblock_phys_free(phys, size); } else if (flags & EFI_MEMMAP_SLAB) { struct page *p = pfn_to_page(PHYS_PFN(phys)); unsigned int order = get_order(size); diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c index e50243580269..49b13cc01073 100644 --- a/drivers/hwmon/occ/p9_sbe.c +++ b/drivers/hwmon/occ/p9_sbe.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 96d4a1f8de79..565ef5598811 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -737,6 +738,31 @@ static int apple_dart_def_domain_type(struct device *dev) return 0; } +#ifndef CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR +/* Keep things compiling when CONFIG_PCI_APPLE isn't selected */ +#define CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR 0 +#endif +#define DOORBELL_ADDR (CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR & PAGE_MASK) + +static void apple_dart_get_resv_regions(struct device *dev, + struct list_head *head) +{ + if (IS_ENABLED(CONFIG_PCIE_APPLE) && dev_is_pci(dev)) { + struct iommu_resv_region *region; + int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; + + region = iommu_alloc_resv_region(DOORBELL_ADDR, + PAGE_SIZE, prot, + IOMMU_RESV_MSI); + if (!region) + return; + + list_add_tail(®ion->list, head); + } + + iommu_dma_get_resv_regions(dev, head); +} + static const struct iommu_ops apple_dart_iommu_ops = { .domain_alloc = apple_dart_domain_alloc, .domain_free = apple_dart_domain_free, @@ -753,6 +779,8 @@ static const struct iommu_ops apple_dart_iommu_ops = { .device_group = apple_dart_device_group, .of_xlate = apple_dart_of_xlate, .def_domain_type = apple_dart_def_domain_type, + .get_resv_regions = apple_dart_get_resv_regions, + .put_resv_regions = generic_iommu_put_resv_regions, .pgsize_bitmap = -1UL, /* Restricted during dart probe */ }; diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index d33913d523c1..a4fbc3fc713d 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -570,7 +570,7 @@ fail_msg_node: fail_db_node: of_node_put(smu->db_node); fail_bootmem: - memblock_free_ptr(smu, sizeof(struct smu_device)); + memblock_free(smu, sizeof(struct smu_device)); smu = NULL; fail_np: of_node_put(np); diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 7f7abc9069f7..b94d5e4fdc23 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -829,7 +829,6 @@ int mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, u8 cb_idx) { MPT_ADAPTER *ioc; - const struct pci_device_id *id; if (!cb_idx || cb_idx >= MPT_MAX_PROTOCOL_DRIVERS) return -EINVAL; @@ -838,10 +837,8 @@ mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, u8 cb_idx) /* call per pci device probe entry point */ list_for_each_entry(ioc, &ioc_list, list) { - id = ioc->pcidev->driver ? 
- ioc->pcidev->driver->id_table : NULL; if (dd_cbfunc->probe) - dd_cbfunc->probe(ioc->pcidev, id); + dd_cbfunc->probe(ioc->pcidev); } return 0; @@ -2032,7 +2029,7 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id) for(cb_idx = 0; cb_idx < MPT_MAX_PROTOCOL_DRIVERS; cb_idx++) { if(MptDeviceDriverHandlers[cb_idx] && MptDeviceDriverHandlers[cb_idx]->probe) { - MptDeviceDriverHandlers[cb_idx]->probe(pdev,id); + MptDeviceDriverHandlers[cb_idx]->probe(pdev); } } diff --git a/drivers/message/fusion/mptbase.h b/drivers/message/fusion/mptbase.h index b9e0376be723..4bd0682c65d3 100644 --- a/drivers/message/fusion/mptbase.h +++ b/drivers/message/fusion/mptbase.h @@ -257,7 +257,7 @@ typedef enum { } MPT_DRIVER_CLASS; struct mpt_pci_driver{ - int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); + int (*probe) (struct pci_dev *dev); void (*remove) (struct pci_dev *dev); }; diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 72025996cd70..ae433c150b37 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -114,7 +114,7 @@ static int mptctl_do_reset(MPT_ADAPTER *iocp, unsigned long arg); static int mptctl_hp_hostinfo(MPT_ADAPTER *iocp, unsigned long arg, unsigned int cmd); static int mptctl_hp_targetinfo(MPT_ADAPTER *iocp, unsigned long arg); -static int mptctl_probe(struct pci_dev *, const struct pci_device_id *); +static int mptctl_probe(struct pci_dev *); static void mptctl_remove(struct pci_dev *); #ifdef CONFIG_COMPAT @@ -2838,7 +2838,7 @@ static long compat_mpctl_ioctl(struct file *f, unsigned int cmd, unsigned long a */ static int -mptctl_probe(struct pci_dev *pdev, const struct pci_device_id *id) +mptctl_probe(struct pci_dev *pdev) { MPT_ADAPTER *ioc = pci_get_drvdata(pdev); diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index acdc257a900e..117fa4ebf6d7 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -1377,7 +1377,7 @@ mpt_register_lan_device (MPT_ADAPTER *mpt_dev, int pnum) } static int -mptlan_probe(struct pci_dev *pdev, const struct pci_device_id *id) +mptlan_probe(struct pci_dev *pdev) { MPT_ADAPTER *ioc = pci_get_drvdata(pdev); struct net_device *dev; diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 186308f1f8eb..9d485c9e3fff 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -20,34 +20,38 @@ static void pci_error_handlers(struct cxl_afu *afu, pci_channel_state_t state) { struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; if (afu->phb == NULL) return; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - if (!afu_dev->driver) + afu_drv = to_pci_driver(afu_dev->dev.driver); + if (!afu_drv) continue; + err_handler = afu_drv->err_handler; switch (bus_error_event) { case CXL_ERROR_DETECTED_EVENT: afu_dev->error_state = state; - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->error_detected) - afu_dev->driver->err_handler->error_detected(afu_dev, state); - break; + if (err_handler && + err_handler->error_detected) + err_handler->error_detected(afu_dev, state); + break; case CXL_SLOT_RESET_EVENT: afu_dev->error_state = state; - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->slot_reset) - afu_dev->driver->err_handler->slot_reset(afu_dev); - break; + if (err_handler && + err_handler->slot_reset) + err_handler->slot_reset(afu_dev); + break; case CXL_RESUME_EVENT: - if 
(afu_dev->driver->err_handler && - afu_dev->driver->err_handler->resume) - afu_dev->driver->err_handler->resume(afu_dev); - break; + if (err_handler && + err_handler->resume) + err_handler->resume(afu_dev); + break; } } } diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 2ba899f5659f..3de0aea62ade 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1795,6 +1795,8 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, pci_channel_state_t state) { struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET; @@ -1805,14 +1807,16 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu, return result; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - if (!afu_dev->driver) + afu_drv = to_pci_driver(afu_dev->dev.driver); + if (!afu_drv) continue; afu_dev->error_state = state; - if (afu_dev->driver->err_handler) - afu_result = afu_dev->driver->err_handler->error_detected(afu_dev, - state); + err_handler = afu_drv->err_handler; + if (err_handler) + afu_result = err_handler->error_detected(afu_dev, + state); /* Disconnect trumps all, NONE trumps NEED_RESET */ if (afu_result == PCI_ERS_RESULT_DISCONNECT) result = PCI_ERS_RESULT_DISCONNECT; @@ -1972,6 +1976,8 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) struct cxl_afu *afu; struct cxl_context *ctx; struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; pci_ers_result_t afu_result = PCI_ERS_RESULT_RECOVERED; pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED; int i; @@ -2028,12 +2034,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev) * shouldn't start new work until we call * their resume function. */ - if (!afu_dev->driver) + afu_drv = to_pci_driver(afu_dev->dev.driver); + if (!afu_drv) continue; - if (afu_dev->driver->err_handler && - afu_dev->driver->err_handler->slot_reset) - afu_result = afu_dev->driver->err_handler->slot_reset(afu_dev); + err_handler = afu_drv->err_handler; + if (err_handler && err_handler->slot_reset) + afu_result = err_handler->slot_reset(afu_dev); if (afu_result == PCI_ERS_RESULT_DISCONNECT) result = PCI_ERS_RESULT_DISCONNECT; @@ -2060,6 +2067,8 @@ static void cxl_pci_resume(struct pci_dev *pdev) struct cxl *adapter = pci_get_drvdata(pdev); struct cxl_afu *afu; struct pci_dev *afu_dev; + struct pci_driver *afu_drv; + const struct pci_error_handlers *err_handler; int i; /* Everything is back now. Drivers should restart work now. 
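Taken together with the mpt and qat hunks earlier, the cxl changes above converge on one NULL-safe accessor pattern: map the generic device->driver pointer back to its pci_driver with to_pci_driver() and check each step before invoking an optional error handler. A minimal sketch of that pattern follows, under the assumption that to_pci_driver() passes a NULL device_driver through as NULL, which the new cxl callers above rely on; call_resume_handlers() is a hypothetical name, not a kernel symbol.

/*
 * Sketch only: the accessor pattern this series rolls out. A device can
 * be unbound, so dev->dev.driver may be NULL, and a bound driver may not
 * provide error handlers, so both are checked before the callback runs.
 */
#include <linux/pci.h>

static void call_resume_handlers(struct pci_bus *bus)
{
	struct pci_dev *dev;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		struct pci_driver *drv = to_pci_driver(dev->dev.driver);
		const struct pci_error_handlers *eh;

		if (!drv)
			continue;

		eh = drv->err_handler;
		if (eh && eh->resume)
			eh->resume(dev);
	}
}

Routing through the struct device pointer keeps a single source of truth for the driver binding instead of the duplicate state in pci_dev::driver, which the hunks above stop relying on.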
@@ -2074,9 +2083,13 @@ static void cxl_pci_resume(struct pci_dev *pdev) continue; list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { - if (afu_dev->driver && afu_dev->driver->err_handler && - afu_dev->driver->err_handler->resume) - afu_dev->driver->err_handler->resume(afu_dev); + afu_drv = to_pci_driver(afu_dev->dev.driver); + if (!afu_drv) + continue; + + err_handler = afu_drv->err_handler; + if (err_handler && err_handler->resume) + err_handler->resume(afu_dev); } } spin_unlock(&adapter->afu_list_lock); diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index 63524551a13a..e6a2fd2c6d5c 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -10,7 +10,6 @@ #include #include -#include /* For nr_free_buffer_pages() */ #include #include diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index c8fd7f758938..c904e23c8237 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2409,6 +2409,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); + bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); idr_destroy(&mtd_idr); } diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h index a309016f7f8c..ecd025dda8d6 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/common.h +++ b/drivers/net/ethernet/chelsio/cxgb3/common.h @@ -676,8 +676,6 @@ void t3_link_changed(struct adapter *adapter, int port_id); void t3_link_fault(struct adapter *adapter, int port_id); int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); const struct adapter_info *t3_get_adapter_info(unsigned int board_id); -int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data); -int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data); int t3_seeprom_wp(struct adapter *adapter, int enable); int t3_get_tp_version(struct adapter *adapter, u32 *vers); int t3_check_tpsram_version(struct adapter *adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 9cf9e33664e4..bfffcaeee624 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2036,20 +2036,16 @@ static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, { struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; - int i, err = 0; - - u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; + int cnt; e->magic = EEPROM_MAGIC; - for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4) - err = t3_seeprom_read(adapter, i, (__le32 *) & buf[i]); + cnt = pci_read_vpd(adapter->pdev, e->offset, e->len, data); + if (cnt < 0) + return cnt; - if (!err) - memcpy(data, buf + e->offset, e->len); - kfree(buf); - return err; + e->len = cnt; + + return 0; } static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, @@ -2058,7 +2054,6 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; u32 aligned_offset, aligned_len; - __le32 *p; u8 *buf; int err; @@ -2072,12 +2067,9 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, buf = kmalloc(aligned_len, GFP_KERNEL); if (!buf) return -ENOMEM; - err = t3_seeprom_read(adapter, aligned_offset, (__le32 *) buf); - if (!err && aligned_len > 4) - err = t3_seeprom_read(adapter, - aligned_offset + aligned_len - 4, - 
(__le32 *) & buf[aligned_len - 4]); - if (err) + err = pci_read_vpd(adapter->pdev, aligned_offset, aligned_len, + buf); + if (err < 0) goto out; memcpy(buf + (eeprom->offset & 3), data, eeprom->len); } else @@ -2087,17 +2079,13 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, if (err) goto out; - for (p = (__le32 *) buf; !err && aligned_len; aligned_len -= 4, p++) { - err = t3_seeprom_write(adapter, aligned_offset, *p); - aligned_offset += 4; - } - - if (!err) + err = pci_write_vpd(adapter->pdev, aligned_offset, aligned_len, buf); + if (err >= 0) err = t3_seeprom_wp(adapter, 1); out: if (buf != data) kfree(buf); - return err; + return err < 0 ? err : 0; } static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c index 53feac8da503..da41eee2f25c 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c @@ -596,80 +596,9 @@ struct t3_vpd { u32 pad; /* for multiple-of-4 sizing and alignment */ }; -#define EEPROM_MAX_POLL 40 #define EEPROM_STAT_ADDR 0x4000 #define VPD_BASE 0xc00 -/** - * t3_seeprom_read - read a VPD EEPROM location - * @adapter: adapter to read - * @addr: EEPROM address - * @data: where to store the read data - * - * Read a 32-bit word from a location in VPD EEPROM using the card's PCI - * VPD ROM capability. A zero is written to the flag bit when the - * address is written to the control register. The hardware device will - * set the flag to 1 when 4 bytes have been read into the data register. - */ -int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data) -{ - u16 val; - int attempts = EEPROM_MAX_POLL; - u32 v; - unsigned int base = adapter->params.pci.vpd_cap_addr; - - if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) - return -EINVAL; - - pci_write_config_word(adapter->pdev, base + PCI_VPD_ADDR, addr); - do { - udelay(10); - pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); - } while (!(val & PCI_VPD_ADDR_F) && --attempts); - - if (!(val & PCI_VPD_ADDR_F)) { - CH_ERR(adapter, "reading EEPROM address 0x%x failed\n", addr); - return -EIO; - } - pci_read_config_dword(adapter->pdev, base + PCI_VPD_DATA, &v); - *data = cpu_to_le32(v); - return 0; -} - -/** - * t3_seeprom_write - write a VPD EEPROM location - * @adapter: adapter to write - * @addr: EEPROM address - * @data: value to write - * - * Write a 32-bit word to a location in VPD EEPROM using the card's PCI - * VPD ROM capability. 
- */ -int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data) -{ - u16 val; - int attempts = EEPROM_MAX_POLL; - unsigned int base = adapter->params.pci.vpd_cap_addr; - - if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3)) - return -EINVAL; - - pci_write_config_dword(adapter->pdev, base + PCI_VPD_DATA, - le32_to_cpu(data)); - pci_write_config_word(adapter->pdev,base + PCI_VPD_ADDR, - addr | PCI_VPD_ADDR_F); - do { - msleep(1); - pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val); - } while ((val & PCI_VPD_ADDR_F) && --attempts); - - if (val & PCI_VPD_ADDR_F) { - CH_ERR(adapter, "write to EEPROM address 0x%x failed\n", addr); - return -EIO; - } - return 0; -} - /** * t3_seeprom_wp - enable/disable EEPROM write protection * @adapter: the adapter @@ -679,7 +608,14 @@ int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data) */ int t3_seeprom_wp(struct adapter *adapter, int enable) { - return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0); + u32 data = enable ? 0xc : 0; + int ret; + + /* EEPROM_STAT_ADDR is outside VPD area, use pci_write_vpd_any() */ + ret = pci_write_vpd_any(adapter->pdev, EEPROM_STAT_ADDR, sizeof(u32), + &data); + + return ret < 0 ? ret : 0; } static int vpdstrtouint(char *s, u8 len, unsigned int base, unsigned int *val) @@ -709,24 +645,22 @@ static int vpdstrtou16(char *s, u8 len, unsigned int base, u16 *val) */ static int get_vpd_params(struct adapter *adapter, struct vpd_params *p) { - int i, addr, ret; struct t3_vpd vpd; + u8 base_val = 0; + int addr, ret; /* * Card information is normally at VPD_BASE but some early cards had * it at 0. */ - ret = t3_seeprom_read(adapter, VPD_BASE, (__le32 *)&vpd); - if (ret) + ret = pci_read_vpd(adapter->pdev, VPD_BASE, 1, &base_val); + if (ret < 0) return ret; - addr = vpd.id_tag == 0x82 ? VPD_BASE : 0; + addr = base_val == PCI_VPD_LRDT_ID_STRING ? 
VPD_BASE : 0; - for (i = 0; i < sizeof(vpd); i += 4) { - ret = t3_seeprom_read(adapter, addr + i, - (__le32 *)((u8 *)&vpd + i)); - if (ret) - return ret; - } + ret = pci_read_vpd(adapter->pdev, addr, sizeof(vpd), &vpd); + if (ret < 0) + return ret; ret = vpdstrtouint(vpd.cclk_data, vpd.cclk_len, 10, &p->cclk); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 5ebd96f6833d..9fdedd83f392 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -608,7 +608,7 @@ static void hns3_get_drvinfo(struct net_device *netdev, return; } - strncpy(drvinfo->driver, h->pdev->driver->name, + strncpy(drvinfo->driver, dev_driver_string(&h->pdev->dev), sizeof(drvinfo->driver)); drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0'; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c index 5d4d410b07c8..d650082496d6 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c @@ -776,7 +776,7 @@ out_release: static int prestera_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - const char *driver_name = pdev->driver->name; + const char *driver_name = dev_driver_string(&pdev->dev); struct prestera_fw *fw; int err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index fcace73eae40..a15c95a10bae 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1875,7 +1875,7 @@ static void mlxsw_pci_cmd_fini(struct mlxsw_pci *mlxsw_pci) static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - const char *driver_name = pdev->driver->name; + const char *driver_name = dev_driver_string(&pdev->dev); struct mlxsw_pci *mlxsw_pci; int err; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c index 0685ece1f155..1de076f55740 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c @@ -202,7 +202,8 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev *pdev, { char nsp_version[ETHTOOL_FWVERS_LEN] = {}; - strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver)); + strlcpy(drvinfo->driver, dev_driver_string(&pdev->dev), + sizeof(drvinfo->driver)); nfp_net_get_nspinfo(app, nsp_version); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%s %s %s %s", vnic_version, nsp_version, diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 352e14b007e7..32be5a03951f 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -156,10 +156,14 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) /* Now start the actual "proper" walk of the interrupt tree */ while (ipar != NULL) { - /* Now check if cursor is an interrupt-controller and if it is - * then we are done + /* + * Now check if cursor is an interrupt-controller and + * if it is then we are done, unless there is an + * interrupt-map which takes precedence. 
*/ - if (of_property_read_bool(ipar, "interrupt-controller")) { + imap = of_get_property(ipar, "interrupt-map", &imaplen); + if (imap == NULL && + of_property_read_bool(ipar, "interrupt-controller")) { pr_debug(" -> got it !\n"); return 0; } @@ -173,8 +177,6 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) goto fail; } - /* Now look for an interrupt-map */ - imap = of_get_property(ipar, "interrupt-map", &imaplen); /* No interrupt map, check for an interrupt parent */ if (imap == NULL) { pr_debug(" -> no map, getting parent\n"); @@ -255,6 +257,11 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) out_irq->args_count = intsize = newintsize; addrsize = newaddrsize; + if (ipar == newpar) { + pr_debug("%pOF interrupt-map entry to self\n", ipar); + return 0; + } + skiplevel: /* Iterate again with new parent */ out_irq->np = newpar; diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 761fd870d1db..b9bd1cff1793 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #define RNG_SEED_SIZE 128 @@ -170,8 +171,7 @@ int ima_free_kexec_buffer(void) if (ret) return ret; - return memblock_free(addr, size); - + return memblock_phys_free(addr, size); } /** diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 9da8835ba5a5..9c0fb962c22b 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -46,7 +46,7 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, if (nomap) { err = memblock_mark_nomap(base, size); if (err) - memblock_free(base, size); + memblock_phys_free(base, size); kmemleak_ignore_phys(base); } @@ -284,7 +284,8 @@ void __init fdt_init_reserved_mem(void) if (nomap) memblock_clear_nomap(rmem->base, rmem->size); else - memblock_free(rmem->base, rmem->size); + memblock_phys_free(rmem->base, + rmem->size); } } } diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 326f7d13024f..e917bb3652bb 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -254,7 +254,7 @@ config PCIE_MEDIATEK_GEN3 MediaTek SoCs. config VMD - depends on PCI_MSI && X86_64 && SRCU + depends on PCI_MSI && X86_64 && SRCU && !UML tristate "Intel Volume Management Device Driver" help Adds support for the Intel Volume Management Device (VMD). VMD is a @@ -312,6 +312,32 @@ config PCIE_HISI_ERR Say Y here if you want error handling support for the PCIe controller's errors on HiSilicon HIP SoCs +config PCIE_APPLE_MSI_DOORBELL_ADDR + hex + default 0xfffff000 + depends on PCIE_APPLE + +config PCIE_APPLE + tristate "Apple PCIe controller" + depends on ARCH_APPLE || COMPILE_TEST + depends on OF + depends on PCI_MSI_IRQ_DOMAIN + select PCI_HOST_COMMON + help + Say Y here if you want to enable PCIe controller support on Apple + system-on-chips, like the Apple M1. This is required for the USB + type-A ports, Ethernet, Wi-Fi, and Bluetooth. + + If unsure, say Y if you have an Apple Silicon system. + +config PCIE_MT7621 + tristate "MediaTek MT7621 PCIe Controller" + depends on (RALINK && SOC_MT7621) || (MIPS && COMPILE_TEST) + select PHY_MT7621_PCI + default SOC_MT7621 + help + This selects a driver for the MediaTek MT7621 PCIe Controller. 
+ source "drivers/pci/controller/dwc/Kconfig" source "drivers/pci/controller/mobiveil/Kconfig" source "drivers/pci/controller/cadence/Kconfig" diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index aaf30b3dcc14..37c8663de7fe 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -37,6 +37,9 @@ obj-$(CONFIG_VMD) += vmd.o obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o obj-$(CONFIG_PCI_LOONGSON) += pci-loongson.o obj-$(CONFIG_PCIE_HISI_ERR) += pcie-hisi-error.o +obj-$(CONFIG_PCIE_APPLE) += pcie-apple.o +obj-$(CONFIG_PCIE_MT7621) += pcie-mt7621.o + # pcie-hisi.o quirks are needed even without CONFIG_PCIE_DW obj-y += dwc/ obj-y += mobiveil/ diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index ffb176d288cd..918e11082e6a 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -474,7 +474,7 @@ static int j721e_pcie_probe(struct platform_device *pdev) ret = clk_prepare_enable(clk); if (ret) { dev_err(dev, "failed to enable pcie_refclk\n"); - goto err_get_sync; + goto err_pcie_setup; } pcie->refclk = clk; diff --git a/drivers/pci/controller/cadence/pcie-cadence-plat.c b/drivers/pci/controller/cadence/pcie-cadence-plat.c index 5fee0f89ab59..a224afadbcc0 100644 --- a/drivers/pci/controller/cadence/pcie-cadence-plat.c +++ b/drivers/pci/controller/cadence/pcie-cadence-plat.c @@ -127,6 +127,8 @@ static int cdns_plat_pcie_probe(struct platform_device *pdev) goto err_init; } + return 0; + err_init: err_get_sync: pm_runtime_put_sync(dev); diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 76c0a63a3f64..62ce3abf0f19 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -8,22 +8,20 @@ config PCIE_DW config PCIE_DW_HOST bool - depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW config PCIE_DW_EP bool - depends on PCI_ENDPOINT select PCIE_DW config PCI_DRA7XX - bool + tristate config PCI_DRA7XX_HOST - bool "TI DRA7xx PCIe controller Host Mode" + tristate "TI DRA7xx PCIe controller Host Mode" depends on SOC_DRA7XX || COMPILE_TEST - depends on PCI_MSI_IRQ_DOMAIN depends on OF && HAS_IOMEM && TI_PIPE3 + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST select PCI_DRA7XX default y if SOC_DRA7XX @@ -36,10 +34,10 @@ config PCI_DRA7XX_HOST This uses the DesignWare core. config PCI_DRA7XX_EP - bool "TI DRA7xx PCIe controller Endpoint Mode" + tristate "TI DRA7xx PCIe controller Endpoint Mode" depends on SOC_DRA7XX || COMPILE_TEST - depends on PCI_ENDPOINT depends on OF && HAS_IOMEM && TI_PIPE3 + depends on PCI_ENDPOINT select PCIE_DW_EP select PCI_DRA7XX help @@ -55,7 +53,7 @@ config PCIE_DW_PLAT config PCIE_DW_PLAT_HOST bool "Platform bus based DesignWare PCIe Controller - Host mode" - depends on PCI && PCI_MSI_IRQ_DOMAIN + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST select PCIE_DW_PLAT help @@ -138,8 +136,8 @@ config PCI_LAYERSCAPE bool "Freescale Layerscape PCIe controller - Host mode" depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) depends on PCI_MSI_IRQ_DOMAIN - select MFD_SYSCON select PCIE_DW_HOST + select MFD_SYSCON help Say Y here if you want to enable PCIe controller support on Layerscape SoCs to work in Host mode. @@ -180,6 +178,16 @@ config PCIE_QCOM PCIe controller uses the DesignWare core plus Qualcomm-specific hardware wrappers. 
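
Two of the hunks above are probe error-path fixes: pci-j721e.c jumps to the correct unwind label when clk_prepare_enable() fails, and pcie-cadence-plat.c adds the missing "return 0;" so that a successful probe no longer falls through into its unwind labels. A minimal sketch of the pattern being restored, with hypothetical helper names::

  static int example_probe(struct platform_device *pdev)
  {
          int ret;

          pm_runtime_enable(&pdev->dev);

          ret = example_setup(pdev);      /* hypothetical setup step */
          if (ret)
                  goto err_init;

          /* Success must return here ... */
          return 0;

  err_init:
          /* ... or a successful probe would tear itself down below */
          pm_runtime_put_sync(&pdev->dev);
          pm_runtime_disable(&pdev->dev);
          return ret;
  }
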
+config PCIE_QCOM_EP + tristate "Qualcomm PCIe controller - Endpoint mode" + depends on OF && (ARCH_QCOM || COMPILE_TEST) + depends on PCI_ENDPOINT + select PCIE_DW_EP + help + Say Y here to enable support for the PCIe controllers on Qualcomm SoCs + to work in endpoint mode. The PCIe controller uses the DesignWare core + plus Qualcomm-specific hardware wrappers. + config PCIE_ARMADA_8K bool "Marvell Armada-8K PCIe controller" depends on ARCH_MVEBU || COMPILE_TEST @@ -266,7 +274,7 @@ config PCIE_KEEMBAY_EP config PCIE_KIRIN depends on OF && (ARM64 || COMPILE_TEST) - bool "HiSilicon Kirin series SoCs PCIe controllers" + tristate "HiSilicon Kirin series SoCs PCIe controllers" depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST help @@ -283,8 +291,8 @@ config PCIE_HISI_STB config PCI_MESON tristate "MESON PCIe controller" - depends on PCI_MSI_IRQ_DOMAIN default m if ARCH_MESON + depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST help Say Y here if you want to enable PCI controller support on Amlogic diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index 73244409792c..8ba7b67f5e50 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o +obj-$(CONFIG_PCIE_QCOM_EP) += pcie-qcom-ep.o obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o obj-$(CONFIG_PCIE_ROCKCHIP_DW_HOST) += pcie-dw-rockchip.o diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index fbbb78f6885e..a4221f6f3629 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -7,6 +7,7 @@ * Authors: Kishon Vijay Abraham I */ +#include #include #include #include @@ -14,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -90,6 +91,7 @@ struct dra7xx_pcie { int phy_count; /* DT phy-names count */ struct phy **phy; struct irq_domain *irq_domain; + struct clk *clk; enum dw_pcie_device_mode mode; }; @@ -607,6 +609,7 @@ static const struct of_device_id of_dra7xx_pcie_match[] = { }, {}, }; +MODULE_DEVICE_TABLE(of, of_dra7xx_pcie_match); /* * dra7xx_pcie_unaligned_memaccess: workaround for AM572x/AM571x Errata i870 @@ -740,6 +743,15 @@ static int dra7xx_pcie_probe(struct platform_device *pdev) if (!link) return -ENOMEM; + dra7xx->clk = devm_clk_get_optional(dev, NULL); + if (IS_ERR(dra7xx->clk)) + return dev_err_probe(dev, PTR_ERR(dra7xx->clk), + "clock request failed"); + + ret = clk_prepare_enable(dra7xx->clk); + if (ret) + return ret; + for (i = 0; i < phy_count; i++) { snprintf(name, sizeof(name), "pcie-phy%d", i); phy[i] = devm_phy_get(dev, name); @@ -925,6 +937,8 @@ static void dra7xx_pcie_shutdown(struct platform_device *pdev) pm_runtime_disable(dev); dra7xx_pcie_disable_phy(dra7xx); + + clk_disable_unprepare(dra7xx->clk); } static const struct dev_pm_ops dra7xx_pcie_pm_ops = { @@ -943,4 +957,8 @@ static struct platform_driver dra7xx_pcie_driver = { }, .shutdown = dra7xx_pcie_shutdown, }; -builtin_platform_driver(dra7xx_pcie_driver); +module_platform_driver(dra7xx_pcie_driver); + +MODULE_AUTHOR("Kishon Vijay Abraham I "); +MODULE_DESCRIPTION("PCIe controller driver for TI DRA7xx SoCs"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 
80fc98acf097..26f49f797b0f 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -1132,7 +1132,7 @@ static int imx6_pcie_probe(struct platform_device *pdev) /* Limit link speed */ pci->link_gen = 1; - ret = of_property_read_u32(node, "fsl,max-link-speed", &pci->link_gen); + of_property_read_u32(node, "fsl,max-link-speed", &pci->link_gen); imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie"); if (IS_ERR(imx6_pcie->vpcie)) { diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 998b698f4085..0eda8236c125 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -83,6 +83,7 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) for (func_no = 0; func_no < funcs; func_no++) __dw_pcie_ep_reset_bar(pci, func_no, bar, 0); } +EXPORT_SYMBOL_GPL(dw_pcie_ep_reset_bar); static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8 func_no, u8 cap_ptr, u8 cap) @@ -485,6 +486,7 @@ int dw_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep, u8 func_no) return -EINVAL; } +EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_legacy_irq); int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, u8 interrupt_num) @@ -536,6 +538,7 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no, return 0; } +EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_msi_irq); int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no, u16 interrupt_num) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index d1d9b8344ec9..f4755f3a03be 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -335,6 +335,16 @@ int dw_pcie_host_init(struct pcie_port *pp) if (pci->link_gen < 1) pci->link_gen = of_pci_get_max_link_speed(np); + /* Set default bus ops */ + bridge->ops = &dw_pcie_ops; + bridge->child_ops = &dw_child_pcie_ops; + + if (pp->ops->host_init) { + ret = pp->ops->host_init(pp); + if (ret) + return ret; + } + if (pci_msi_enabled()) { pp->has_msi_ctrl = !(pp->ops->msi_host_init || of_property_read_bool(np, "msi-parent") || @@ -388,15 +398,6 @@ int dw_pcie_host_init(struct pcie_port *pp) } } - /* Set default bus ops */ - bridge->ops = &dw_pcie_ops; - bridge->child_ops = &dw_child_pcie_ops; - - if (pp->ops->host_init) { - ret = pp->ops->host_init(pp); - if (ret) - goto err_free_msi; - } dw_pcie_iatu_detect(pci); dw_pcie_setup_rc(pp); diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index a945f0c0e73d..850b4533f4ef 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -538,6 +538,7 @@ int dw_pcie_link_up(struct dw_pcie *pci) return ((val & PCIE_PORT_DEBUG1_LINK_UP) && (!(val & PCIE_PORT_DEBUG1_LINK_IN_TRAINING))); } +EXPORT_SYMBOL_GPL(dw_pcie_link_up); void dw_pcie_upconfig_setup(struct dw_pcie *pci) { diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 026fd1e42a55..095afbccf9c1 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -8,16 +8,18 @@ * Author: Xiaowei Song */ -#include #include +#include #include #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -28,26 +30,16 @@ #define to_kirin_pcie(x) dev_get_drvdata((x)->dev) -#define REF_CLK_FREQ 
100000000 - /* PCIe ELBI registers */ #define SOC_PCIECTRL_CTRL0_ADDR 0x000 #define SOC_PCIECTRL_CTRL1_ADDR 0x004 -#define SOC_PCIEPHY_CTRL2_ADDR 0x008 -#define SOC_PCIEPHY_CTRL3_ADDR 0x00c #define PCIE_ELBI_SLV_DBI_ENABLE (0x1 << 21) /* info located in APB */ #define PCIE_APP_LTSSM_ENABLE 0x01c -#define PCIE_APB_PHY_CTRL0 0x0 -#define PCIE_APB_PHY_CTRL1 0x4 #define PCIE_APB_PHY_STATUS0 0x400 #define PCIE_LINKUP_ENABLE (0x8020) #define PCIE_LTSSM_ENABLE_BIT (0x1 << 11) -#define PIPE_CLK_STABLE (0x1 << 19) -#define PHY_REF_PAD_BIT (0x1 << 8) -#define PHY_PWR_DOWN_BIT (0x1 << 22) -#define PHY_RST_ACK_BIT (0x1 << 16) /* info located in sysctrl */ #define SCTRL_PCIE_CMOS_OFFSET 0x60 @@ -60,17 +52,70 @@ #define PCIE_DEBOUNCE_PARAM 0xF0F400 #define PCIE_OE_BYPASS (0x3 << 28) +/* + * Max number of connected PCI slots at an external PCI bridge + * + * This is used on HiKey 970, which has a PEX 8606 bridge with 4 connected + * lanes: lane 0 is the upstream lane, and of the other three downstream + * lanes, one is connected to an on-board Ethernet adapter and the other + * two to M.2 and mini-PCIe slots. + * + * Each slot has a different clock source and uses a separate PERST# pin. + */ +#define MAX_PCI_SLOTS 3 + +enum pcie_kirin_phy_type { + PCIE_KIRIN_INTERNAL_PHY, + PCIE_KIRIN_EXTERNAL_PHY +}; + +struct kirin_pcie { + enum pcie_kirin_phy_type type; + + struct dw_pcie *pci; + struct regmap *apb; + struct phy *phy; + void *phy_priv; /* only for PCIE_KIRIN_INTERNAL_PHY */ + + /* DWC PERST# */ + int gpio_id_dwc_perst; + + /* Per-slot PERST# */ + int num_slots; + int gpio_id_reset[MAX_PCI_SLOTS]; + const char *reset_names[MAX_PCI_SLOTS]; + + /* Per-slot clkreq */ + int n_gpio_clkreq; + int gpio_id_clkreq[MAX_PCI_SLOTS]; + const char *clkreq_names[MAX_PCI_SLOTS]; +}; + +/* + * Kirin 960 PHY. Can't be split into a PHY driver without changing the + * DT schema.
+ */ + +#define REF_CLK_FREQ 100000000 + +/* PHY info located in APB */ +#define PCIE_APB_PHY_CTRL0 0x0 +#define PCIE_APB_PHY_CTRL1 0x4 +#define PCIE_APB_PHY_STATUS0 0x400 +#define PIPE_CLK_STABLE BIT(19) +#define PHY_REF_PAD_BIT BIT(8) +#define PHY_PWR_DOWN_BIT BIT(22) +#define PHY_RST_ACK_BIT BIT(16) + /* peri_crg ctrl */ #define CRGCTRL_PCIE_ASSERT_OFFSET 0x88 #define CRGCTRL_PCIE_ASSERT_BIT 0x8c000000 /* Time for delay */ -#define REF_2_PERST_MIN 20000 +#define REF_2_PERST_MIN 21000 #define REF_2_PERST_MAX 25000 #define PERST_2_ACCESS_MIN 10000 #define PERST_2_ACCESS_MAX 12000 -#define LINK_WAIT_MIN 900 -#define LINK_WAIT_MAX 1000 #define PIPE_CLK_WAIT_MIN 550 #define PIPE_CLK_WAIT_MAX 600 #define TIME_CMOS_MIN 100 @@ -78,118 +123,101 @@ #define TIME_PHY_PD_MIN 10 #define TIME_PHY_PD_MAX 11 -struct kirin_pcie { - struct dw_pcie *pci; - void __iomem *apb_base; - void __iomem *phy_base; +struct hi3660_pcie_phy { + struct device *dev; + void __iomem *base; struct regmap *crgctrl; struct regmap *sysctrl; struct clk *apb_sys_clk; struct clk *apb_phy_clk; struct clk *phy_ref_clk; - struct clk *pcie_aclk; - struct clk *pcie_aux_clk; - int gpio_id_reset; + struct clk *aclk; + struct clk *aux_clk; }; -/* Registers in PCIeCTRL */ -static inline void kirin_apb_ctrl_writel(struct kirin_pcie *kirin_pcie, - u32 val, u32 reg) -{ - writel(val, kirin_pcie->apb_base + reg); -} - -static inline u32 kirin_apb_ctrl_readl(struct kirin_pcie *kirin_pcie, u32 reg) -{ - return readl(kirin_pcie->apb_base + reg); -} - /* Registers in PCIePHY */ -static inline void kirin_apb_phy_writel(struct kirin_pcie *kirin_pcie, +static inline void kirin_apb_phy_writel(struct hi3660_pcie_phy *hi3660_pcie_phy, u32 val, u32 reg) { - writel(val, kirin_pcie->phy_base + reg); + writel(val, hi3660_pcie_phy->base + reg); } -static inline u32 kirin_apb_phy_readl(struct kirin_pcie *kirin_pcie, u32 reg) +static inline u32 kirin_apb_phy_readl(struct hi3660_pcie_phy *hi3660_pcie_phy, + u32 reg) { - return readl(kirin_pcie->phy_base + reg); + return readl(hi3660_pcie_phy->base + reg); } -static long kirin_pcie_get_clk(struct kirin_pcie *kirin_pcie, - struct platform_device *pdev) +static int hi3660_pcie_phy_get_clk(struct hi3660_pcie_phy *phy) { - struct device *dev = &pdev->dev; + struct device *dev = phy->dev; - kirin_pcie->phy_ref_clk = devm_clk_get(dev, "pcie_phy_ref"); - if (IS_ERR(kirin_pcie->phy_ref_clk)) - return PTR_ERR(kirin_pcie->phy_ref_clk); + phy->phy_ref_clk = devm_clk_get(dev, "pcie_phy_ref"); + if (IS_ERR(phy->phy_ref_clk)) + return PTR_ERR(phy->phy_ref_clk); - kirin_pcie->pcie_aux_clk = devm_clk_get(dev, "pcie_aux"); - if (IS_ERR(kirin_pcie->pcie_aux_clk)) - return PTR_ERR(kirin_pcie->pcie_aux_clk); + phy->aux_clk = devm_clk_get(dev, "pcie_aux"); + if (IS_ERR(phy->aux_clk)) + return PTR_ERR(phy->aux_clk); - kirin_pcie->apb_phy_clk = devm_clk_get(dev, "pcie_apb_phy"); - if (IS_ERR(kirin_pcie->apb_phy_clk)) - return PTR_ERR(kirin_pcie->apb_phy_clk); + phy->apb_phy_clk = devm_clk_get(dev, "pcie_apb_phy"); + if (IS_ERR(phy->apb_phy_clk)) + return PTR_ERR(phy->apb_phy_clk); - kirin_pcie->apb_sys_clk = devm_clk_get(dev, "pcie_apb_sys"); - if (IS_ERR(kirin_pcie->apb_sys_clk)) - return PTR_ERR(kirin_pcie->apb_sys_clk); + phy->apb_sys_clk = devm_clk_get(dev, "pcie_apb_sys"); + if (IS_ERR(phy->apb_sys_clk)) + return PTR_ERR(phy->apb_sys_clk); - kirin_pcie->pcie_aclk = devm_clk_get(dev, "pcie_aclk"); - if (IS_ERR(kirin_pcie->pcie_aclk)) - return PTR_ERR(kirin_pcie->pcie_aclk); + phy->aclk = devm_clk_get(dev, "pcie_aclk"); + if 
(IS_ERR(phy->aclk)) + return PTR_ERR(phy->aclk); return 0; } -static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, - struct platform_device *pdev) +static int hi3660_pcie_phy_get_resource(struct hi3660_pcie_phy *phy) { - kirin_pcie->apb_base = - devm_platform_ioremap_resource_byname(pdev, "apb"); - if (IS_ERR(kirin_pcie->apb_base)) - return PTR_ERR(kirin_pcie->apb_base); + struct device *dev = phy->dev; + struct platform_device *pdev; - kirin_pcie->phy_base = - devm_platform_ioremap_resource_byname(pdev, "phy"); - if (IS_ERR(kirin_pcie->phy_base)) - return PTR_ERR(kirin_pcie->phy_base); + /* registers */ + pdev = container_of(dev, struct platform_device, dev); - kirin_pcie->crgctrl = - syscon_regmap_lookup_by_compatible("hisilicon,hi3660-crgctrl"); - if (IS_ERR(kirin_pcie->crgctrl)) - return PTR_ERR(kirin_pcie->crgctrl); + phy->base = devm_platform_ioremap_resource_byname(pdev, "phy"); + if (IS_ERR(phy->base)) + return PTR_ERR(phy->base); - kirin_pcie->sysctrl = - syscon_regmap_lookup_by_compatible("hisilicon,hi3660-sctrl"); - if (IS_ERR(kirin_pcie->sysctrl)) - return PTR_ERR(kirin_pcie->sysctrl); + phy->crgctrl = syscon_regmap_lookup_by_compatible("hisilicon,hi3660-crgctrl"); + if (IS_ERR(phy->crgctrl)) + return PTR_ERR(phy->crgctrl); + + phy->sysctrl = syscon_regmap_lookup_by_compatible("hisilicon,hi3660-sctrl"); + if (IS_ERR(phy->sysctrl)) + return PTR_ERR(phy->sysctrl); return 0; } -static int kirin_pcie_phy_init(struct kirin_pcie *kirin_pcie) +static int hi3660_pcie_phy_start(struct hi3660_pcie_phy *phy) { - struct device *dev = kirin_pcie->pci->dev; + struct device *dev = phy->dev; u32 reg_val; - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL1); + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL1); reg_val &= ~PHY_REF_PAD_BIT; - kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL1); + kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL1); - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL0); + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL0); reg_val &= ~PHY_PWR_DOWN_BIT; - kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL0); + kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL0); usleep_range(TIME_PHY_PD_MIN, TIME_PHY_PD_MAX); - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL1); + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL1); reg_val &= ~PHY_RST_ACK_BIT; - kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL1); + kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL1); usleep_range(PIPE_CLK_WAIT_MIN, PIPE_CLK_WAIT_MAX); - reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_STATUS0); + reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_STATUS0); if (reg_val & PIPE_CLK_STABLE) { dev_err(dev, "PIPE clk is not stable\n"); return -EINVAL; @@ -198,102 +226,274 @@ static int kirin_pcie_phy_init(struct kirin_pcie *kirin_pcie) return 0; } -static void kirin_pcie_oe_enable(struct kirin_pcie *kirin_pcie) +static void hi3660_pcie_phy_oe_enable(struct hi3660_pcie_phy *phy) { u32 val; - regmap_read(kirin_pcie->sysctrl, SCTRL_PCIE_OE_OFFSET, &val); + regmap_read(phy->sysctrl, SCTRL_PCIE_OE_OFFSET, &val); val |= PCIE_DEBOUNCE_PARAM; val &= ~PCIE_OE_BYPASS; - regmap_write(kirin_pcie->sysctrl, SCTRL_PCIE_OE_OFFSET, val); + regmap_write(phy->sysctrl, SCTRL_PCIE_OE_OFFSET, val); } -static int kirin_pcie_clk_ctrl(struct kirin_pcie *kirin_pcie, bool enable) +static int hi3660_pcie_phy_clk_ctrl(struct hi3660_pcie_phy *phy, bool enable) { int ret = 0; if (!enable) goto close_clk; - ret = 
clk_set_rate(kirin_pcie->phy_ref_clk, REF_CLK_FREQ); + ret = clk_set_rate(phy->phy_ref_clk, REF_CLK_FREQ); if (ret) return ret; - ret = clk_prepare_enable(kirin_pcie->phy_ref_clk); + ret = clk_prepare_enable(phy->phy_ref_clk); if (ret) return ret; - ret = clk_prepare_enable(kirin_pcie->apb_sys_clk); + ret = clk_prepare_enable(phy->apb_sys_clk); if (ret) goto apb_sys_fail; - ret = clk_prepare_enable(kirin_pcie->apb_phy_clk); + ret = clk_prepare_enable(phy->apb_phy_clk); if (ret) goto apb_phy_fail; - ret = clk_prepare_enable(kirin_pcie->pcie_aclk); + ret = clk_prepare_enable(phy->aclk); if (ret) goto aclk_fail; - ret = clk_prepare_enable(kirin_pcie->pcie_aux_clk); + ret = clk_prepare_enable(phy->aux_clk); if (ret) goto aux_clk_fail; return 0; close_clk: - clk_disable_unprepare(kirin_pcie->pcie_aux_clk); + clk_disable_unprepare(phy->aux_clk); aux_clk_fail: - clk_disable_unprepare(kirin_pcie->pcie_aclk); + clk_disable_unprepare(phy->aclk); aclk_fail: - clk_disable_unprepare(kirin_pcie->apb_phy_clk); + clk_disable_unprepare(phy->apb_phy_clk); apb_phy_fail: - clk_disable_unprepare(kirin_pcie->apb_sys_clk); + clk_disable_unprepare(phy->apb_sys_clk); apb_sys_fail: - clk_disable_unprepare(kirin_pcie->phy_ref_clk); + clk_disable_unprepare(phy->phy_ref_clk); return ret; } -static int kirin_pcie_power_on(struct kirin_pcie *kirin_pcie) +static int hi3660_pcie_phy_power_on(struct kirin_pcie *pcie) { + struct hi3660_pcie_phy *phy = pcie->phy_priv; int ret; /* Power supply for Host */ - regmap_write(kirin_pcie->sysctrl, + regmap_write(phy->sysctrl, SCTRL_PCIE_CMOS_OFFSET, SCTRL_PCIE_CMOS_BIT); usleep_range(TIME_CMOS_MIN, TIME_CMOS_MAX); - kirin_pcie_oe_enable(kirin_pcie); - ret = kirin_pcie_clk_ctrl(kirin_pcie, true); + hi3660_pcie_phy_oe_enable(phy); + + ret = hi3660_pcie_phy_clk_ctrl(phy, true); if (ret) return ret; /* ISO disable, PCIeCtrl, PHY assert and clk gate clear */ - regmap_write(kirin_pcie->sysctrl, + regmap_write(phy->sysctrl, SCTRL_PCIE_ISO_OFFSET, SCTRL_PCIE_ISO_BIT); - regmap_write(kirin_pcie->crgctrl, + regmap_write(phy->crgctrl, CRGCTRL_PCIE_ASSERT_OFFSET, CRGCTRL_PCIE_ASSERT_BIT); - regmap_write(kirin_pcie->sysctrl, + regmap_write(phy->sysctrl, SCTRL_PCIE_HPCLK_OFFSET, SCTRL_PCIE_HPCLK_BIT); - ret = kirin_pcie_phy_init(kirin_pcie); + ret = hi3660_pcie_phy_start(phy); if (ret) - goto close_clk; + goto disable_clks; - /* perst assert Endpoint */ - if (!gpio_request(kirin_pcie->gpio_id_reset, "pcie_perst")) { - usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX); - ret = gpio_direction_output(kirin_pcie->gpio_id_reset, 1); - if (ret) - goto close_clk; - usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); + return 0; +disable_clks: + hi3660_pcie_phy_clk_ctrl(phy, false); + return ret; +} + +static int hi3660_pcie_phy_init(struct platform_device *pdev, + struct kirin_pcie *pcie) +{ + struct device *dev = &pdev->dev; + struct hi3660_pcie_phy *phy; + int ret; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) + return -ENOMEM; + + pcie->phy_priv = phy; + phy->dev = dev; + + /* registers */ + pdev = container_of(dev, struct platform_device, dev); + + ret = hi3660_pcie_phy_get_clk(phy); + if (ret) + return ret; + + return hi3660_pcie_phy_get_resource(phy); +} + +static int hi3660_pcie_phy_power_off(struct kirin_pcie *pcie) +{ + struct hi3660_pcie_phy *phy = pcie->phy_priv; + + /* Drop power supply for Host */ + regmap_write(phy->sysctrl, SCTRL_PCIE_CMOS_OFFSET, 0x00); + + hi3660_pcie_phy_clk_ctrl(phy, false); + + return 0; +} + +/* + * The non-PHY part starts here + */ + +static 
const struct regmap_config pcie_kirin_regmap_conf = { + .name = "kirin_pcie_apb", + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, +}; + +static int kirin_pcie_get_gpio_enable(struct kirin_pcie *pcie, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + char name[32]; + int ret, i; + + /* This is an optional property */ + ret = of_gpio_named_count(np, "hisilicon,clken-gpios"); + if (ret < 0) return 0; + + if (ret > MAX_PCI_SLOTS) { + dev_err(dev, "Too many GPIO clock requests!\n"); + return -EINVAL; } -close_clk: - kirin_pcie_clk_ctrl(kirin_pcie, false); + pcie->n_gpio_clkreq = ret; + + for (i = 0; i < pcie->n_gpio_clkreq; i++) { + pcie->gpio_id_clkreq[i] = of_get_named_gpio(dev->of_node, + "hisilicon,clken-gpios", i); + if (pcie->gpio_id_clkreq[i] < 0) + return pcie->gpio_id_clkreq[i]; + + sprintf(name, "pcie_clkreq_%d", i); + pcie->clkreq_names[i] = devm_kstrdup_const(dev, name, + GFP_KERNEL); + if (!pcie->clkreq_names[i]) + return -ENOMEM; + } + + return 0; +} + +static int kirin_pcie_parse_port(struct kirin_pcie *pcie, + struct platform_device *pdev, + struct device_node *node) +{ + struct device *dev = &pdev->dev; + struct device_node *parent, *child; + int ret, slot, i; + char name[32]; + + for_each_available_child_of_node(node, parent) { + for_each_available_child_of_node(parent, child) { + i = pcie->num_slots; + + pcie->gpio_id_reset[i] = of_get_named_gpio(child, + "reset-gpios", 0); + if (pcie->gpio_id_reset[i] < 0) + continue; + + pcie->num_slots++; + if (pcie->num_slots > MAX_PCI_SLOTS) { + dev_err(dev, "Too many PCI slots!\n"); + ret = -EINVAL; + goto put_node; + } + + ret = of_pci_get_devfn(child); + if (ret < 0) { + dev_err(dev, "failed to parse devfn: %d\n", ret); + goto put_node; + } + + slot = PCI_SLOT(ret); + + sprintf(name, "pcie_perst_%d", slot); + pcie->reset_names[i] = devm_kstrdup_const(dev, name, + GFP_KERNEL); + if (!pcie->reset_names[i]) { + ret = -ENOMEM; + goto put_node; + } + } + } + + return 0; + +put_node: + of_node_put(child); + of_node_put(parent); + return ret; +} + +static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct device_node *child, *node = dev->of_node; + void __iomem *apb_base; + int ret; + + apb_base = devm_platform_ioremap_resource_byname(pdev, "apb"); + if (IS_ERR(apb_base)) + return PTR_ERR(apb_base); + + kirin_pcie->apb = devm_regmap_init_mmio(dev, apb_base, + &pcie_kirin_regmap_conf); + if (IS_ERR(kirin_pcie->apb)) + return PTR_ERR(kirin_pcie->apb); + + /* pcie internal PERST# gpio */ + kirin_pcie->gpio_id_dwc_perst = of_get_named_gpio(dev->of_node, + "reset-gpios", 0); + if (kirin_pcie->gpio_id_dwc_perst == -EPROBE_DEFER) { + return -EPROBE_DEFER; + } else if (!gpio_is_valid(kirin_pcie->gpio_id_dwc_perst)) { + dev_err(dev, "unable to get a valid gpio pin\n"); + return -ENODEV; + } + + ret = kirin_pcie_get_gpio_enable(kirin_pcie, pdev); + if (ret) + return ret; + + /* Parse OF children */ + for_each_available_child_of_node(node, child) { + ret = kirin_pcie_parse_port(kirin_pcie, pdev, child); + if (ret) + goto put_node; + } + + return 0; + +put_node: + of_node_put(child); return ret; } @@ -302,13 +502,13 @@ static void kirin_pcie_sideband_dbi_w_mode(struct kirin_pcie *kirin_pcie, { u32 val; - val = kirin_apb_ctrl_readl(kirin_pcie, SOC_PCIECTRL_CTRL0_ADDR); + regmap_read(kirin_pcie->apb, SOC_PCIECTRL_CTRL0_ADDR, &val); if (on) val = val | PCIE_ELBI_SLV_DBI_ENABLE; else val = val & 
~PCIE_ELBI_SLV_DBI_ENABLE; - kirin_apb_ctrl_writel(kirin_pcie, val, SOC_PCIECTRL_CTRL0_ADDR); + regmap_write(kirin_pcie->apb, SOC_PCIECTRL_CTRL0_ADDR, val); } static void kirin_pcie_sideband_dbi_r_mode(struct kirin_pcie *kirin_pcie, @@ -316,13 +516,13 @@ static void kirin_pcie_sideband_dbi_r_mode(struct kirin_pcie *kirin_pcie, { u32 val; - val = kirin_apb_ctrl_readl(kirin_pcie, SOC_PCIECTRL_CTRL1_ADDR); + regmap_read(kirin_pcie->apb, SOC_PCIECTRL_CTRL1_ADDR, &val); if (on) val = val | PCIE_ELBI_SLV_DBI_ENABLE; else val = val & ~PCIE_ELBI_SLV_DBI_ENABLE; - kirin_apb_ctrl_writel(kirin_pcie, val, SOC_PCIECTRL_CTRL1_ADDR); + regmap_write(kirin_pcie->apb, SOC_PCIECTRL_CTRL1_ADDR, val); } static int kirin_pcie_rd_own_conf(struct pci_bus *bus, unsigned int devfn, @@ -351,9 +551,32 @@ static int kirin_pcie_wr_own_conf(struct pci_bus *bus, unsigned int devfn, return PCIBIOS_SUCCESSFUL; } +static int kirin_pcie_add_bus(struct pci_bus *bus) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(bus->sysdata); + struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci); + int i, ret; + + if (!kirin_pcie->num_slots) + return 0; + + /* Send PERST# to each slot */ + for (i = 0; i < kirin_pcie->num_slots; i++) { + ret = gpio_direction_output(kirin_pcie->gpio_id_reset[i], 1); + if (ret) { + dev_err(pci->dev, "PERST# %s error: %d\n", + kirin_pcie->reset_names[i], ret); + } + } + usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); + + return 0; +} + static struct pci_ops kirin_pci_ops = { .read = kirin_pcie_rd_own_conf, .write = kirin_pcie_wr_own_conf, + .add_bus = kirin_pcie_add_bus, }; static u32 kirin_pcie_read_dbi(struct dw_pcie *pci, void __iomem *base, @@ -382,8 +605,9 @@ static void kirin_pcie_write_dbi(struct dw_pcie *pci, void __iomem *base, static int kirin_pcie_link_up(struct dw_pcie *pci) { struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci); - u32 val = kirin_apb_ctrl_readl(kirin_pcie, PCIE_APB_PHY_STATUS0); + u32 val; + regmap_read(kirin_pcie->apb, PCIE_APB_PHY_STATUS0, &val); if ((val & PCIE_LINKUP_ENABLE) == PCIE_LINKUP_ENABLE) return 1; @@ -395,8 +619,8 @@ static int kirin_pcie_start_link(struct dw_pcie *pci) struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci); /* assert LTSSM enable */ - kirin_apb_ctrl_writel(kirin_pcie, PCIE_LTSSM_ENABLE_BIT, - PCIE_APP_LTSSM_ENABLE); + regmap_write(kirin_pcie->apb, PCIE_APP_LTSSM_ENABLE, + PCIE_LTSSM_ENABLE_BIT); return 0; } @@ -408,6 +632,44 @@ static int kirin_pcie_host_init(struct pcie_port *pp) return 0; } +static int kirin_pcie_gpio_request(struct kirin_pcie *kirin_pcie, + struct device *dev) +{ + int ret, i; + + for (i = 0; i < kirin_pcie->num_slots; i++) { + if (!gpio_is_valid(kirin_pcie->gpio_id_reset[i])) { + dev_err(dev, "unable to get a valid %s gpio\n", + kirin_pcie->reset_names[i]); + return -ENODEV; + } + + ret = devm_gpio_request(dev, kirin_pcie->gpio_id_reset[i], + kirin_pcie->reset_names[i]); + if (ret) + return ret; + } + + for (i = 0; i < kirin_pcie->n_gpio_clkreq; i++) { + if (!gpio_is_valid(kirin_pcie->gpio_id_clkreq[i])) { + dev_err(dev, "unable to get a valid %s gpio\n", + kirin_pcie->clkreq_names[i]); + return -ENODEV; + } + + ret = devm_gpio_request(dev, kirin_pcie->gpio_id_clkreq[i], + kirin_pcie->clkreq_names[i]); + if (ret) + return ret; + + ret = gpio_direction_output(kirin_pcie->gpio_id_clkreq[i], 0); + if (ret) + return ret; + } + + return 0; +} + static const struct dw_pcie_ops kirin_dw_pcie_ops = { .read_dbi = kirin_pcie_read_dbi, .write_dbi = kirin_pcie_write_dbi, @@ -419,8 +681,99 @@ static const struct dw_pcie_host_ops 
kirin_pcie_host_ops = { .host_init = kirin_pcie_host_init, }; +static int kirin_pcie_power_off(struct kirin_pcie *kirin_pcie) +{ + int i; + + if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) + return hi3660_pcie_phy_power_off(kirin_pcie); + + for (i = 0; i < kirin_pcie->n_gpio_clkreq; i++) + gpio_direction_output(kirin_pcie->gpio_id_clkreq[i], 1); + + phy_power_off(kirin_pcie->phy); + phy_exit(kirin_pcie->phy); + + return 0; +} + +static int kirin_pcie_power_on(struct platform_device *pdev, + struct kirin_pcie *kirin_pcie) +{ + struct device *dev = &pdev->dev; + int ret; + + if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) { + ret = hi3660_pcie_phy_init(pdev, kirin_pcie); + if (ret) + return ret; + + ret = hi3660_pcie_phy_power_on(kirin_pcie); + if (ret) + return ret; + } else { + kirin_pcie->phy = devm_of_phy_get(dev, dev->of_node, NULL); + if (IS_ERR(kirin_pcie->phy)) + return PTR_ERR(kirin_pcie->phy); + + ret = kirin_pcie_gpio_request(kirin_pcie, dev); + if (ret) + return ret; + + ret = phy_init(kirin_pcie->phy); + if (ret) + goto err; + + ret = phy_power_on(kirin_pcie->phy); + if (ret) + goto err; + } + + /* perst assert Endpoint */ + usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX); + + if (!gpio_request(kirin_pcie->gpio_id_dwc_perst, "pcie_perst_bridge")) { + ret = gpio_direction_output(kirin_pcie->gpio_id_dwc_perst, 1); + if (ret) + goto err; + } + + usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX); + + return 0; +err: + kirin_pcie_power_off(kirin_pcie); + + return ret; +} + +static int __exit kirin_pcie_remove(struct platform_device *pdev) +{ + struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev); + + dw_pcie_host_deinit(&kirin_pcie->pci->pp); + + kirin_pcie_power_off(kirin_pcie); + + return 0; +} + +static const struct of_device_id kirin_pcie_match[] = { + { + .compatible = "hisilicon,kirin960-pcie", + .data = (void *)PCIE_KIRIN_INTERNAL_PHY + }, + { + .compatible = "hisilicon,kirin970-pcie", + .data = (void *)PCIE_KIRIN_EXTERNAL_PHY + }, + {}, +}; + static int kirin_pcie_probe(struct platform_device *pdev) { + enum pcie_kirin_phy_type phy_type; + const struct of_device_id *of_id; struct device *dev = &pdev->dev; struct kirin_pcie *kirin_pcie; struct dw_pcie *pci; @@ -431,6 +784,14 @@ static int kirin_pcie_probe(struct platform_device *pdev) return -EINVAL; } + of_id = of_match_device(kirin_pcie_match, dev); + if (!of_id) { + dev_err(dev, "OF data missing\n"); + return -EINVAL; + } + + phy_type = (long)of_id->data; + kirin_pcie = devm_kzalloc(dev, sizeof(struct kirin_pcie), GFP_KERNEL); if (!kirin_pcie) return -ENOMEM; @@ -443,44 +804,33 @@ static int kirin_pcie_probe(struct platform_device *pdev) pci->ops = &kirin_dw_pcie_ops; pci->pp.ops = &kirin_pcie_host_ops; kirin_pcie->pci = pci; - - ret = kirin_pcie_get_clk(kirin_pcie, pdev); - if (ret) - return ret; + kirin_pcie->type = phy_type; ret = kirin_pcie_get_resource(kirin_pcie, pdev); if (ret) return ret; - kirin_pcie->gpio_id_reset = of_get_named_gpio(dev->of_node, - "reset-gpios", 0); - if (kirin_pcie->gpio_id_reset == -EPROBE_DEFER) { - return -EPROBE_DEFER; - } else if (!gpio_is_valid(kirin_pcie->gpio_id_reset)) { - dev_err(dev, "unable to get a valid gpio pin\n"); - return -ENODEV; - } + platform_set_drvdata(pdev, kirin_pcie); - ret = kirin_pcie_power_on(kirin_pcie); + ret = kirin_pcie_power_on(pdev, kirin_pcie); if (ret) return ret; - platform_set_drvdata(pdev, kirin_pcie); - return dw_pcie_host_init(&pci->pp); } -static const struct of_device_id kirin_pcie_match[] = { - { .compatible = "hisilicon,kirin960-pcie" }, 
- {}, -}; - static struct platform_driver kirin_pcie_driver = { .probe = kirin_pcie_probe, + .remove = __exit_p(kirin_pcie_remove), .driver = { .name = "kirin-pcie", - .of_match_table = kirin_pcie_match, - .suppress_bind_attrs = true, + .of_match_table = kirin_pcie_match, + .suppress_bind_attrs = true, }, }; -builtin_platform_driver(kirin_pcie_driver); +module_platform_driver(kirin_pcie_driver); + +MODULE_DEVICE_TABLE(of, kirin_pcie_match); +MODULE_DESCRIPTION("PCIe host controller driver for Kirin Phone SoCs"); +MODULE_AUTHOR("Xiaowei Song "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c new file mode 100644 index 000000000000..7b17da2f9b3f --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -0,0 +1,721 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Qualcomm PCIe Endpoint controller driver + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + * Author: Siddartha Mohanadoss + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-designware.h" + +/* PARF registers */ +#define PARF_SYS_CTRL 0x00 +#define PARF_DB_CTRL 0x10 +#define PARF_PM_CTRL 0x20 +#define PARF_MHI_BASE_ADDR_LOWER 0x178 +#define PARF_MHI_BASE_ADDR_UPPER 0x17c +#define PARF_DEBUG_INT_EN 0x190 +#define PARF_AXI_MSTR_RD_HALT_NO_WRITES 0x1a4 +#define PARF_AXI_MSTR_WR_ADDR_HALT 0x1a8 +#define PARF_Q2A_FLUSH 0x1ac +#define PARF_LTSSM 0x1b0 +#define PARF_CFG_BITS 0x210 +#define PARF_INT_ALL_STATUS 0x224 +#define PARF_INT_ALL_CLEAR 0x228 +#define PARF_INT_ALL_MASK 0x22c +#define PARF_SLV_ADDR_MSB_CTRL 0x2c0 +#define PARF_DBI_BASE_ADDR 0x350 +#define PARF_DBI_BASE_ADDR_HI 0x354 +#define PARF_SLV_ADDR_SPACE_SIZE 0x358 +#define PARF_SLV_ADDR_SPACE_SIZE_HI 0x35c +#define PARF_ATU_BASE_ADDR 0x634 +#define PARF_ATU_BASE_ADDR_HI 0x638 +#define PARF_SRIS_MODE 0x644 +#define PARF_DEVICE_TYPE 0x1000 +#define PARF_BDF_TO_SID_CFG 0x2c00 + +/* PARF_INT_ALL_{STATUS/CLEAR/MASK} register fields */ +#define PARF_INT_ALL_LINK_DOWN BIT(1) +#define PARF_INT_ALL_BME BIT(2) +#define PARF_INT_ALL_PM_TURNOFF BIT(3) +#define PARF_INT_ALL_DEBUG BIT(4) +#define PARF_INT_ALL_LTR BIT(5) +#define PARF_INT_ALL_MHI_Q6 BIT(6) +#define PARF_INT_ALL_MHI_A7 BIT(7) +#define PARF_INT_ALL_DSTATE_CHANGE BIT(8) +#define PARF_INT_ALL_L1SUB_TIMEOUT BIT(9) +#define PARF_INT_ALL_MMIO_WRITE BIT(10) +#define PARF_INT_ALL_CFG_WRITE BIT(11) +#define PARF_INT_ALL_BRIDGE_FLUSH_N BIT(12) +#define PARF_INT_ALL_LINK_UP BIT(13) +#define PARF_INT_ALL_AER_LEGACY BIT(14) +#define PARF_INT_ALL_PLS_ERR BIT(15) +#define PARF_INT_ALL_PME_LEGACY BIT(16) +#define PARF_INT_ALL_PLS_PME BIT(17) + +/* PARF_BDF_TO_SID_CFG register fields */ +#define PARF_BDF_TO_SID_BYPASS BIT(0) + +/* PARF_DEBUG_INT_EN register fields */ +#define PARF_DEBUG_INT_PM_DSTATE_CHANGE BIT(1) +#define PARF_DEBUG_INT_CFG_BUS_MASTER_EN BIT(2) +#define PARF_DEBUG_INT_RADM_PM_TURNOFF BIT(3) + +/* PARF_DEVICE_TYPE register fields */ +#define PARF_DEVICE_TYPE_EP 0x0 + +/* PARF_PM_CTRL register fields */ +#define PARF_PM_CTRL_REQ_EXIT_L1 BIT(1) +#define PARF_PM_CTRL_READY_ENTR_L23 BIT(2) +#define PARF_PM_CTRL_REQ_NOT_ENTR_L1 BIT(5) + +/* PARF_AXI_MSTR_RD_HALT_NO_WRITES register fields */ +#define PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN BIT(0) + +/* PARF_AXI_MSTR_WR_ADDR_HALT register fields */ +#define PARF_AXI_MSTR_WR_ADDR_HALT_EN BIT(31) + +/* PARF_Q2A_FLUSH register fields */ +#define PARF_Q2A_FLUSH_EN BIT(16) + +/* PARF_SYS_CTRL register fields */ +#define
PARF_SYS_CTRL_AUX_PWR_DET BIT(4) +#define PARF_SYS_CTRL_CORE_CLK_CGC_DIS BIT(6) +#define PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE BIT(11) + +/* PARF_DB_CTRL register fields */ +#define PARF_DB_CTRL_INSR_DBNCR_BLOCK BIT(0) +#define PARF_DB_CTRL_RMVL_DBNCR_BLOCK BIT(1) +#define PARF_DB_CTRL_DBI_WKP_BLOCK BIT(4) +#define PARF_DB_CTRL_SLV_WKP_BLOCK BIT(5) +#define PARF_DB_CTRL_MST_WKP_BLOCK BIT(6) + +/* PARF_CFG_BITS register fields */ +#define PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN BIT(1) + +/* ELBI registers */ +#define ELBI_SYS_STTS 0x08 + +/* DBI registers */ +#define DBI_CON_STATUS 0x44 + +/* DBI register fields */ +#define DBI_CON_STATUS_POWER_STATE_MASK GENMASK(1, 0) + +#define XMLH_LINK_UP 0x400 +#define CORE_RESET_TIME_US_MIN 1000 +#define CORE_RESET_TIME_US_MAX 1005 +#define WAKE_DELAY_US 2000 /* 2 ms */ + +#define to_pcie_ep(x) dev_get_drvdata((x)->dev) + +enum qcom_pcie_ep_link_status { + QCOM_PCIE_EP_LINK_DISABLED, + QCOM_PCIE_EP_LINK_ENABLED, + QCOM_PCIE_EP_LINK_UP, + QCOM_PCIE_EP_LINK_DOWN, +}; + +static struct clk_bulk_data qcom_pcie_ep_clks[] = { + { .id = "cfg" }, + { .id = "aux" }, + { .id = "bus_master" }, + { .id = "bus_slave" }, + { .id = "ref" }, + { .id = "sleep" }, + { .id = "slave_q2a" }, +}; + +struct qcom_pcie_ep { + struct dw_pcie pci; + + void __iomem *parf; + void __iomem *elbi; + struct regmap *perst_map; + struct resource *mmio_res; + + struct reset_control *core_reset; + struct gpio_desc *reset; + struct gpio_desc *wake; + struct phy *phy; + + u32 perst_en; + u32 perst_sep_en; + + enum qcom_pcie_ep_link_status link_status; + int global_irq; + int perst_irq; +}; + +static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep) +{ + struct dw_pcie *pci = &pcie_ep->pci; + struct device *dev = pci->dev; + int ret; + + ret = reset_control_assert(pcie_ep->core_reset); + if (ret) { + dev_err(dev, "Cannot assert core reset\n"); + return ret; + } + + usleep_range(CORE_RESET_TIME_US_MIN, CORE_RESET_TIME_US_MAX); + + ret = reset_control_deassert(pcie_ep->core_reset); + if (ret) { + dev_err(dev, "Cannot de-assert core reset\n"); + return ret; + } + + usleep_range(CORE_RESET_TIME_US_MIN, CORE_RESET_TIME_US_MAX); + + return 0; +} + +/* + * Delatch PERST_EN and PERST_SEPARATION_ENABLE with TCSR to avoid + * device reset during host reboot and hibernation. The driver is + * expected to handle this situation. 
+ */ +static void qcom_pcie_ep_configure_tcsr(struct qcom_pcie_ep *pcie_ep) +{ + regmap_write(pcie_ep->perst_map, pcie_ep->perst_en, 0); + regmap_write(pcie_ep->perst_map, pcie_ep->perst_sep_en, 0); +} + +static int qcom_pcie_dw_link_up(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + u32 reg; + + reg = readl_relaxed(pcie_ep->elbi + ELBI_SYS_STTS); + + return reg & XMLH_LINK_UP; +} + +static int qcom_pcie_dw_start_link(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + + enable_irq(pcie_ep->perst_irq); + + return 0; +} + +static void qcom_pcie_dw_stop_link(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + + disable_irq(pcie_ep->perst_irq); +} + +static int qcom_pcie_perst_deassert(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + struct device *dev = pci->dev; + u32 val, offset; + int ret; + + ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + if (ret) + return ret; + + ret = qcom_pcie_ep_core_reset(pcie_ep); + if (ret) + goto err_disable_clk; + + ret = phy_init(pcie_ep->phy); + if (ret) + goto err_disable_clk; + + ret = phy_power_on(pcie_ep->phy); + if (ret) + goto err_phy_exit; + + /* Assert WAKE# to RC to indicate device is ready */ + gpiod_set_value_cansleep(pcie_ep->wake, 1); + usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500); + gpiod_set_value_cansleep(pcie_ep->wake, 0); + + qcom_pcie_ep_configure_tcsr(pcie_ep); + + /* Disable BDF to SID mapping */ + val = readl_relaxed(pcie_ep->parf + PARF_BDF_TO_SID_CFG); + val |= PARF_BDF_TO_SID_BYPASS; + writel_relaxed(val, pcie_ep->parf + PARF_BDF_TO_SID_CFG); + + /* Enable debug IRQ */ + val = readl_relaxed(pcie_ep->parf + PARF_DEBUG_INT_EN); + val |= PARF_DEBUG_INT_RADM_PM_TURNOFF | + PARF_DEBUG_INT_CFG_BUS_MASTER_EN | + PARF_DEBUG_INT_PM_DSTATE_CHANGE; + writel_relaxed(val, pcie_ep->parf + PARF_DEBUG_INT_EN); + + /* Configure PCIe to endpoint mode */ + writel_relaxed(PARF_DEVICE_TYPE_EP, pcie_ep->parf + PARF_DEVICE_TYPE); + + /* Allow entering L1 state */ + val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL); + val &= ~PARF_PM_CTRL_REQ_NOT_ENTR_L1; + writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL); + + /* Read halts write */ + val = readl_relaxed(pcie_ep->parf + PARF_AXI_MSTR_RD_HALT_NO_WRITES); + val &= ~PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN; + writel_relaxed(val, pcie_ep->parf + PARF_AXI_MSTR_RD_HALT_NO_WRITES); + + /* Write after write halt */ + val = readl_relaxed(pcie_ep->parf + PARF_AXI_MSTR_WR_ADDR_HALT); + val |= PARF_AXI_MSTR_WR_ADDR_HALT_EN; + writel_relaxed(val, pcie_ep->parf + PARF_AXI_MSTR_WR_ADDR_HALT); + + /* Q2A flush disable */ + val = readl_relaxed(pcie_ep->parf + PARF_Q2A_FLUSH); + val &= ~PARF_Q2A_FLUSH_EN; + writel_relaxed(val, pcie_ep->parf + PARF_Q2A_FLUSH); + + /* Disable DBI Wakeup, core clock CGC and enable AUX power */ + val = readl_relaxed(pcie_ep->parf + PARF_SYS_CTRL); + val |= PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE | + PARF_SYS_CTRL_CORE_CLK_CGC_DIS | + PARF_SYS_CTRL_AUX_PWR_DET; + writel_relaxed(val, pcie_ep->parf + PARF_SYS_CTRL); + + /* Disable the debouncers */ + val = readl_relaxed(pcie_ep->parf + PARF_DB_CTRL); + val |= PARF_DB_CTRL_INSR_DBNCR_BLOCK | PARF_DB_CTRL_RMVL_DBNCR_BLOCK | + PARF_DB_CTRL_DBI_WKP_BLOCK | PARF_DB_CTRL_SLV_WKP_BLOCK | + PARF_DB_CTRL_MST_WKP_BLOCK; + writel_relaxed(val, pcie_ep->parf + PARF_DB_CTRL); + + /* Request to exit from L1SS for MSI and LTR MSG */ + val = readl_relaxed(pcie_ep->parf + PARF_CFG_BITS); + val |= PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN; 
+ writel_relaxed(val, pcie_ep->parf + PARF_CFG_BITS); + + dw_pcie_dbi_ro_wr_en(pci); + + /* Set the L0s Exit Latency to 2us-4us = 0x6 */ + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + val = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_L0SEL; + val |= FIELD_PREP(PCI_EXP_LNKCAP_L0SEL, 0x6); + dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, val); + + /* Set the L1 Exit Latency to be 32us-64 us = 0x6 */ + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + val = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_L1EL; + val |= FIELD_PREP(PCI_EXP_LNKCAP_L1EL, 0x6); + dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, val); + + dw_pcie_dbi_ro_wr_dis(pci); + + writel_relaxed(0, pcie_ep->parf + PARF_INT_ALL_MASK); + val = PARF_INT_ALL_LINK_DOWN | PARF_INT_ALL_BME | + PARF_INT_ALL_PM_TURNOFF | PARF_INT_ALL_DSTATE_CHANGE | + PARF_INT_ALL_LINK_UP; + writel_relaxed(val, pcie_ep->parf + PARF_INT_ALL_MASK); + + ret = dw_pcie_ep_init_complete(&pcie_ep->pci.ep); + if (ret) { + dev_err(dev, "Failed to complete initialization: %d\n", ret); + goto err_phy_power_off; + } + + /* + * The physical address of the MMIO region which is exposed as the BAR + * should be written to MHI BASE registers. + */ + writel_relaxed(pcie_ep->mmio_res->start, + pcie_ep->parf + PARF_MHI_BASE_ADDR_LOWER); + writel_relaxed(0, pcie_ep->parf + PARF_MHI_BASE_ADDR_UPPER); + + dw_pcie_ep_init_notify(&pcie_ep->pci.ep); + + /* Enable LTSSM */ + val = readl_relaxed(pcie_ep->parf + PARF_LTSSM); + val |= BIT(8); + writel_relaxed(val, pcie_ep->parf + PARF_LTSSM); + + return 0; + +err_phy_power_off: + phy_power_off(pcie_ep->phy); +err_phy_exit: + phy_exit(pcie_ep->phy); +err_disable_clk: + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + + return ret; +} + +static void qcom_pcie_perst_assert(struct dw_pcie *pci) +{ + struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci); + struct device *dev = pci->dev; + + if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) { + dev_dbg(dev, "Link is already disabled\n"); + return; + } + + phy_power_off(pcie_ep->phy); + phy_exit(pcie_ep->phy); + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED; +} + +/* Common DWC controller ops */ +static const struct dw_pcie_ops pci_ops = { + .link_up = qcom_pcie_dw_link_up, + .start_link = qcom_pcie_dw_start_link, + .stop_link = qcom_pcie_dw_stop_link, +}; + +static int qcom_pcie_ep_get_io_resources(struct platform_device *pdev, + struct qcom_pcie_ep *pcie_ep) +{ + struct device *dev = &pdev->dev; + struct dw_pcie *pci = &pcie_ep->pci; + struct device_node *syscon; + struct resource *res; + int ret; + + pcie_ep->parf = devm_platform_ioremap_resource_byname(pdev, "parf"); + if (IS_ERR(pcie_ep->parf)) + return PTR_ERR(pcie_ep->parf); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pci->dbi_base = devm_pci_remap_cfg_resource(dev, res); + if (IS_ERR(pci->dbi_base)) + return PTR_ERR(pci->dbi_base); + pci->dbi_base2 = pci->dbi_base; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi"); + pcie_ep->elbi = devm_pci_remap_cfg_resource(dev, res); + if (IS_ERR(pcie_ep->elbi)) + return PTR_ERR(pcie_ep->elbi); + + pcie_ep->mmio_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "mmio"); + + syscon = of_parse_phandle(dev->of_node, "qcom,perst-regs", 0); + if (!syscon) { + dev_err(dev, "Failed to parse qcom,perst-regs\n"); + return -EINVAL; + } + + 
pcie_ep->perst_map = syscon_node_to_regmap(syscon); + of_node_put(syscon); + if (IS_ERR(pcie_ep->perst_map)) + return PTR_ERR(pcie_ep->perst_map); + + ret = of_property_read_u32_index(dev->of_node, "qcom,perst-regs", + 1, &pcie_ep->perst_en); + if (ret < 0) { + dev_err(dev, "No Perst Enable offset in syscon\n"); + return ret; + } + + ret = of_property_read_u32_index(dev->of_node, "qcom,perst-regs", + 2, &pcie_ep->perst_sep_en); + if (ret < 0) { + dev_err(dev, "No Perst Separation Enable offset in syscon\n"); + return ret; + } + + return 0; +} + +static int qcom_pcie_ep_get_resources(struct platform_device *pdev, + struct qcom_pcie_ep *pcie_ep) +{ + struct device *dev = &pdev->dev; + int ret; + + ret = qcom_pcie_ep_get_io_resources(pdev, pcie_ep); + if (ret) { + dev_err(&pdev->dev, "Failed to get io resources %d\n", ret); + return ret; + } + + ret = devm_clk_bulk_get(dev, ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + if (ret) + return ret; + + pcie_ep->core_reset = devm_reset_control_get_exclusive(dev, "core"); + if (IS_ERR(pcie_ep->core_reset)) + return PTR_ERR(pcie_ep->core_reset); + + pcie_ep->reset = devm_gpiod_get(dev, "reset", GPIOD_IN); + if (IS_ERR(pcie_ep->reset)) + return PTR_ERR(pcie_ep->reset); + + pcie_ep->wake = devm_gpiod_get_optional(dev, "wake", GPIOD_OUT_LOW); + if (IS_ERR(pcie_ep->wake)) + return PTR_ERR(pcie_ep->wake); + + pcie_ep->phy = devm_phy_optional_get(&pdev->dev, "pciephy"); + if (IS_ERR(pcie_ep->phy)) + ret = PTR_ERR(pcie_ep->phy); + + return ret; +} + +/* TODO: Notify clients about PCIe state change */ +static irqreturn_t qcom_pcie_ep_global_irq_thread(int irq, void *data) +{ + struct qcom_pcie_ep *pcie_ep = data; + struct dw_pcie *pci = &pcie_ep->pci; + struct device *dev = pci->dev; + u32 status = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_STATUS); + u32 mask = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_MASK); + u32 dstate, val; + + writel_relaxed(status, pcie_ep->parf + PARF_INT_ALL_CLEAR); + status &= mask; + + if (FIELD_GET(PARF_INT_ALL_LINK_DOWN, status)) { + dev_dbg(dev, "Received Linkdown event\n"); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_DOWN; + } else if (FIELD_GET(PARF_INT_ALL_BME, status)) { + dev_dbg(dev, "Received BME event. Link is enabled!\n"); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_ENABLED; + } else if (FIELD_GET(PARF_INT_ALL_PM_TURNOFF, status)) { + dev_dbg(dev, "Received PM Turn-off event! Entering L23\n"); + val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL); + val |= PARF_PM_CTRL_READY_ENTR_L23; + writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL); + } else if (FIELD_GET(PARF_INT_ALL_DSTATE_CHANGE, status)) { + dstate = dw_pcie_readl_dbi(pci, DBI_CON_STATUS) & + DBI_CON_STATUS_POWER_STATE_MASK; + dev_dbg(dev, "Received D%d state event\n", dstate); + if (dstate == 3) { + val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL); + val |= PARF_PM_CTRL_REQ_EXIT_L1; + writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL); + } + } else if (FIELD_GET(PARF_INT_ALL_LINK_UP, status)) { + dev_dbg(dev, "Received Linkup event. Enumeration complete!\n"); + dw_pcie_ep_linkup(&pci->ep); + pcie_ep->link_status = QCOM_PCIE_EP_LINK_UP; + } else { + dev_dbg(dev, "Received unknown event: %d\n", status); + } + + return IRQ_HANDLED; +} + +static irqreturn_t qcom_pcie_ep_perst_irq_thread(int irq, void *data) +{ + struct qcom_pcie_ep *pcie_ep = data; + struct dw_pcie *pci = &pcie_ep->pci; + struct device *dev = pci->dev; + u32 perst; + + perst = gpiod_get_value(pcie_ep->reset); + if (perst) { + dev_dbg(dev, "PERST asserted by host. 
Shutting down the PCIe link!\n"); + qcom_pcie_perst_assert(pci); + } else { + dev_dbg(dev, "PERST de-asserted by host. Starting link training!\n"); + qcom_pcie_perst_deassert(pci); + } + + irq_set_irq_type(gpiod_to_irq(pcie_ep->reset), + (perst ? IRQF_TRIGGER_HIGH : IRQF_TRIGGER_LOW)); + + return IRQ_HANDLED; +} + +static int qcom_pcie_ep_enable_irq_resources(struct platform_device *pdev, + struct qcom_pcie_ep *pcie_ep) +{ + int irq, ret; + + irq = platform_get_irq_byname(pdev, "global"); + if (irq < 0) { + dev_err(&pdev->dev, "Failed to get Global IRQ\n"); + return irq; + } + + ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + qcom_pcie_ep_global_irq_thread, + IRQF_ONESHOT, + "global_irq", pcie_ep); + if (ret) { + dev_err(&pdev->dev, "Failed to request Global IRQ\n"); + return ret; + } + + pcie_ep->perst_irq = gpiod_to_irq(pcie_ep->reset); + irq_set_status_flags(pcie_ep->perst_irq, IRQ_NOAUTOEN); + ret = devm_request_threaded_irq(&pdev->dev, pcie_ep->perst_irq, NULL, + qcom_pcie_ep_perst_irq_thread, + IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + "perst_irq", pcie_ep); + if (ret) { + dev_err(&pdev->dev, "Failed to request PERST IRQ\n"); + disable_irq(irq); + return ret; + } + + return 0; +} + +static int qcom_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no, + enum pci_epc_irq_type type, u16 interrupt_num) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + + switch (type) { + case PCI_EPC_IRQ_LEGACY: + return dw_pcie_ep_raise_legacy_irq(ep, func_no); + case PCI_EPC_IRQ_MSI: + return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num); + default: + dev_err(pci->dev, "Unknown IRQ type\n"); + return -EINVAL; + } +} + +static const struct pci_epc_features qcom_pcie_epc_features = { + .linkup_notifier = true, + .core_init_notifier = true, + .msi_capable = true, + .msix_capable = false, +}; + +static const struct pci_epc_features * +qcom_pcie_epc_get_features(struct dw_pcie_ep *pci_ep) +{ + return &qcom_pcie_epc_features; +} + +static void qcom_pcie_ep_init(struct dw_pcie_ep *ep) +{ + struct dw_pcie *pci = to_dw_pcie_from_ep(ep); + enum pci_barno bar; + + for (bar = BAR_0; bar <= BAR_5; bar++) + dw_pcie_ep_reset_bar(pci, bar); +} + +static struct dw_pcie_ep_ops pci_ep_ops = { + .ep_init = qcom_pcie_ep_init, + .raise_irq = qcom_pcie_ep_raise_irq, + .get_features = qcom_pcie_epc_get_features, +}; + +static int qcom_pcie_ep_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct qcom_pcie_ep *pcie_ep; + int ret; + + pcie_ep = devm_kzalloc(dev, sizeof(*pcie_ep), GFP_KERNEL); + if (!pcie_ep) + return -ENOMEM; + + pcie_ep->pci.dev = dev; + pcie_ep->pci.ops = &pci_ops; + pcie_ep->pci.ep.ops = &pci_ep_ops; + platform_set_drvdata(pdev, pcie_ep); + + ret = qcom_pcie_ep_get_resources(pdev, pcie_ep); + if (ret) + return ret; + + ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + if (ret) + return ret; + + ret = qcom_pcie_ep_core_reset(pcie_ep); + if (ret) + goto err_disable_clk; + + ret = phy_init(pcie_ep->phy); + if (ret) + goto err_disable_clk; + + /* PHY needs to be powered on for dw_pcie_ep_init() */ + ret = phy_power_on(pcie_ep->phy); + if (ret) + goto err_phy_exit; + + ret = dw_pcie_ep_init(&pcie_ep->pci.ep); + if (ret) { + dev_err(dev, "Failed to initialize endpoint: %d\n", ret); + goto err_phy_power_off; + } + + ret = qcom_pcie_ep_enable_irq_resources(pdev, pcie_ep); + if (ret) + goto err_phy_power_off; + + return 0; + +err_phy_power_off: + phy_power_off(pcie_ep->phy); +err_phy_exit: + phy_exit(pcie_ep->phy); +err_disable_clk: + 
clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + + return ret; +} + +static int qcom_pcie_ep_remove(struct platform_device *pdev) +{ + struct qcom_pcie_ep *pcie_ep = platform_get_drvdata(pdev); + + if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) + return 0; + + phy_power_off(pcie_ep->phy); + phy_exit(pcie_ep->phy); + clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks), + qcom_pcie_ep_clks); + + return 0; +} + +static const struct of_device_id qcom_pcie_ep_match[] = { + { .compatible = "qcom,sdx55-pcie-ep", }, + { } +}; + +static struct platform_driver qcom_pcie_ep_driver = { + .probe = qcom_pcie_ep_probe, + .remove = qcom_pcie_ep_remove, + .driver = { + .name = "qcom-pcie-ep", + .of_match_table = qcom_pcie_ep_match, + }, +}; +builtin_platform_driver(qcom_pcie_ep_driver); + +MODULE_AUTHOR("Siddartha Mohanadoss "); +MODULE_AUTHOR("Manivannan Sadhasivam "); +MODULE_DESCRIPTION("Qualcomm PCIe Endpoint controller driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 8a7a300163e5..1c3d1116bb60 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -166,6 +166,9 @@ struct qcom_pcie_resources_2_7_0 { struct regulator_bulk_data supplies[2]; struct reset_control *pci_reset; struct clk *pipe_clk; + struct clk *pipe_clk_src; + struct clk *phy_pipe_clk; + struct clk *ref_clk_src; }; union qcom_pcie_resources { @@ -189,6 +192,11 @@ struct qcom_pcie_ops { int (*config_sid)(struct qcom_pcie *pcie); }; +struct qcom_pcie_cfg { + const struct qcom_pcie_ops *ops; + unsigned int pipe_clk_need_muxing:1; +}; + struct qcom_pcie { struct dw_pcie *pci; void __iomem *parf; /* DT parf */ @@ -197,6 +205,7 @@ struct qcom_pcie { struct phy *phy; struct gpio_desc *reset; const struct qcom_pcie_ops *ops; + unsigned int pipe_clk_need_muxing:1; }; #define to_qcom_pcie(x) dev_get_drvdata((x)->dev) @@ -1167,6 +1176,20 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) if (ret < 0) return ret; + if (pcie->pipe_clk_need_muxing) { + res->pipe_clk_src = devm_clk_get(dev, "pipe_mux"); + if (IS_ERR(res->pipe_clk_src)) + return PTR_ERR(res->pipe_clk_src); + + res->phy_pipe_clk = devm_clk_get(dev, "phy_pipe"); + if (IS_ERR(res->phy_pipe_clk)) + return PTR_ERR(res->phy_pipe_clk); + + res->ref_clk_src = devm_clk_get(dev, "ref"); + if (IS_ERR(res->ref_clk_src)) + return PTR_ERR(res->ref_clk_src); + } + res->pipe_clk = devm_clk_get(dev, "pipe"); return PTR_ERR_OR_ZERO(res->pipe_clk); } @@ -1185,6 +1208,10 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie) return ret; } + /* Set TCXO as clock source for pcie_pipe_clk_src */ + if (pcie->pipe_clk_need_muxing) + clk_set_parent(res->pipe_clk_src, res->ref_clk_src); + ret = clk_bulk_prepare_enable(res->num_clks, res->clks); if (ret < 0) goto err_disable_regulators; @@ -1256,6 +1283,10 @@ static int qcom_pcie_post_init_2_7_0(struct qcom_pcie *pcie) { struct qcom_pcie_resources_2_7_0 *res = &pcie->res.v2_7_0; + /* Set pipe clock as clock source for pcie_pipe_clk_src */ + if (pcie->pipe_clk_need_muxing) + clk_set_parent(res->pipe_clk_src, res->phy_pipe_clk); + return clk_prepare_enable(res->pipe_clk); } @@ -1456,6 +1487,39 @@ static const struct qcom_pcie_ops ops_1_9_0 = { .config_sid = qcom_pcie_config_sid_sm8250, }; +static const struct qcom_pcie_cfg apq8084_cfg = { + .ops = &ops_1_0_0, +}; + +static const struct qcom_pcie_cfg ipq8064_cfg = { + .ops = &ops_2_1_0, +}; + +static const struct 
qcom_pcie_cfg msm8996_cfg = { + .ops = &ops_2_3_2, +}; + +static const struct qcom_pcie_cfg ipq8074_cfg = { + .ops = &ops_2_3_3, +}; + +static const struct qcom_pcie_cfg ipq4019_cfg = { + .ops = &ops_2_4_0, +}; + +static const struct qcom_pcie_cfg sdm845_cfg = { + .ops = &ops_2_7_0, +}; + +static const struct qcom_pcie_cfg sm8250_cfg = { + .ops = &ops_1_9_0, +}; + +static const struct qcom_pcie_cfg sc7280_cfg = { + .ops = &ops_1_9_0, + .pipe_clk_need_muxing = true, +}; + static const struct dw_pcie_ops dw_pcie_ops = { .link_up = qcom_pcie_link_up, .start_link = qcom_pcie_start_link, @@ -1467,6 +1531,7 @@ static int qcom_pcie_probe(struct platform_device *pdev) struct pcie_port *pp; struct dw_pcie *pci; struct qcom_pcie *pcie; + const struct qcom_pcie_cfg *pcie_cfg; int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); @@ -1488,7 +1553,14 @@ static int qcom_pcie_probe(struct platform_device *pdev) pcie->pci = pci; - pcie->ops = of_device_get_match_data(dev); + pcie_cfg = of_device_get_match_data(dev); + if (!pcie_cfg || !pcie_cfg->ops) { + dev_err(dev, "Invalid platform data\n"); + return -EINVAL; + } + + pcie->ops = pcie_cfg->ops; + pcie->pipe_clk_need_muxing = pcie_cfg->pipe_clk_need_muxing; pcie->reset = devm_gpiod_get_optional(dev, "perst", GPIOD_OUT_HIGH); if (IS_ERR(pcie->reset)) { @@ -1545,16 +1617,18 @@ err_pm_runtime_put: } static const struct of_device_id qcom_pcie_match[] = { - { .compatible = "qcom,pcie-apq8084", .data = &ops_1_0_0 }, - { .compatible = "qcom,pcie-ipq8064", .data = &ops_2_1_0 }, - { .compatible = "qcom,pcie-ipq8064-v2", .data = &ops_2_1_0 }, - { .compatible = "qcom,pcie-apq8064", .data = &ops_2_1_0 }, - { .compatible = "qcom,pcie-msm8996", .data = &ops_2_3_2 }, - { .compatible = "qcom,pcie-ipq8074", .data = &ops_2_3_3 }, - { .compatible = "qcom,pcie-ipq4019", .data = &ops_2_4_0 }, - { .compatible = "qcom,pcie-qcs404", .data = &ops_2_4_0 }, - { .compatible = "qcom,pcie-sdm845", .data = &ops_2_7_0 }, - { .compatible = "qcom,pcie-sm8250", .data = &ops_1_9_0 }, + { .compatible = "qcom,pcie-apq8084", .data = &apq8084_cfg }, + { .compatible = "qcom,pcie-ipq8064", .data = &ipq8064_cfg }, + { .compatible = "qcom,pcie-ipq8064-v2", .data = &ipq8064_cfg }, + { .compatible = "qcom,pcie-apq8064", .data = &ipq8064_cfg }, + { .compatible = "qcom,pcie-msm8996", .data = &msm8996_cfg }, + { .compatible = "qcom,pcie-ipq8074", .data = &ipq8074_cfg }, + { .compatible = "qcom,pcie-ipq4019", .data = &ipq4019_cfg }, + { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg }, + { .compatible = "qcom,pcie-sdm845", .data = &sdm845_cfg }, + { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg }, + { .compatible = "qcom,pcie-sc8180x", .data = &sm8250_cfg }, + { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg }, { } }; diff --git a/drivers/pci/controller/dwc/pcie-uniphier.c b/drivers/pci/controller/dwc/pcie-uniphier.c index d842fd018129..d05be942956e 100644 --- a/drivers/pci/controller/dwc/pcie-uniphier.c +++ b/drivers/pci/controller/dwc/pcie-uniphier.c @@ -168,30 +168,21 @@ static void uniphier_pcie_irq_enable(struct uniphier_pcie_priv *priv) writel(PCL_RCV_INTX_ALL_ENABLE, priv->base + PCL_RCV_INTX); } -static void uniphier_pcie_irq_ack(struct irq_data *d) -{ - struct pcie_port *pp = irq_data_get_irq_chip_data(d); - struct dw_pcie *pci = to_dw_pcie_from_pp(pp); - struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci); - u32 val; - - val = readl(priv->base + PCL_RCV_INTX); - val &= ~PCL_RCV_INTX_ALL_STATUS; - val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_STATUS_SHIFT); - 
writel(val, priv->base + PCL_RCV_INTX); -} - static void uniphier_pcie_irq_mask(struct irq_data *d) { struct pcie_port *pp = irq_data_get_irq_chip_data(d); struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci); + unsigned long flags; u32 val; + raw_spin_lock_irqsave(&pp->lock, flags); + val = readl(priv->base + PCL_RCV_INTX); - val &= ~PCL_RCV_INTX_ALL_MASK; val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT); writel(val, priv->base + PCL_RCV_INTX); + + raw_spin_unlock_irqrestore(&pp->lock, flags); } static void uniphier_pcie_irq_unmask(struct irq_data *d) @@ -199,17 +190,20 @@ static void uniphier_pcie_irq_unmask(struct irq_data *d) struct pcie_port *pp = irq_data_get_irq_chip_data(d); struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci); + unsigned long flags; u32 val; + raw_spin_lock_irqsave(&pp->lock, flags); + val = readl(priv->base + PCL_RCV_INTX); - val &= ~PCL_RCV_INTX_ALL_MASK; val &= ~BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT); writel(val, priv->base + PCL_RCV_INTX); + + raw_spin_unlock_irqrestore(&pp->lock, flags); } static struct irq_chip uniphier_pcie_irq_chip = { .name = "PCI", - .irq_ack = uniphier_pcie_irq_ack, .irq_mask = uniphier_pcie_irq_mask, .irq_unmask = uniphier_pcie_irq_unmask, }; diff --git a/drivers/pci/controller/dwc/pcie-visconti.c b/drivers/pci/controller/dwc/pcie-visconti.c index a88eab6829bb..50f80f07e4db 100644 --- a/drivers/pci/controller/dwc/pcie-visconti.c +++ b/drivers/pci/controller/dwc/pcie-visconti.c @@ -279,13 +279,10 @@ static int visconti_add_pcie_port(struct visconti_pcie *pcie, { struct dw_pcie *pci = &pcie->pci; struct pcie_port *pp = &pci->pp; - struct device *dev = &pdev->dev; pp->irq = platform_get_irq_byname(pdev, "intr"); - if (pp->irq < 0) { - dev_err(dev, "Interrupt intr is missing"); + if (pp->irq < 0) return pp->irq; - } pp->ops = &visconti_pcie_host_ops; diff --git a/drivers/pci/controller/pci-aardvark.c b/drivers/pci/controller/pci-aardvark.c index 596ebcfcc82d..c5300d49807a 100644 --- a/drivers/pci/controller/pci-aardvark.c +++ b/drivers/pci/controller/pci-aardvark.c @@ -31,10 +31,8 @@ /* PCIe core registers */ #define PCIE_CORE_DEV_ID_REG 0x0 #define PCIE_CORE_CMD_STATUS_REG 0x4 -#define PCIE_CORE_CMD_IO_ACCESS_EN BIT(0) -#define PCIE_CORE_CMD_MEM_ACCESS_EN BIT(1) -#define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2) #define PCIE_CORE_DEV_REV_REG 0x8 +#define PCIE_CORE_EXP_ROM_BAR_REG 0x30 #define PCIE_CORE_PCIEXP_CAP 0xc0 #define PCIE_CORE_ERR_CAPCTL_REG 0x118 #define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5) @@ -99,6 +97,7 @@ #define PCIE_CORE_CTRL2_MSI_ENABLE BIT(10) #define PCIE_CORE_REF_CLK_REG (CONTROL_BASE_ADDR + 0x14) #define PCIE_CORE_REF_CLK_TX_ENABLE BIT(1) +#define PCIE_CORE_REF_CLK_RX_ENABLE BIT(2) #define PCIE_MSG_LOG_REG (CONTROL_BASE_ADDR + 0x30) #define PCIE_ISR0_REG (CONTROL_BASE_ADDR + 0x40) #define PCIE_MSG_PM_PME_MASK BIT(7) @@ -106,18 +105,19 @@ #define PCIE_ISR0_MSI_INT_PENDING BIT(24) #define PCIE_ISR0_INTX_ASSERT(val) BIT(16 + (val)) #define PCIE_ISR0_INTX_DEASSERT(val) BIT(20 + (val)) -#define PCIE_ISR0_ALL_MASK GENMASK(26, 0) +#define PCIE_ISR0_ALL_MASK GENMASK(31, 0) #define PCIE_ISR1_REG (CONTROL_BASE_ADDR + 0x48) #define PCIE_ISR1_MASK_REG (CONTROL_BASE_ADDR + 0x4C) #define PCIE_ISR1_POWER_STATE_CHANGE BIT(4) #define PCIE_ISR1_FLUSH BIT(5) #define PCIE_ISR1_INTX_ASSERT(val) BIT(8 + (val)) -#define PCIE_ISR1_ALL_MASK GENMASK(11, 4) +#define PCIE_ISR1_ALL_MASK GENMASK(31, 0) #define PCIE_MSI_ADDR_LOW_REG 
(CONTROL_BASE_ADDR + 0x50) #define PCIE_MSI_ADDR_HIGH_REG (CONTROL_BASE_ADDR + 0x54) #define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58) #define PCIE_MSI_MASK_REG (CONTROL_BASE_ADDR + 0x5C) #define PCIE_MSI_PAYLOAD_REG (CONTROL_BASE_ADDR + 0x9C) +#define PCIE_MSI_DATA_MASK GENMASK(15, 0) /* PCIe window configuration */ #define OB_WIN_BASE_ADDR 0x4c00 @@ -164,8 +164,50 @@ #define CFG_REG (LMI_BASE_ADDR + 0x0) #define LTSSM_SHIFT 24 #define LTSSM_MASK 0x3f -#define LTSSM_L0 0x10 #define RC_BAR_CONFIG 0x300 + +/* LTSSM values in CFG_REG */ +enum { + LTSSM_DETECT_QUIET = 0x0, + LTSSM_DETECT_ACTIVE = 0x1, + LTSSM_POLLING_ACTIVE = 0x2, + LTSSM_POLLING_COMPLIANCE = 0x3, + LTSSM_POLLING_CONFIGURATION = 0x4, + LTSSM_CONFIG_LINKWIDTH_START = 0x5, + LTSSM_CONFIG_LINKWIDTH_ACCEPT = 0x6, + LTSSM_CONFIG_LANENUM_ACCEPT = 0x7, + LTSSM_CONFIG_LANENUM_WAIT = 0x8, + LTSSM_CONFIG_COMPLETE = 0x9, + LTSSM_CONFIG_IDLE = 0xa, + LTSSM_RECOVERY_RCVR_LOCK = 0xb, + LTSSM_RECOVERY_SPEED = 0xc, + LTSSM_RECOVERY_RCVR_CFG = 0xd, + LTSSM_RECOVERY_IDLE = 0xe, + LTSSM_L0 = 0x10, + LTSSM_RX_L0S_ENTRY = 0x11, + LTSSM_RX_L0S_IDLE = 0x12, + LTSSM_RX_L0S_FTS = 0x13, + LTSSM_TX_L0S_ENTRY = 0x14, + LTSSM_TX_L0S_IDLE = 0x15, + LTSSM_TX_L0S_FTS = 0x16, + LTSSM_L1_ENTRY = 0x17, + LTSSM_L1_IDLE = 0x18, + LTSSM_L2_IDLE = 0x19, + LTSSM_L2_TRANSMIT_WAKE = 0x1a, + LTSSM_DISABLED = 0x20, + LTSSM_LOOPBACK_ENTRY_MASTER = 0x21, + LTSSM_LOOPBACK_ACTIVE_MASTER = 0x22, + LTSSM_LOOPBACK_EXIT_MASTER = 0x23, + LTSSM_LOOPBACK_ENTRY_SLAVE = 0x24, + LTSSM_LOOPBACK_ACTIVE_SLAVE = 0x25, + LTSSM_LOOPBACK_EXIT_SLAVE = 0x26, + LTSSM_HOT_RESET = 0x27, + LTSSM_RECOVERY_EQUALIZATION_PHASE0 = 0x28, + LTSSM_RECOVERY_EQUALIZATION_PHASE1 = 0x29, + LTSSM_RECOVERY_EQUALIZATION_PHASE2 = 0x2a, + LTSSM_RECOVERY_EQUALIZATION_PHASE3 = 0x2b, +}; + #define VENDOR_ID_REG (LMI_BASE_ADDR + 0x44) /* PCIe core controller registers */ @@ -198,7 +240,7 @@ #define PCIE_IRQ_MSI_INT2_DET BIT(21) #define PCIE_IRQ_RC_DBELL_DET BIT(22) #define PCIE_IRQ_EP_STATUS BIT(23) -#define PCIE_IRQ_ALL_MASK 0xfff0fb +#define PCIE_IRQ_ALL_MASK GENMASK(31, 0) #define PCIE_IRQ_ENABLE_INTS_MASK PCIE_IRQ_CORE_INT /* Transaction types */ @@ -257,18 +299,49 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg) return readl(pcie->base + reg); } -static inline u16 advk_read16(struct advk_pcie *pcie, u64 reg) +static u8 advk_pcie_ltssm_state(struct advk_pcie *pcie) { - return advk_readl(pcie, (reg & ~0x3)) >> ((reg & 0x3) * 8); -} - -static int advk_pcie_link_up(struct advk_pcie *pcie) -{ - u32 val, ltssm_state; + u32 val; + u8 ltssm_state; val = advk_readl(pcie, CFG_REG); ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK; - return ltssm_state >= LTSSM_L0; + return ltssm_state; +} + +static inline bool advk_pcie_link_up(struct advk_pcie *pcie) +{ + /* check if LTSSM is in normal operation - some L* state */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED; +} + +static inline bool advk_pcie_link_active(struct advk_pcie *pcie) +{ + /* + * According to PCIe Base specification 3.0, Table 4-14: Link + * Status Mapped to the LTSSM, and 4.2.6.3.6 Configuration.Idle + * is Link Up mapped to LTSSM Configuration.Idle, Recovery, L0, + * L0s, L1 and L2 states. And according to 3.2.1. Data Link + * Control and Management State Machine Rules is DL Up status + * reported in DL Active state. 
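+	 * In this driver that corresponds to the LTSSM codes from
+	 * LTSSM_CONFIG_IDLE up to (but not including) LTSSM_DISABLED,
+	 * which is exactly the range checked below.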
+ */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ltssm_state >= LTSSM_CONFIG_IDLE && ltssm_state < LTSSM_DISABLED; +} + +static inline bool advk_pcie_link_training(struct advk_pcie *pcie) +{ + /* + * According to PCIe Base specification 3.0, Table 4-14: Link + * Status Mapped to the LTSSM is Link Training mapped to LTSSM + * Configuration and Recovery states. + */ + u8 ltssm_state = advk_pcie_ltssm_state(pcie); + return ((ltssm_state >= LTSSM_CONFIG_LINKWIDTH_START && + ltssm_state < LTSSM_L0) || + (ltssm_state >= LTSSM_RECOVERY_EQUALIZATION_PHASE0 && + ltssm_state <= LTSSM_RECOVERY_EQUALIZATION_PHASE3)); } static int advk_pcie_wait_for_link(struct advk_pcie *pcie) @@ -291,7 +364,7 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) size_t retries; for (retries = 0; retries < RETRAIN_WAIT_MAX_RETRIES; ++retries) { - if (!advk_pcie_link_up(pcie)) + if (advk_pcie_link_training(pcie)) break; udelay(RETRAIN_WAIT_USLEEP_US); } @@ -299,23 +372,9 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie) static void advk_pcie_issue_perst(struct advk_pcie *pcie) { - u32 reg; - if (!pcie->reset_gpio) return; - /* - * As required by PCI Express spec (PCI Express Base Specification, REV. - * 4.0 PCI Express, February 19 2014, 6.6.1 Conventional Reset) a delay - * for at least 100ms after de-asserting PERST# signal is needed before - * link training is enabled. So ensure that link training is disabled - * prior de-asserting PERST# signal to fulfill that PCI Express spec - * requirement. - */ - reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); - reg &= ~LINK_TRAINING_EN; - advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* 10ms delay is needed for some cards */ dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n"); gpiod_set_value_cansleep(pcie->reset_gpio, 1); @@ -323,53 +382,46 @@ static void advk_pcie_issue_perst(struct advk_pcie *pcie) gpiod_set_value_cansleep(pcie->reset_gpio, 0); } -static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen) +static void advk_pcie_train_link(struct advk_pcie *pcie) { - int ret, neg_gen; + struct device *dev = &pcie->pdev->dev; u32 reg; + int ret; - /* Setup link speed */ + /* + * Setup PCIe rev / gen compliance based on device tree property + * 'max-link-speed' which also forces maximal link speed. + */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg &= ~PCIE_GEN_SEL_MSK; - if (gen == 3) + if (pcie->link_gen == 3) reg |= SPEED_GEN_3; - else if (gen == 2) + else if (pcie->link_gen == 2) reg |= SPEED_GEN_2; else reg |= SPEED_GEN_1; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); /* - * Enable link training. This is not needed in every call to this - * function, just once suffices, but it does not break anything either. + * Set maximal link speed value also into PCIe Link Control 2 register. + * Armada 3700 Functional Specification says that default value is based + * on SPEED_GEN but tests showed that default value is always 8.0 GT/s. */ + reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2); + reg &= ~PCI_EXP_LNKCTL2_TLS; + if (pcie->link_gen == 3) + reg |= PCI_EXP_LNKCTL2_TLS_8_0GT; + else if (pcie->link_gen == 2) + reg |= PCI_EXP_LNKCTL2_TLS_5_0GT; + else + reg |= PCI_EXP_LNKCTL2_TLS_2_5GT; + advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2); + + /* Enable link training after selecting PCIe generation */ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG); reg |= LINK_TRAINING_EN; advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG); - /* - * Start link training immediately after enabling it. 
- * This solves problems for some buggy cards. - */ - reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); - reg |= PCI_EXP_LNKCTL_RL; - advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL); - - ret = advk_pcie_wait_for_link(pcie); - if (ret) - return ret; - - reg = advk_read16(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKSTA); - neg_gen = reg & PCI_EXP_LNKSTA_CLS; - - return neg_gen; -} - -static void advk_pcie_train_link(struct advk_pcie *pcie) -{ - struct device *dev = &pcie->pdev->dev; - int neg_gen = -1, gen; - /* * Reset PCIe card via PERST# signal. Some cards are not detected * during link training when they are in some non-initial state. @@ -380,41 +432,18 @@ static void advk_pcie_train_link(struct advk_pcie *pcie) * PERST# signal could have been asserted by pinctrl subsystem before * probe() callback has been called or issued explicitly by reset gpio * function advk_pcie_issue_perst(), making the endpoint going into - * fundamental reset. As required by PCI Express spec a delay for at - * least 100ms after such a reset before link training is needed. + * fundamental reset. As required by PCI Express spec (PCI Express + * Base Specification, REV. 4.0 PCI Express, February 19 2014, 6.6.1 + * Conventional Reset) a delay for at least 100ms after such a reset + * before sending a Configuration Request to the device is needed. + * So wait until PCIe link is up. Function advk_pcie_wait_for_link() + * waits for link at least 900ms. */ - msleep(PCI_PM_D3COLD_WAIT); - - /* - * Try link training at link gen specified by device tree property - * 'max-link-speed'. If this fails, iteratively train at lower gen. - */ - for (gen = pcie->link_gen; gen > 0; --gen) { - neg_gen = advk_pcie_train_at_gen(pcie, gen); - if (neg_gen > 0) - break; - } - - if (neg_gen < 0) - goto err; - - /* - * After successful training if negotiated gen is lower than requested, - * train again on negotiated gen. This solves some stability issues for - * some buggy gen1 cards. - */ - if (neg_gen < gen) { - gen = neg_gen; - neg_gen = advk_pcie_train_at_gen(pcie, gen); - } - - if (neg_gen == gen) { - dev_info(dev, "link up at gen %i\n", gen); - return; - } - -err: - dev_err(dev, "link never came up\n"); + ret = advk_pcie_wait_for_link(pcie); + if (ret < 0) + dev_err(dev, "link never came up\n"); + else + dev_info(dev, "link up\n"); } /* @@ -451,9 +480,15 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) u32 reg; int i; - /* Enable TX */ + /* + * Configure PCIe Reference clock. Direction is from the PCIe + * controller to the endpoint card, so enable transmitting of + * Reference clock differential signal off-chip and disable + * receiving off-chip differential signal. + */ reg = advk_readl(pcie, PCIE_CORE_REF_CLK_REG); reg |= PCIE_CORE_REF_CLK_TX_ENABLE; + reg &= ~PCIE_CORE_REF_CLK_RX_ENABLE; advk_writel(pcie, reg, PCIE_CORE_REF_CLK_REG); /* Set to Direct mode */ @@ -477,6 +512,31 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = (PCI_VENDOR_ID_MARVELL << 16) | PCI_VENDOR_ID_MARVELL; advk_writel(pcie, reg, VENDOR_ID_REG); + /* + * Change Class Code of PCI Bridge device to PCI Bridge (0x600400), + * because the default value is Mass storage controller (0x010400). + * + * Note that this Aardvark PCI Bridge does not have compliant Type 1 + * Configuration Space and it even cannot be accessed via Aardvark's + * PCI config space access method. Something like config space is + * available in internal Aardvark registers starting at offset 0x0 + * and is reported as Type 0. 
In range 0x10 - 0x34 it has totally + * different registers. + * + * Therefore driver uses emulation of PCI Bridge which emulates + * access to configuration space via internal Aardvark registers or + * emulated configuration buffer. + */ + reg = advk_readl(pcie, PCIE_CORE_DEV_REV_REG); + reg &= ~0xffffff00; + reg |= (PCI_CLASS_BRIDGE_PCI << 8) << 8; + advk_writel(pcie, reg, PCIE_CORE_DEV_REV_REG); + + /* Disable Root Bridge I/O space, memory space and bus mastering */ + reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + reg &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); + /* Set Advanced Error Capabilities and Control PF0 register */ reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX | PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN | @@ -488,8 +548,9 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); reg &= ~PCI_EXP_DEVCTL_RELAX_EN; reg &= ~PCI_EXP_DEVCTL_NOSNOOP_EN; + reg &= ~PCI_EXP_DEVCTL_PAYLOAD; reg &= ~PCI_EXP_DEVCTL_READRQ; - reg |= PCI_EXP_DEVCTL_PAYLOAD; /* Set max payload size */ + reg |= PCI_EXP_DEVCTL_PAYLOAD_512B; reg |= PCI_EXP_DEVCTL_READRQ_512B; advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL); @@ -574,19 +635,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie) advk_pcie_disable_ob_win(pcie, i); advk_pcie_train_link(pcie); - - /* - * FIXME: The following register update is suspicious. This register is - * applicable only when the PCI controller is configured for Endpoint - * mode, not as a Root Complex. But apparently when this code is - * removed, some cards stop working. This should be investigated and - * a comment explaining this should be put here. - */ - reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); - reg |= PCIE_CORE_CMD_MEM_ACCESS_EN | - PCIE_CORE_CMD_IO_ACCESS_EN | - PCIE_CORE_CMD_MEM_IO_REQ_EN; - advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG); } static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u32 *val) @@ -595,6 +643,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 u32 reg; unsigned int status; char *strcomp_status, *str_posted; + int ret; reg = advk_readl(pcie, PIO_STAT); status = (reg & PIO_COMPLETION_STATUS_MASK) >> @@ -619,6 +668,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 case PIO_COMPLETION_STATUS_OK: if (reg & PIO_ERR_STATUS) { strcomp_status = "COMP_ERR"; + ret = -EFAULT; break; } /* Get the read result */ @@ -626,9 +676,11 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 *val = advk_readl(pcie, PIO_RD_DATA); /* No error */ strcomp_status = NULL; + ret = 0; break; case PIO_COMPLETION_STATUS_UR: strcomp_status = "UR"; + ret = -EOPNOTSUPP; break; case PIO_COMPLETION_STATUS_CRS: if (allow_crs && val) { @@ -646,6 +698,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 */ *val = CFG_RD_CRS_VAL; strcomp_status = NULL; + ret = 0; break; } /* PCIe r4.0, sec 2.3.2, says: @@ -661,31 +714,34 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3 * Request and taking appropriate action, e.g., complete the * Request to the host as a failed transaction. * - * To simplify implementation do not re-issue the Configuration - * Request and complete the Request as a failed transaction. + * So return -EAGAIN and caller (pci-aardvark.c driver) will + * re-issue request again up to the PIO_RETRY_CNT retries. 
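+	 * Note that advk_pcie_wait_pio() returns the number of attempts
+	 * it consumed on success, so the callers charge those attempts
+	 * against the same PIO_RETRY_CNT budget and a device that keeps
+	 * returning CRS cannot stall configuration accesses forever.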
*/ strcomp_status = "CRS"; + ret = -EAGAIN; break; case PIO_COMPLETION_STATUS_CA: strcomp_status = "CA"; + ret = -ECANCELED; break; default: strcomp_status = "Unknown"; + ret = -EINVAL; break; } if (!strcomp_status) - return 0; + return ret; if (reg & PIO_NON_POSTED_REQ) str_posted = "Non-posted"; else str_posted = "Posted"; - dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n", + dev_dbg(dev, "%s PIO Response Status: %s, %#x @ %#x\n", str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS)); - return -EFAULT; + return ret; } static int advk_pcie_wait_pio(struct advk_pcie *pcie) @@ -693,13 +749,13 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie) struct device *dev = &pcie->pdev->dev; int i; - for (i = 0; i < PIO_RETRY_CNT; i++) { + for (i = 1; i <= PIO_RETRY_CNT; i++) { u32 start, isr; start = advk_readl(pcie, PIO_START); isr = advk_readl(pcie, PIO_ISR); if (!start && isr) - return 0; + return i; udelay(PIO_RETRY_DELAY); } @@ -707,6 +763,72 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie) return -ETIMEDOUT; } +static pci_bridge_emul_read_status_t +advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge, + int reg, u32 *value) +{ + struct advk_pcie *pcie = bridge->data; + + switch (reg) { + case PCI_COMMAND: + *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG); + return PCI_BRIDGE_EMUL_HANDLED; + + case PCI_ROM_ADDRESS1: + *value = advk_readl(pcie, PCIE_CORE_EXP_ROM_BAR_REG); + return PCI_BRIDGE_EMUL_HANDLED; + + case PCI_INTERRUPT_LINE: { + /* + * From the whole 32bit register we support reading from HW only + * one bit: PCI_BRIDGE_CTL_BUS_RESET. + * Other bits are retrieved only from emulated config buffer. + */ + __le32 *cfgspace = (__le32 *)&bridge->conf; + u32 val = le32_to_cpu(cfgspace[PCI_INTERRUPT_LINE / 4]); + if (advk_readl(pcie, PCIE_CORE_CTRL1_REG) & HOT_RESET_GEN) + val |= PCI_BRIDGE_CTL_BUS_RESET << 16; + else + val &= ~(PCI_BRIDGE_CTL_BUS_RESET << 16); + *value = val; + return PCI_BRIDGE_EMUL_HANDLED; + } + + default: + return PCI_BRIDGE_EMUL_NOT_HANDLED; + } +} + +static void +advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge, + int reg, u32 old, u32 new, u32 mask) +{ + struct advk_pcie *pcie = bridge->data; + + switch (reg) { + case PCI_COMMAND: + advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG); + break; + + case PCI_ROM_ADDRESS1: + advk_writel(pcie, new, PCIE_CORE_EXP_ROM_BAR_REG); + break; + + case PCI_INTERRUPT_LINE: + if (mask & (PCI_BRIDGE_CTL_BUS_RESET << 16)) { + u32 val = advk_readl(pcie, PCIE_CORE_CTRL1_REG); + if (new & (PCI_BRIDGE_CTL_BUS_RESET << 16)) + val |= HOT_RESET_GEN; + else + val &= ~HOT_RESET_GEN; + advk_writel(pcie, val, PCIE_CORE_CTRL1_REG); + } + break; + + default: + break; + } +} static pci_bridge_emul_read_status_t advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, @@ -723,6 +845,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_EXP_RTCTL: { u32 val = advk_readl(pcie, PCIE_ISR0_MASK_REG); *value = (val & PCIE_MSG_PM_PME_MASK) ? 0 : PCI_EXP_RTCTL_PMEIE; + *value |= le16_to_cpu(bridge->pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE; *value |= PCI_EXP_RTCAP_CRSVIS << 16; return PCI_BRIDGE_EMUL_HANDLED; } @@ -734,12 +857,26 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, return PCI_BRIDGE_EMUL_HANDLED; } + case PCI_EXP_LNKCAP: { + u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg); + /* + * PCI_EXP_LNKCAP_DLLLARC bit is hardwired in aardvark HW to 0. 
+ * But support for PCI_EXP_LNKSTA_DLLLA is emulated via ltssm + * state so explicitly enable PCI_EXP_LNKCAP_DLLLARC flag. + */ + val |= PCI_EXP_LNKCAP_DLLLARC; + *value = val; + return PCI_BRIDGE_EMUL_HANDLED; + } + case PCI_EXP_LNKCTL: { /* u32 contains both PCI_EXP_LNKCTL and PCI_EXP_LNKSTA */ u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg) & ~(PCI_EXP_LNKSTA_LT << 16); - if (!advk_pcie_link_up(pcie)) + if (advk_pcie_link_training(pcie)) val |= (PCI_EXP_LNKSTA_LT << 16); + if (advk_pcie_link_active(pcie)) + val |= (PCI_EXP_LNKSTA_DLLLA << 16); *value = val; return PCI_BRIDGE_EMUL_HANDLED; } @@ -747,7 +884,6 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge, case PCI_CAP_LIST_ID: case PCI_EXP_DEVCAP: case PCI_EXP_DEVCTL: - case PCI_EXP_LNKCAP: *value = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg); return PCI_BRIDGE_EMUL_HANDLED; default: @@ -794,6 +930,8 @@ advk_pci_bridge_emul_pcie_conf_write(struct pci_bridge_emul *bridge, } static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { + .read_base = advk_pci_bridge_emul_base_conf_read, + .write_base = advk_pci_bridge_emul_base_conf_write, .read_pcie = advk_pci_bridge_emul_pcie_conf_read, .write_pcie = advk_pci_bridge_emul_pcie_conf_write, }; @@ -805,7 +943,6 @@ static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = { static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) { struct pci_bridge_emul *bridge = &pcie->bridge; - int ret; bridge->conf.vendor = cpu_to_le16(advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff); @@ -825,19 +962,14 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie) /* Support interrupt A for MSI feature */ bridge->conf.intpin = PCIE_CORE_INT_A_ASSERT_ENABLE; + /* Indicates supports for Completion Retry Status */ + bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); + bridge->has_pcie = true; bridge->data = pcie; bridge->ops = &advk_pci_bridge_emul_ops; - /* PCIe config space can be initialized after pci_bridge_emul_init() */ - ret = pci_bridge_emul_init(bridge, 0); - if (ret < 0) - return ret; - - /* Indicates supports for Completion Retry Status */ - bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS); - - return 0; + return pci_bridge_emul_init(bridge, 0); } static bool advk_pcie_valid_device(struct advk_pcie *pcie, struct pci_bus *bus, @@ -889,6 +1021,7 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, int where, int size, u32 *val) { struct advk_pcie *pcie = bus->sysdata; + int retry_count; bool allow_crs; u32 reg; int ret; @@ -911,18 +1044,8 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, (le16_to_cpu(pcie->bridge.pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE); - if (advk_pcie_pio_is_running(pcie)) { - /* - * If it is possible return Completion Retry Status so caller - * tries to issue the request again instead of failing. 
- */ - if (allow_crs) { - *val = CFG_RD_CRS_VAL; - return PCIBIOS_SUCCESSFUL; - } - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + if (advk_pcie_pio_is_running(pcie)) + goto try_crs; /* Program the control register */ reg = advk_readl(pcie, PIO_CTRL); @@ -941,30 +1064,24 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, /* Program the data strobe */ advk_writel(pcie, 0xf, PIO_WR_DATA_STRB); - /* Clear PIO DONE ISR and start the transfer */ - advk_writel(pcie, 1, PIO_ISR); - advk_writel(pcie, 1, PIO_START); + retry_count = 0; + do { + /* Clear PIO DONE ISR and start the transfer */ + advk_writel(pcie, 1, PIO_ISR); + advk_writel(pcie, 1, PIO_START); - ret = advk_pcie_wait_pio(pcie); - if (ret < 0) { - /* - * If it is possible return Completion Retry Status so caller - * tries to issue the request again instead of failing. - */ - if (allow_crs) { - *val = CFG_RD_CRS_VAL; - return PCIBIOS_SUCCESSFUL; - } - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + goto try_crs; - /* Check PIO status and get the read result */ - ret = advk_pcie_check_pio_status(pcie, allow_crs, val); - if (ret < 0) { - *val = 0xffffffff; - return PCIBIOS_SET_FAILED; - } + retry_count += ret; + + /* Check PIO status and get the read result */ + ret = advk_pcie_check_pio_status(pcie, allow_crs, val); + } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT); + + if (ret < 0) + goto fail; if (size == 1) *val = (*val >> (8 * (where & 3))) & 0xff; @@ -972,6 +1089,20 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn, *val = (*val >> (8 * (where & 3))) & 0xffff; return PCIBIOS_SUCCESSFUL; + +try_crs: + /* + * If it is possible, return Completion Retry Status so that caller + * tries to issue the request again instead of failing. + */ + if (allow_crs) { + *val = CFG_RD_CRS_VAL; + return PCIBIOS_SUCCESSFUL; + } + +fail: + *val = 0xffffffff; + return PCIBIOS_SET_FAILED; } static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, @@ -980,6 +1111,7 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, struct advk_pcie *pcie = bus->sysdata; u32 reg; u32 data_strobe = 0x0; + int retry_count; int offset; int ret; @@ -1021,19 +1153,22 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn, /* Program the data strobe */ advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB); - /* Clear PIO DONE ISR and start the transfer */ - advk_writel(pcie, 1, PIO_ISR); - advk_writel(pcie, 1, PIO_START); + retry_count = 0; + do { + /* Clear PIO DONE ISR and start the transfer */ + advk_writel(pcie, 1, PIO_ISR); + advk_writel(pcie, 1, PIO_START); - ret = advk_pcie_wait_pio(pcie); - if (ret < 0) - return PCIBIOS_SET_FAILED; + ret = advk_pcie_wait_pio(pcie); + if (ret < 0) + return PCIBIOS_SET_FAILED; - ret = advk_pcie_check_pio_status(pcie, false, NULL); - if (ret < 0) - return PCIBIOS_SET_FAILED; + retry_count += ret; - return PCIBIOS_SUCCESSFUL; + ret = advk_pcie_check_pio_status(pcie, false, NULL); + } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT); + + return ret < 0 ? 
PCIBIOS_SET_FAILED : PCIBIOS_SUCCESSFUL; } static struct pci_ops advk_pcie_ops = { @@ -1082,7 +1217,7 @@ static int advk_msi_irq_domain_alloc(struct irq_domain *domain, domain->host_data, handle_simple_irq, NULL, NULL); - return hwirq; + return 0; } static void advk_msi_irq_domain_free(struct irq_domain *domain, @@ -1263,8 +1398,12 @@ static void advk_pcie_handle_msi(struct advk_pcie *pcie) if (!(BIT(msi_idx) & msi_status)) continue; + /* + * msi_idx contains bits [4:0] of the msi_data and msi_data + * contains 16bit MSI interrupt number + */ advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG); - msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF; + msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & PCIE_MSI_DATA_MASK; generic_handle_irq(msi_data); } @@ -1286,12 +1425,6 @@ static void advk_pcie_handle_int(struct advk_pcie *pcie) isr1_mask = advk_readl(pcie, PCIE_ISR1_MASK_REG); isr1_status = isr1_val & ((~isr1_mask) & PCIE_ISR1_ALL_MASK); - if (!isr0_status && !isr1_status) { - advk_writel(pcie, isr0_val, PCIE_ISR0_REG); - advk_writel(pcie, isr1_val, PCIE_ISR1_REG); - return; - } - /* Process MSI interrupts */ if (isr0_status & PCIE_ISR0_MSI_INT_PENDING) advk_pcie_handle_msi(pcie); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 67c46e52c0dc..6733cb14e775 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3126,14 +3126,14 @@ static int hv_pci_probe(struct hv_device *hdev, if (dom == HVPCI_DOM_INVALID) { dev_err(&hdev->device, - "Unable to use dom# 0x%hx or other numbers", dom_req); + "Unable to use dom# 0x%x or other numbers", dom_req); ret = -EINVAL; goto free_bus; } if (dom != dom_req) dev_info(&hdev->device, - "PCI dom# 0x%hx has collision, using 0x%hx", + "PCI dom# 0x%x has collision, using 0x%x", dom_req, dom); hbus->bridge->domain_nr = dom; diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index ffd84656544f..e9d5ca245f5e 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -17,7 +17,7 @@ static void set_val(u32 v, int where, int size, u32 *val) { int shift = (where & 3) * 8; - pr_debug("set_val %04x: %08x\n", (unsigned)(where & ~3), v); + pr_debug("set_val %04x: %08x\n", (unsigned int)(where & ~3), v); v >>= shift; if (size == 1) v &= 0xff; @@ -187,7 +187,7 @@ static int thunder_ecam_config_read(struct pci_bus *bus, unsigned int devfn, pr_debug("%04x:%04x - Fix pass#: %08x, where: %03x, devfn: %03x\n", vendor_device & 0xffff, vendor_device >> 16, class_rev, - (unsigned) where, devfn); + (unsigned int)where, devfn); /* Check for non type-00 header */ if (cfg_type == 0) { diff --git a/drivers/pci/controller/pci-xgene-msi.c b/drivers/pci/controller/pci-xgene-msi.c index b7a8e062fcc5..c50ff279903c 100644 --- a/drivers/pci/controller/pci-xgene-msi.c +++ b/drivers/pci/controller/pci-xgene-msi.c @@ -302,7 +302,7 @@ static void xgene_msi_isr(struct irq_desc *desc) /* * MSIINTn (n is 0..F) indicates if there is a pending MSI interrupt - * If bit x of this register is set (x is 0..7), one or more interupts + * If bit x of this register is set (x is 0..7), one or more interrupts * corresponding to MSInIRx is set. 
*/ grp_select = xgene_msi_int_read(xgene_msi, msi_grp); diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c index e64536047b65..56d0d50338c8 100644 --- a/drivers/pci/controller/pci-xgene.c +++ b/drivers/pci/controller/pci-xgene.c @@ -48,7 +48,6 @@ #define EN_COHERENCY 0xF0000000 #define EN_REG 0x00000001 #define OB_LO_IO 0x00000002 -#define XGENE_PCIE_VENDORID 0x10E8 #define XGENE_PCIE_DEVICEID 0xE004 #define SZ_1T (SZ_1G*1024ULL) #define PIPE_PHY_RATE_RD(src) ((0xc000 & (u32)(src)) >> 0xe) @@ -560,7 +559,7 @@ static int xgene_pcie_setup(struct xgene_pcie_port *port) xgene_pcie_clear_config(port); /* setup the vendor and device IDs correctly */ - val = (XGENE_PCIE_DEVICEID << 16) | XGENE_PCIE_VENDORID; + val = (XGENE_PCIE_DEVICEID << 16) | PCI_VENDOR_ID_AMCC; xgene_pcie_writel(port, BRIDGE_CFG_0, val); ret = xgene_pcie_map_ranges(port); diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c new file mode 100644 index 000000000000..1bf4d75b61be --- /dev/null +++ b/drivers/pci/controller/pcie-apple.c @@ -0,0 +1,824 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCIe host bridge driver for Apple system-on-chips. + * + * The HW is ECAM compliant, so once the controller is initialized, + * the driver mostly deals MSI mapping and handling of per-port + * interrupts (INTx, management and error signals). + * + * Initialization requires enabling power and clocks, along with a + * number of register pokes. + * + * Copyright (C) 2021 Alyssa Rosenzweig + * Copyright (C) 2021 Google LLC + * Copyright (C) 2021 Corellium LLC + * Copyright (C) 2021 Mark Kettenis + * + * Author: Alyssa Rosenzweig + * Author: Marc Zyngier + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CORE_RC_PHYIF_CTL 0x00024 +#define CORE_RC_PHYIF_CTL_RUN BIT(0) +#define CORE_RC_PHYIF_STAT 0x00028 +#define CORE_RC_PHYIF_STAT_REFCLK BIT(4) +#define CORE_RC_CTL 0x00050 +#define CORE_RC_CTL_RUN BIT(0) +#define CORE_RC_STAT 0x00058 +#define CORE_RC_STAT_READY BIT(0) +#define CORE_FABRIC_STAT 0x04000 +#define CORE_FABRIC_STAT_MASK 0x001F001F +#define CORE_LANE_CFG(port) (0x84000 + 0x4000 * (port)) +#define CORE_LANE_CFG_REFCLK0REQ BIT(0) +#define CORE_LANE_CFG_REFCLK1 BIT(1) +#define CORE_LANE_CFG_REFCLK0ACK BIT(2) +#define CORE_LANE_CFG_REFCLKEN (BIT(9) | BIT(10)) +#define CORE_LANE_CTL(port) (0x84004 + 0x4000 * (port)) +#define CORE_LANE_CTL_CFGACC BIT(15) + +#define PORT_LTSSMCTL 0x00080 +#define PORT_LTSSMCTL_START BIT(0) +#define PORT_INTSTAT 0x00100 +#define PORT_INT_TUNNEL_ERR 31 +#define PORT_INT_CPL_TIMEOUT 23 +#define PORT_INT_RID2SID_MAPERR 22 +#define PORT_INT_CPL_ABORT 21 +#define PORT_INT_MSI_BAD_DATA 19 +#define PORT_INT_MSI_ERR 18 +#define PORT_INT_REQADDR_GT32 17 +#define PORT_INT_AF_TIMEOUT 15 +#define PORT_INT_LINK_DOWN 14 +#define PORT_INT_LINK_UP 12 +#define PORT_INT_LINK_BWMGMT 11 +#define PORT_INT_AER_MASK (15 << 4) +#define PORT_INT_PORT_ERR 4 +#define PORT_INT_INTx(i) i +#define PORT_INT_INTx_MASK 15 +#define PORT_INTMSK 0x00104 +#define PORT_INTMSKSET 0x00108 +#define PORT_INTMSKCLR 0x0010c +#define PORT_MSICFG 0x00124 +#define PORT_MSICFG_EN BIT(0) +#define PORT_MSICFG_L2MSINUM_SHIFT 4 +#define PORT_MSIBASE 0x00128 +#define PORT_MSIBASE_1_SHIFT 16 +#define PORT_MSIADDR 0x00168 +#define PORT_LINKSTS 0x00208 +#define PORT_LINKSTS_UP BIT(0) +#define PORT_LINKSTS_BUSY BIT(2) +#define PORT_LINKCMDSTS 0x00210 +#define PORT_OUTS_NPREQS 0x00284 +#define PORT_OUTS_NPREQS_REQ 
BIT(24) +#define PORT_OUTS_NPREQS_CPL BIT(16) +#define PORT_RXWR_FIFO 0x00288 +#define PORT_RXWR_FIFO_HDR GENMASK(15, 10) +#define PORT_RXWR_FIFO_DATA GENMASK(9, 0) +#define PORT_RXRD_FIFO 0x0028C +#define PORT_RXRD_FIFO_REQ GENMASK(6, 0) +#define PORT_OUTS_CPLS 0x00290 +#define PORT_OUTS_CPLS_SHRD GENMASK(14, 8) +#define PORT_OUTS_CPLS_WAIT GENMASK(6, 0) +#define PORT_APPCLK 0x00800 +#define PORT_APPCLK_EN BIT(0) +#define PORT_APPCLK_CGDIS BIT(8) +#define PORT_STATUS 0x00804 +#define PORT_STATUS_READY BIT(0) +#define PORT_REFCLK 0x00810 +#define PORT_REFCLK_EN BIT(0) +#define PORT_REFCLK_CGDIS BIT(8) +#define PORT_PERST 0x00814 +#define PORT_PERST_OFF BIT(0) +#define PORT_RID2SID(i16) (0x00828 + 4 * (i16)) +#define PORT_RID2SID_VALID BIT(31) +#define PORT_RID2SID_SID_SHIFT 16 +#define PORT_RID2SID_BUS_SHIFT 8 +#define PORT_RID2SID_DEV_SHIFT 3 +#define PORT_RID2SID_FUNC_SHIFT 0 +#define PORT_OUTS_PREQS_HDR 0x00980 +#define PORT_OUTS_PREQS_HDR_MASK GENMASK(9, 0) +#define PORT_OUTS_PREQS_DATA 0x00984 +#define PORT_OUTS_PREQS_DATA_MASK GENMASK(15, 0) +#define PORT_TUNCTRL 0x00988 +#define PORT_TUNCTRL_PERST_ON BIT(0) +#define PORT_TUNCTRL_PERST_ACK_REQ BIT(1) +#define PORT_TUNSTAT 0x0098c +#define PORT_TUNSTAT_PERST_ON BIT(0) +#define PORT_TUNSTAT_PERST_ACK_PEND BIT(1) +#define PORT_PREFMEM_ENABLE 0x00994 + +#define MAX_RID2SID 64 + +/* + * The doorbell address is set to 0xfffff000, which by convention + * matches what MacOS does, and it is possible to use any other + * address (in the bottom 4GB, as the base register is only 32bit). + * However, it has to be excluded from the IOVA range, and the DART + * driver has to know about it. + */ +#define DOORBELL_ADDR CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR + +struct apple_pcie { + struct mutex lock; + struct device *dev; + void __iomem *base; + struct irq_domain *domain; + unsigned long *bitmap; + struct list_head ports; + struct completion event; + struct irq_fwspec fwspec; + u32 nvecs; +}; + +struct apple_pcie_port { + struct apple_pcie *pcie; + struct device_node *np; + void __iomem *base; + struct irq_domain *domain; + struct list_head entry; + DECLARE_BITMAP(sid_map, MAX_RID2SID); + int sid_map_sz; + int idx; +}; + +static void rmw_set(u32 set, void __iomem *addr) +{ + writel_relaxed(readl_relaxed(addr) | set, addr); +} + +static void rmw_clear(u32 clr, void __iomem *addr) +{ + writel_relaxed(readl_relaxed(addr) & ~clr, addr); +} + +static void apple_msi_top_irq_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void apple_msi_top_irq_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip apple_msi_top_chip = { + .name = "PCIe MSI", + .irq_mask = apple_msi_top_irq_mask, + .irq_unmask = apple_msi_top_irq_unmask, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_set_type = irq_chip_set_type_parent, +}; + +static void apple_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + msg->address_hi = upper_32_bits(DOORBELL_ADDR); + msg->address_lo = lower_32_bits(DOORBELL_ADDR); + msg->data = data->hwirq; +} + +static struct irq_chip apple_msi_bottom_chip = { + .name = "MSI", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_set_type = irq_chip_set_type_parent, + .irq_compose_msi_msg = apple_msi_compose_msg, +}; + +static int apple_msi_domain_alloc(struct irq_domain *domain, unsigned int 
virq, + unsigned int nr_irqs, void *args) +{ + struct apple_pcie *pcie = domain->host_data; + struct irq_fwspec fwspec = pcie->fwspec; + unsigned int i; + int ret, hwirq; + + mutex_lock(&pcie->lock); + + hwirq = bitmap_find_free_region(pcie->bitmap, pcie->nvecs, + order_base_2(nr_irqs)); + + mutex_unlock(&pcie->lock); + + if (hwirq < 0) + return -ENOSPC; + + fwspec.param[1] += hwirq; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, &fwspec); + if (ret) + return ret; + + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &apple_msi_bottom_chip, + domain->host_data); + } + + return 0; +} + +static void apple_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct apple_pcie *pcie = domain->host_data; + + mutex_lock(&pcie->lock); + + bitmap_release_region(pcie->bitmap, d->hwirq, order_base_2(nr_irqs)); + + mutex_unlock(&pcie->lock); +} + +static const struct irq_domain_ops apple_msi_domain_ops = { + .alloc = apple_msi_domain_alloc, + .free = apple_msi_domain_free, +}; + +static struct msi_domain_info apple_msi_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .chip = &apple_msi_top_chip, +}; + +static void apple_port_irq_mask(struct irq_data *data) +{ + struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); + + writel_relaxed(BIT(data->hwirq), port->base + PORT_INTMSKSET); +} + +static void apple_port_irq_unmask(struct irq_data *data) +{ + struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); + + writel_relaxed(BIT(data->hwirq), port->base + PORT_INTMSKCLR); +} + +static bool hwirq_is_intx(unsigned int hwirq) +{ + return BIT(hwirq) & PORT_INT_INTx_MASK; +} + +static void apple_port_irq_ack(struct irq_data *data) +{ + struct apple_pcie_port *port = irq_data_get_irq_chip_data(data); + + if (!hwirq_is_intx(data->hwirq)) + writel_relaxed(BIT(data->hwirq), port->base + PORT_INTSTAT); +} + +static int apple_port_irq_set_type(struct irq_data *data, unsigned int type) +{ + /* + * It doesn't seem that there is any way to configure the + * trigger, so assume INTx have to be level (as per the spec), + * and the rest is edge (which looks likely). 
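+	 * In other words, reject an edge trigger for an INTx hwirq and
+	 * a level trigger for any other hwirq; the XOR check below
+	 * encodes exactly that.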
+ */ + if (hwirq_is_intx(data->hwirq) ^ !!(type & IRQ_TYPE_LEVEL_MASK)) + return -EINVAL; + + irqd_set_trigger_type(data, type); + return 0; +} + +static struct irq_chip apple_port_irqchip = { + .name = "PCIe", + .irq_ack = apple_port_irq_ack, + .irq_mask = apple_port_irq_mask, + .irq_unmask = apple_port_irq_unmask, + .irq_set_type = apple_port_irq_set_type, +}; + +static int apple_port_irq_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *args) +{ + struct apple_pcie_port *port = domain->host_data; + struct irq_fwspec *fwspec = args; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_flow_handler_t flow = handle_edge_irq; + unsigned int type = IRQ_TYPE_EDGE_RISING; + + if (hwirq_is_intx(fwspec->param[0] + i)) { + flow = handle_level_irq; + type = IRQ_TYPE_LEVEL_HIGH; + } + + irq_domain_set_info(domain, virq + i, fwspec->param[0] + i, + &apple_port_irqchip, port, flow, + NULL, NULL); + + irq_set_irq_type(virq + i, type); + } + + return 0; +} + +static void apple_port_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + struct irq_data *d = irq_domain_get_irq_data(domain, virq + i); + + irq_set_handler(virq + i, NULL); + irq_domain_reset_irq_data(d); + } +} + +static const struct irq_domain_ops apple_port_irq_domain_ops = { + .translate = irq_domain_translate_onecell, + .alloc = apple_port_irq_domain_alloc, + .free = apple_port_irq_domain_free, +}; + +static void apple_port_irq_handler(struct irq_desc *desc) +{ + struct apple_pcie_port *port = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned long stat; + int i; + + chained_irq_enter(chip, desc); + + stat = readl_relaxed(port->base + PORT_INTSTAT); + + for_each_set_bit(i, &stat, 32) + generic_handle_domain_irq(port->domain, i); + + chained_irq_exit(chip, desc); +} + +static int apple_pcie_port_setup_irq(struct apple_pcie_port *port) +{ + struct fwnode_handle *fwnode = &port->np->fwnode; + unsigned int irq; + + /* FIXME: consider moving each interrupt under each port */ + irq = irq_of_parse_and_map(to_of_node(dev_fwnode(port->pcie->dev)), + port->idx); + if (!irq) + return -ENXIO; + + port->domain = irq_domain_create_linear(fwnode, 32, + &apple_port_irq_domain_ops, + port); + if (!port->domain) + return -ENOMEM; + + /* Disable all interrupts */ + writel_relaxed(~0, port->base + PORT_INTMSKSET); + writel_relaxed(~0, port->base + PORT_INTSTAT); + + irq_set_chained_handler_and_data(irq, apple_port_irq_handler, port); + + /* Configure MSI base address */ + BUILD_BUG_ON(upper_32_bits(DOORBELL_ADDR)); + writel_relaxed(lower_32_bits(DOORBELL_ADDR), port->base + PORT_MSIADDR); + + /* Enable MSIs, shared between all ports */ + writel_relaxed(0, port->base + PORT_MSIBASE); + writel_relaxed((ilog2(port->pcie->nvecs) << PORT_MSICFG_L2MSINUM_SHIFT) | + PORT_MSICFG_EN, port->base + PORT_MSICFG); + + return 0; +} + +static irqreturn_t apple_pcie_port_irq(int irq, void *data) +{ + struct apple_pcie_port *port = data; + unsigned int hwirq = irq_domain_get_irq_data(port->domain, irq)->hwirq; + + switch (hwirq) { + case PORT_INT_LINK_UP: + dev_info_ratelimited(port->pcie->dev, "Link up on %pOF\n", + port->np); + complete_all(&port->pcie->event); + break; + case PORT_INT_LINK_DOWN: + dev_info_ratelimited(port->pcie->dev, "Link down on %pOF\n", + port->np); + break; + default: + return IRQ_NONE; + } + + return IRQ_HANDLED; +} + +static int apple_pcie_port_register_irqs(struct apple_pcie_port 
*port) +{ + static struct { + unsigned int hwirq; + const char *name; + } port_irqs[] = { + { PORT_INT_LINK_UP, "Link up", }, + { PORT_INT_LINK_DOWN, "Link down", }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(port_irqs); i++) { + struct irq_fwspec fwspec = { + .fwnode = &port->np->fwnode, + .param_count = 1, + .param = { + [0] = port_irqs[i].hwirq, + }, + }; + unsigned int irq; + int ret; + + irq = irq_domain_alloc_irqs(port->domain, 1, NUMA_NO_NODE, + &fwspec); + if (WARN_ON(!irq)) + continue; + + ret = request_irq(irq, apple_pcie_port_irq, 0, + port_irqs[i].name, port); + WARN_ON(ret); + } + + return 0; +} + +static int apple_pcie_setup_refclk(struct apple_pcie *pcie, + struct apple_pcie_port *port) +{ + u32 stat; + int res; + + res = readl_relaxed_poll_timeout(pcie->base + CORE_RC_PHYIF_STAT, stat, + stat & CORE_RC_PHYIF_STAT_REFCLK, + 100, 50000); + if (res < 0) + return res; + + rmw_set(CORE_LANE_CTL_CFGACC, pcie->base + CORE_LANE_CTL(port->idx)); + rmw_set(CORE_LANE_CFG_REFCLK0REQ, pcie->base + CORE_LANE_CFG(port->idx)); + + res = readl_relaxed_poll_timeout(pcie->base + CORE_LANE_CFG(port->idx), + stat, stat & CORE_LANE_CFG_REFCLK0ACK, + 100, 50000); + if (res < 0) + return res; + + rmw_set(CORE_LANE_CFG_REFCLK1, pcie->base + CORE_LANE_CFG(port->idx)); + res = readl_relaxed_poll_timeout(pcie->base + CORE_LANE_CFG(port->idx), + stat, stat & CORE_LANE_CFG_REFCLK1, + 100, 50000); + + if (res < 0) + return res; + + rmw_clear(CORE_LANE_CTL_CFGACC, pcie->base + CORE_LANE_CTL(port->idx)); + + rmw_set(CORE_LANE_CFG_REFCLKEN, pcie->base + CORE_LANE_CFG(port->idx)); + rmw_set(PORT_REFCLK_EN, port->base + PORT_REFCLK); + + return 0; +} + +static u32 apple_pcie_rid2sid_write(struct apple_pcie_port *port, + int idx, u32 val) +{ + writel_relaxed(val, port->base + PORT_RID2SID(idx)); + /* Read back to ensure completion of the write */ + return readl_relaxed(port->base + PORT_RID2SID(idx)); +} + +static int apple_pcie_setup_port(struct apple_pcie *pcie, + struct device_node *np) +{ + struct platform_device *platform = to_platform_device(pcie->dev); + struct apple_pcie_port *port; + struct gpio_desc *reset; + u32 stat, idx; + int ret, i; + + reset = gpiod_get_from_of_node(np, "reset-gpios", 0, + GPIOD_OUT_LOW, "#PERST"); + if (IS_ERR(reset)) + return PTR_ERR(reset); + + port = devm_kzalloc(pcie->dev, sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + ret = of_property_read_u32_index(np, "reg", 0, &idx); + if (ret) + return ret; + + /* Use the first reg entry to work out the port index */ + port->idx = idx >> 11; + port->pcie = pcie; + port->np = np; + + port->base = devm_platform_ioremap_resource(platform, port->idx + 2); + if (IS_ERR(port->base)) + return PTR_ERR(port->base); + + rmw_set(PORT_APPCLK_EN, port->base + PORT_APPCLK); + + ret = apple_pcie_setup_refclk(pcie, port); + if (ret < 0) + return ret; + + rmw_set(PORT_PERST_OFF, port->base + PORT_PERST); + gpiod_set_value(reset, 1); + + ret = readl_relaxed_poll_timeout(port->base + PORT_STATUS, stat, + stat & PORT_STATUS_READY, 100, 250000); + if (ret < 0) { + dev_err(pcie->dev, "port %pOF ready wait timeout\n", np); + return ret; + } + + ret = apple_pcie_port_setup_irq(port); + if (ret) + return ret; + + /* Reset all RID/SID mappings, and check for RAZ/WI registers */ + for (i = 0; i < MAX_RID2SID; i++) { + if (apple_pcie_rid2sid_write(port, i, 0xbad1d) != 0xbad1d) + break; + apple_pcie_rid2sid_write(port, i, 0); + } + + dev_dbg(pcie->dev, "%pOF: %d RID/SID mapping entries\n", np, i); + + port->sid_map_sz = i; + + 
list_add_tail(&port->entry, &pcie->ports); + init_completion(&pcie->event); + + ret = apple_pcie_port_register_irqs(port); + WARN_ON(ret); + + writel_relaxed(PORT_LTSSMCTL_START, port->base + PORT_LTSSMCTL); + + if (!wait_for_completion_timeout(&pcie->event, HZ / 10)) + dev_warn(pcie->dev, "%pOF link didn't come up\n", np); + + return 0; +} + +static int apple_msi_init(struct apple_pcie *pcie) +{ + struct fwnode_handle *fwnode = dev_fwnode(pcie->dev); + struct of_phandle_args args = {}; + struct irq_domain *parent; + int ret; + + ret = of_parse_phandle_with_args(to_of_node(fwnode), "msi-ranges", + "#interrupt-cells", 0, &args); + if (ret) + return ret; + + ret = of_property_read_u32_index(to_of_node(fwnode), "msi-ranges", + args.args_count + 1, &pcie->nvecs); + if (ret) + return ret; + + of_phandle_args_to_fwspec(args.np, args.args, args.args_count, + &pcie->fwspec); + + pcie->bitmap = devm_bitmap_zalloc(pcie->dev, pcie->nvecs, GFP_KERNEL); + if (!pcie->bitmap) + return -ENOMEM; + + parent = irq_find_matching_fwspec(&pcie->fwspec, DOMAIN_BUS_WIRED); + if (!parent) { + dev_err(pcie->dev, "failed to find parent domain\n"); + return -ENXIO; + } + + parent = irq_domain_create_hierarchy(parent, 0, pcie->nvecs, fwnode, + &apple_msi_domain_ops, pcie); + if (!parent) { + dev_err(pcie->dev, "failed to create IRQ domain\n"); + return -ENOMEM; + } + irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS); + + pcie->domain = pci_msi_create_irq_domain(fwnode, &apple_msi_info, + parent); + if (!pcie->domain) { + dev_err(pcie->dev, "failed to create MSI domain\n"); + irq_domain_remove(parent); + return -ENOMEM; + } + + return 0; +} + +static struct apple_pcie_port *apple_pcie_get_port(struct pci_dev *pdev) +{ + struct pci_config_window *cfg = pdev->sysdata; + struct apple_pcie *pcie = cfg->priv; + struct pci_dev *port_pdev; + struct apple_pcie_port *port; + + /* Find the root port this device is on */ + port_pdev = pcie_find_root_port(pdev); + + /* If finding the port itself, nothing to do */ + if (WARN_ON(!port_pdev) || pdev == port_pdev) + return NULL; + + list_for_each_entry(port, &pcie->ports, entry) { + if (port->idx == PCI_SLOT(port_pdev->devfn)) + return port; + } + + return NULL; +} + +static int apple_pcie_add_device(struct apple_pcie_port *port, + struct pci_dev *pdev) +{ + u32 sid, rid = PCI_DEVID(pdev->bus->number, pdev->devfn); + int idx, err; + + dev_dbg(&pdev->dev, "added to bus %s, index %d\n", + pci_name(pdev->bus->self), port->idx); + + err = of_map_id(port->pcie->dev->of_node, rid, "iommu-map", + "iommu-map-mask", NULL, &sid); + if (err) + return err; + + mutex_lock(&port->pcie->lock); + + idx = bitmap_find_free_region(port->sid_map, port->sid_map_sz, 0); + if (idx >= 0) { + apple_pcie_rid2sid_write(port, idx, + PORT_RID2SID_VALID | + (sid << PORT_RID2SID_SID_SHIFT) | rid); + + dev_dbg(&pdev->dev, "mapping RID%x to SID%x (index %d)\n", + rid, sid, idx); + } + + mutex_unlock(&port->pcie->lock); + + return idx >= 0 ? 
0 : -ENOSPC; +} + +static void apple_pcie_release_device(struct apple_pcie_port *port, + struct pci_dev *pdev) +{ + u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn); + int idx; + + mutex_lock(&port->pcie->lock); + + for_each_set_bit(idx, port->sid_map, port->sid_map_sz) { + u32 val; + + val = readl_relaxed(port->base + PORT_RID2SID(idx)); + if ((val & 0xffff) == rid) { + apple_pcie_rid2sid_write(port, idx, 0); + bitmap_release_region(port->sid_map, idx, 0); + dev_dbg(&pdev->dev, "Released %x (%d)\n", val, idx); + break; + } + } + + mutex_unlock(&port->pcie->lock); +} + +static int apple_pcie_bus_notifier(struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct apple_pcie_port *port; + int err; + + /* + * This is a bit ugly. We assume that if we get notified for + * any PCI device, we must be in charge of it, and that there + * is no other PCI controller in the whole system. It probably + * holds for now, but who knows for how long? + */ + port = apple_pcie_get_port(pdev); + if (!port) + return NOTIFY_DONE; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + err = apple_pcie_add_device(port, pdev); + if (err) + return notifier_from_errno(err); + break; + case BUS_NOTIFY_DEL_DEVICE: + apple_pcie_release_device(port, pdev); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block apple_pcie_nb = { + .notifier_call = apple_pcie_bus_notifier, +}; + +static int apple_pcie_init(struct pci_config_window *cfg) +{ + struct device *dev = cfg->parent; + struct platform_device *platform = to_platform_device(dev); + struct device_node *of_port; + struct apple_pcie *pcie; + int ret; + + pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + pcie->dev = dev; + + mutex_init(&pcie->lock); + + pcie->base = devm_platform_ioremap_resource(platform, 1); + if (IS_ERR(pcie->base)) + return PTR_ERR(pcie->base); + + cfg->priv = pcie; + INIT_LIST_HEAD(&pcie->ports); + + for_each_child_of_node(dev->of_node, of_port) { + ret = apple_pcie_setup_port(pcie, of_port); + if (ret) { + dev_err(pcie->dev, "Port %pOF setup fail: %d\n", of_port, ret); + of_node_put(of_port); + return ret; + } + } + + return apple_msi_init(pcie); +} + +static int apple_pcie_probe(struct platform_device *pdev) +{ + int ret; + + ret = bus_register_notifier(&pci_bus_type, &apple_pcie_nb); + if (ret) + return ret; + + ret = pci_host_common_probe(pdev); + if (ret) + bus_unregister_notifier(&pci_bus_type, &apple_pcie_nb); + + return ret; +} + +static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = { + .init = apple_pcie_init, + .pci_ops = { + .map_bus = pci_ecam_map_bus, + .read = pci_generic_config_read, + .write = pci_generic_config_write, + } +}; + +static const struct of_device_id apple_pcie_of_match[] = { + { .compatible = "apple,pcie", .data = &apple_pcie_cfg_ecam_ops }, + { } +}; +MODULE_DEVICE_TABLE(of, apple_pcie_of_match); + +static struct platform_driver apple_pcie_driver = { + .probe = apple_pcie_probe, + .driver = { + .name = "pcie-apple", + .of_match_table = apple_pcie_of_match, + .suppress_bind_attrs = true, + }, +}; +module_platform_driver(apple_pcie_driver); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index cc30215f5a43..1fc7bd49a7ad 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -145,7 +145,7 @@ #define BRCM_INT_PCI_MSI_LEGACY_NR 8 
#define BRCM_INT_PCI_MSI_SHIFT 0 -/* MSI target adresses */ +/* MSI target addresses */ #define BRCM_MSI_TARGET_ADDR_LT_4GB 0x0fffffffcULL #define BRCM_MSI_TARGET_ADDR_GT_4GB 0xffffffffcULL diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c index 30ac5fbefbbf..36b9d2c46cfa 100644 --- a/drivers/pci/controller/pcie-iproc.c +++ b/drivers/pci/controller/pcie-iproc.c @@ -249,7 +249,7 @@ enum iproc_pcie_reg { /* * To hold the address of the register where the MSI writes are - * programed. When ARM GICv3 ITS is used, this should be programmed + * programmed. When ARM GICv3 ITS is used, this should be programmed * with the address of the GITS_TRANSLATER register. */ IPROC_PCIE_MSI_ADDR_LO, diff --git a/drivers/staging/mt7621-pci/pci-mt7621.c b/drivers/pci/controller/pcie-mt7621.c similarity index 95% rename from drivers/staging/mt7621-pci/pci-mt7621.c rename to drivers/pci/controller/pcie-mt7621.c index 503cb1fca2e0..b60dfb45ef7b 100644 --- a/drivers/staging/mt7621-pci/pci-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -30,18 +30,18 @@ #include #include -/* MediaTek specific configuration registers */ +/* MediaTek-specific configuration registers */ #define PCIE_FTS_NUM 0x70c #define PCIE_FTS_NUM_MASK GENMASK(15, 8) #define PCIE_FTS_NUM_L0(x) (((x) & 0xff) << 8) /* Host-PCI bridge registers */ #define RALINK_PCI_PCICFG_ADDR 0x0000 -#define RALINK_PCI_PCIMSK_ADDR 0x000C +#define RALINK_PCI_PCIMSK_ADDR 0x000c #define RALINK_PCI_CONFIG_ADDR 0x0020 #define RALINK_PCI_CONFIG_DATA 0x0024 #define RALINK_PCI_MEMBASE 0x0028 -#define RALINK_PCI_IOBASE 0x002C +#define RALINK_PCI_IOBASE 0x002c /* PCIe RC control registers */ #define RALINK_PCI_ID 0x0030 @@ -132,7 +132,7 @@ static inline void pcie_port_write(struct mt7621_pcie_port *port, static inline u32 mt7621_pci_get_cfgaddr(unsigned int bus, unsigned int slot, unsigned int func, unsigned int where) { - return (((where & 0xF00) >> 8) << 24) | (bus << 16) | (slot << 11) | + return (((where & 0xf00) >> 8) << 24) | (bus << 16) | (slot << 11) | (func << 8) | (where & 0xfc) | 0x80000000; } @@ -217,7 +217,7 @@ static int setup_cm_memory_region(struct pci_host_bridge *host) entry = resource_list_first_type(&host->windows, IORESOURCE_MEM); if (!entry) { - dev_err(dev, "Cannot get memory resource\n"); + dev_err(dev, "cannot get memory resource\n"); return -EINVAL; } @@ -280,7 +280,7 @@ static int mt7621_pcie_parse_port(struct mt7621_pcie *pcie, port->gpio_rst = devm_gpiod_get_index_optional(dev, "reset", slot, GPIOD_OUT_LOW); if (IS_ERR(port->gpio_rst)) { - dev_err(dev, "Failed to get GPIO for PCIe%d\n", slot); + dev_err(dev, "failed to get GPIO for PCIe%d\n", slot); err = PTR_ERR(port->gpio_rst); goto remove_reset; } @@ -409,7 +409,7 @@ static int mt7621_pcie_init_ports(struct mt7621_pcie *pcie) err = mt7621_pcie_init_port(port); if (err) { - dev_err(dev, "Initiating port %d failed\n", slot); + dev_err(dev, "initializing port %d failed\n", slot); list_del(&port->list); } } @@ -476,7 +476,7 @@ static int mt7621_pcie_enable_ports(struct pci_host_bridge *host) entry = resource_list_first_type(&host->windows, IORESOURCE_IO); if (!entry) { - dev_err(dev, "Cannot get io resource\n"); + dev_err(dev, "cannot get io resource\n"); return -EINVAL; } @@ -541,25 +541,25 @@ static int mt7621_pci_probe(struct platform_device *pdev) err = mt7621_pcie_parse_dt(pcie); if (err) { - dev_err(dev, "Parsing DT failed\n"); + dev_err(dev, "parsing DT failed\n"); return err; } err = mt7621_pcie_init_ports(pcie); if (err) { - dev_err(dev, "Nothing 
connected in virtual bridges\n"); + dev_err(dev, "nothing connected in virtual bridges\n"); return 0; } err = mt7621_pcie_enable_ports(bridge); if (err) { - dev_err(dev, "Error enabling pcie ports\n"); + dev_err(dev, "error enabling pcie ports\n"); goto remove_resets; } err = setup_cm_memory_region(bridge); if (err) { - dev_err(dev, "Error setting up iocu mem regions\n"); + dev_err(dev, "error setting up iocu mem regions\n"); goto remove_resets; } diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c index aa1cf24a5a72..f9682df1da61 100644 --- a/drivers/pci/controller/pcie-rcar-ep.c +++ b/drivers/pci/controller/pcie-rcar-ep.c @@ -6,16 +6,13 @@ * Author: Lad Prabhakar */ -#include #include #include -#include -#include #include #include #include -#include #include +#include #include "pcie-rcar.h" diff --git a/drivers/pci/controller/pcie-rcar-host.c b/drivers/pci/controller/pcie-rcar-host.c index 8f3131844e77..e12c2d8be05a 100644 --- a/drivers/pci/controller/pcie-rcar-host.c +++ b/drivers/pci/controller/pcie-rcar-host.c @@ -24,13 +24,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include "pcie-rcar.h" diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a5987e52700e..a45e8e59d3d4 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -18,8 +19,6 @@ #include #include -#include -#include #define VMD_CFGBAR 0 #define VMD_MEMBAR1 2 @@ -70,6 +69,8 @@ enum vmd_features { VMD_FEAT_CAN_BYPASS_MSI_REMAP = (1 << 4), }; +static DEFINE_IDA(vmd_instance_ida); + /* * Lock for manipulating VMD IRQ lists. */ @@ -120,6 +121,8 @@ struct vmd_dev { struct pci_bus *bus; u8 busn_start; u8 first_vec; + char *name; + int instance; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -650,7 +653,7 @@ static int vmd_alloc_irqs(struct vmd_dev *vmd) INIT_LIST_HEAD(&vmd->irqs[i].irq_list); err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i), vmd_irq, IRQF_NO_THREAD, - "vmd", &vmd->irqs[i]); + vmd->name, &vmd->irqs[i]); if (err) return err; } @@ -761,7 +764,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * acceptable because the guest is usually CPU-limited and MSI * remapping doesn't become a performance bottleneck. 
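[The vmd hunks in this area introduce an IDA-backed instance number so each controller's IRQ handlers get a distinct name. A minimal sketch of that allocate/name/release pattern, reduced to a hypothetical foo_dev rather than the driver's full code:]

    #include <linux/idr.h>
    #include <linux/slab.h>

    struct foo_dev {
            int instance;
            char *name;
    };

    static DEFINE_IDA(foo_instance_ida);

    static int foo_assign_name(struct foo_dev *foo)
    {
            foo->instance = ida_simple_get(&foo_instance_ida, 0, 0, GFP_KERNEL);
            if (foo->instance < 0)
                    return foo->instance;

            foo->name = kasprintf(GFP_KERNEL, "foo%d", foo->instance);
            if (!foo->name) {
                    ida_simple_remove(&foo_instance_ida, foo->instance);
                    return -ENOMEM;
            }
            return 0;
    }

    static void foo_release_name(struct foo_dev *foo)
    {
            ida_simple_remove(&foo_instance_ida, foo->instance);
            kfree(foo->name);
    }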
*/ - if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + if (iommu_capable(vmd->dev->dev.bus, IOMMU_CAP_INTR_REMAP) || + !(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || offset[0] || offset[1]) { ret = vmd_alloc_irqs(vmd); if (ret) @@ -834,18 +838,32 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) return -ENOMEM; vmd->dev = dev; + vmd->instance = ida_simple_get(&vmd_instance_ida, 0, 0, GFP_KERNEL); + if (vmd->instance < 0) + return vmd->instance; + + vmd->name = kasprintf(GFP_KERNEL, "vmd%d", vmd->instance); + if (!vmd->name) { + err = -ENOMEM; + goto out_release_instance; + } + err = pcim_enable_device(dev); if (err < 0) - return err; + goto out_release_instance; vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0); - if (!vmd->cfgbar) - return -ENOMEM; + if (!vmd->cfgbar) { + err = -ENOMEM; + goto out_release_instance; + } pci_set_master(dev); if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) - return -ENODEV; + dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) { + err = -ENODEV; + goto out_release_instance; + } if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) vmd->first_vec = 1; @@ -854,11 +872,16 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, features); if (err) - return err; + goto out_release_instance; dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n", vmd->sysdata.domain); return 0; + + out_release_instance: + ida_simple_remove(&vmd_instance_ida, vmd->instance); + kfree(vmd->name); + return err; } static void vmd_cleanup_srcu(struct vmd_dev *vmd) @@ -879,6 +902,8 @@ static void vmd_remove(struct pci_dev *dev) vmd_cleanup_srcu(vmd); vmd_detach_resources(vmd); vmd_remove_irq_domain(vmd); + ida_simple_remove(&vmd_instance_ida, vmd->instance); + kfree(vmd->name); } #ifdef CONFIG_PM_SLEEP @@ -903,7 +928,7 @@ static int vmd_resume(struct device *dev) for (i = 0; i < vmd->msix_count; i++) { err = devm_request_irq(dev, pci_irq_vector(pdev, i), vmd_irq, IRQF_NO_THREAD, - "vmd", &vmd->irqs[i]); + vmd->name, &vmd->irqs[i]); if (err) return err; } diff --git a/drivers/pci/endpoint/functions/pci-epf-ntb.c b/drivers/pci/endpoint/functions/pci-epf-ntb.c index 8b4756159f15..5a03401f4571 100644 --- a/drivers/pci/endpoint/functions/pci-epf-ntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-ntb.c @@ -1937,7 +1937,7 @@ static ssize_t epf_ntb_##_name##_show(struct config_item *item, \ struct config_group *group = to_config_group(item); \ struct epf_ntb *ntb = to_epf_ntb(group); \ \ - return sprintf(page, "%d\n", ntb->_name); \ + return sysfs_emit(page, "%d\n", ntb->_name); \ } #define EPF_NTB_W(_name) \ @@ -1947,11 +1947,9 @@ static ssize_t epf_ntb_##_name##_store(struct config_item *item, \ struct config_group *group = to_config_group(item); \ struct epf_ntb *ntb = to_epf_ntb(group); \ u32 val; \ - int ret; \ \ - ret = kstrtou32(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou32(page, 0, &val) < 0) \ + return -EINVAL; \ \ ntb->_name = val; \ \ @@ -1968,7 +1966,7 @@ static ssize_t epf_ntb_##_name##_show(struct config_item *item, \ \ sscanf(#_name, "mw%d", &win_no); \ \ - return sprintf(page, "%lld\n", ntb->mws_size[win_no - 1]); \ + return sysfs_emit(page, "%lld\n", ntb->mws_size[win_no - 1]); \ } #define EPF_NTB_MW_W(_name) \ @@ -1980,11 +1978,9 @@ static ssize_t epf_ntb_##_name##_store(struct config_item *item, \ struct device *dev = &ntb->epf->dev; \ int win_no; \ u64 val; \ - int ret; \ \ - ret = 
kstrtou64(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou64(page, 0, &val) < 0) \ + return -EINVAL; \ \ if (sscanf(#_name, "mw%d", &win_no) != 1) \ return -EINVAL; \ @@ -2005,11 +2001,9 @@ static ssize_t epf_ntb_num_mws_store(struct config_item *item, struct config_group *group = to_config_group(item); struct epf_ntb *ntb = to_epf_ntb(group); u32 val; - int ret; - ret = kstrtou32(page, 0, &val); - if (ret) - return ret; + if (kstrtou32(page, 0, &val) < 0) + return -EINVAL; if (val > MAX_MW) return -EINVAL; diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c index 999911801877..d4850bdd837f 100644 --- a/drivers/pci/endpoint/pci-ep-cfs.c +++ b/drivers/pci/endpoint/pci-ep-cfs.c @@ -175,9 +175,8 @@ static ssize_t pci_epc_start_store(struct config_item *item, const char *page, epc = epc_group->epc; - ret = kstrtobool(page, &start); - if (ret) - return ret; + if (kstrtobool(page, &start) < 0) + return -EINVAL; if (!start) { pci_epc_stop(epc); @@ -198,8 +197,7 @@ static ssize_t pci_epc_start_store(struct config_item *item, const char *page, static ssize_t pci_epc_start_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", - to_pci_epc_group(item)->start); + return sysfs_emit(page, "%d\n", to_pci_epc_group(item)->start); } CONFIGFS_ATTR(pci_epc_, start); @@ -321,7 +319,7 @@ static ssize_t pci_epf_##_name##_show(struct config_item *item, char *page) \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - return sprintf(page, "0x%04x\n", epf->header->_name); \ + return sysfs_emit(page, "0x%04x\n", epf->header->_name); \ } #define PCI_EPF_HEADER_W_u32(_name) \ @@ -329,13 +327,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item, \ const char *page, size_t len) \ { \ u32 val; \ - int ret; \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - ret = kstrtou32(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou32(page, 0, &val) < 0) \ + return -EINVAL; \ epf->header->_name = val; \ return len; \ } @@ -345,13 +341,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item, \ const char *page, size_t len) \ { \ u16 val; \ - int ret; \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - ret = kstrtou16(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou16(page, 0, &val) < 0) \ + return -EINVAL; \ epf->header->_name = val; \ return len; \ } @@ -361,13 +355,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item, \ const char *page, size_t len) \ { \ u8 val; \ - int ret; \ struct pci_epf *epf = to_pci_epf_group(item)->epf; \ if (WARN_ON_ONCE(!epf->header)) \ return -EINVAL; \ - ret = kstrtou8(page, 0, &val); \ - if (ret) \ - return ret; \ + if (kstrtou8(page, 0, &val) < 0) \ + return -EINVAL; \ epf->header->_name = val; \ return len; \ } @@ -376,11 +368,9 @@ static ssize_t pci_epf_msi_interrupts_store(struct config_item *item, const char *page, size_t len) { u8 val; - int ret; - ret = kstrtou8(page, 0, &val); - if (ret) - return ret; + if (kstrtou8(page, 0, &val) < 0) + return -EINVAL; to_pci_epf_group(item)->epf->msi_interrupts = val; @@ -390,19 +380,17 @@ static ssize_t pci_epf_msi_interrupts_store(struct config_item *item, static ssize_t pci_epf_msi_interrupts_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", - to_pci_epf_group(item)->epf->msi_interrupts); + return sysfs_emit(page, "%d\n", + 
to_pci_epf_group(item)->epf->msi_interrupts); } static ssize_t pci_epf_msix_interrupts_store(struct config_item *item, const char *page, size_t len) { u16 val; - int ret; - ret = kstrtou16(page, 0, &val); - if (ret) - return ret; + if (kstrtou16(page, 0, &val) < 0) + return -EINVAL; to_pci_epf_group(item)->epf->msix_interrupts = val; @@ -412,8 +400,8 @@ static ssize_t pci_epf_msix_interrupts_store(struct config_item *item, static ssize_t pci_epf_msix_interrupts_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", - to_pci_epf_group(item)->epf->msix_interrupts); + return sysfs_emit(page, "%d\n", + to_pci_epf_group(item)->epf->msix_interrupts); } PCI_EPF_HEADER_R(vendorid) diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index ecbb0fb3b653..38621558d397 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(pci_epc_linkup); /** * pci_epc_init_notify() - Notify the EPF device that EPC device's core * initialization is completed. - * @epc: the EPC device whose core initialization is completeds + * @epc: the EPC device whose core initialization is completed * * Invoke to Notify the EPF device that the EPC device's initialization * is completed. diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index 8aea16380870..9ed556936f48 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(pci_epf_add_vepf); * be removed * @epf_vf: the virtual EP function to be removed * - * Invoke to remove a virtual endpoint function from the physcial endpoint + * Invoke to remove a virtual endpoint function from the physical endpoint * function. */ void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf) @@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(pci_epf_destroy); /** * pci_epf_create() - create a new PCI EPF device * @name: the name of the PCI EPF device. This name will be used to bind the - * the EPF device to a EPF driver + * EPF device to a EPF driver * * Invoke to create a new PCI EPF device by providing the name of the function * device. diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index f031302ad401..12f4b351be67 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -22,7 +22,7 @@ * when the bridge is scanned and it loses a refcount when the bridge * is removed. * - When a P2P bridge is present, we elevate the refcount on the subordinate - * bus. It loses the refcount when the the driver unloads. + * bus. It loses the refcount when the driver unloads. */ #define pr_fmt(fmt) "acpiphp_glue: " fmt diff --git a/drivers/pci/hotplug/cpqphp.h b/drivers/pci/hotplug/cpqphp.h index 77e4e0142fbc..2f7b49ea96e2 100644 --- a/drivers/pci/hotplug/cpqphp.h +++ b/drivers/pci/hotplug/cpqphp.h @@ -15,7 +15,7 @@ #define _CPQPHP_H #include -#include /* for read? and write? functions */ +#include /* for read? and write? functions */ #include /* for delays */ #include #include /* for signal_pending() */ diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index 1b26ca0b3701..ed7b58eb64d2 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -519,7 +519,7 @@ error: * @head: list to search * @size: size of node to find, must be a power of two. 
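[The endpoint configfs conversions earlier in this chunk all follow one pattern: bounded output via sysfs_emit() and a flat -EINVAL on malformed input. A minimal sketch of such a store/show pair; to_foo() and its structure are hypothetical:]

    static ssize_t foo_count_store(struct config_item *item,
                                   const char *page, size_t len)
    {
            u32 val;

            /* Reject malformed input with -EINVAL rather than forwarding
             * kstrtox()'s errno, matching the conversions above. */
            if (kstrtou32(page, 0, &val) < 0)
                    return -EINVAL;

            to_foo(item)->count = val;
            return len;
    }

    static ssize_t foo_count_show(struct config_item *item, char *page)
    {
            /* sysfs_emit() bounds the output to PAGE_SIZE, unlike sprintf() */
            return sysfs_emit(page, "%u\n", to_foo(item)->count);
    }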
* - * Description: This function sorts the resource list by size and then returns + * Description: This function sorts the resource list by size and then * returns the first node of "size" length that is not in the ISA aliasing * window. If it finds a node larger than "size" it will split it up. */ @@ -1202,7 +1202,7 @@ static u8 set_controller_speed(struct controller *ctrl, u8 adapter_speed, u8 hp_ mdelay(5); - /* Reenable interrupts */ + /* Re-enable interrupts */ writel(0, ctrl->hpc_reg + INT_MASK); pci_write_config_byte(ctrl->pci_dev, 0x41, reg); diff --git a/drivers/pci/hotplug/cpqphp_pci.c b/drivers/pci/hotplug/cpqphp_pci.c index 1b2b3f3b648b..9038039ad6db 100644 --- a/drivers/pci/hotplug/cpqphp_pci.c +++ b/drivers/pci/hotplug/cpqphp_pci.c @@ -189,8 +189,10 @@ int cpqhp_set_irq(u8 bus_num, u8 dev_num, u8 int_pin, u8 irq_num) /* This should only be for x86 as it sets the Edge Level * Control Register */ - outb((u8) (temp_word & 0xFF), 0x4d0); outb((u8) ((temp_word & - 0xFF00) >> 8), 0x4d1); rc = 0; } + outb((u8)(temp_word & 0xFF), 0x4d0); + outb((u8)((temp_word & 0xFF00) >> 8), 0x4d1); + rc = 0; + } return rc; } diff --git a/drivers/pci/hotplug/ibmphp.h b/drivers/pci/hotplug/ibmphp.h index e90a4ebf6550..0399c60d2ec1 100644 --- a/drivers/pci/hotplug/ibmphp.h +++ b/drivers/pci/hotplug/ibmphp.h @@ -352,7 +352,7 @@ struct resource_node { u32 len; int type; /* MEM, IO, PFMEM */ u8 fromMem; /* this is to indicate that the range is from - * from the Memory bucket rather than from PFMem */ + * the Memory bucket rather than from PFMem */ struct resource_node *next; struct resource_node *nextRange; /* for the other mem range on bus */ }; @@ -736,7 +736,7 @@ struct controller { int ibmphp_init_devno(struct slot **); /* This function is called from EBDA, so we need it not be static */ int ibmphp_do_disable_slot(struct slot *slot_cur); -int ibmphp_update_slot_info(struct slot *); /* This function is called from HPC, so we need it to not be be static */ +int ibmphp_update_slot_info(struct slot *); /* This function is called from HPC, so we need it to not be static */ int ibmphp_configure_card(struct pci_func *, u8); int ibmphp_unconfigure_card(struct slot **, int); extern const struct hotplug_slot_ops ibmphp_hotplug_slot_ops; diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 69fd401691be..918dccbc74b6 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -189,6 +189,8 @@ int pciehp_get_attention_status(struct hotplug_slot *hotplug_slot, u8 *status); int pciehp_set_raw_indicator_status(struct hotplug_slot *h_slot, u8 status); int pciehp_get_raw_indicator_status(struct hotplug_slot *h_slot, u8 *status); +int pciehp_slot_reset(struct pcie_device *dev); + static inline const char *slot_name(struct controller *ctrl) { return hotplug_slot_name(&ctrl->hotplug_slot); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index ad3393930ecb..f34114d45259 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -351,6 +351,8 @@ static struct pcie_port_service_driver hpdriver_portdrv = { .runtime_suspend = pciehp_runtime_suspend, .runtime_resume = pciehp_runtime_resume, #endif /* PM */ + + .slot_reset = pciehp_slot_reset, }; int __init pcie_hp_init(void) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 3024d7e85e6a..83a0fa119cae 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -862,6 +862,32 @@ void 
pcie_disable_interrupt(struct controller *ctrl) pcie_write_cmd(ctrl, 0, mask); } +/** + * pciehp_slot_reset() - ignore link event caused by error-induced hot reset + * @dev: PCI Express port service device + * + * Called from pcie_portdrv_slot_reset() after AER or DPC initiated a reset + * further up in the hierarchy to recover from an error. The reset was + * propagated down to this hotplug port. Ignore the resulting link flap. + * If the link failed to retrain successfully, synthesize the ignored event. + * Surprise removal during reset is detected through Presence Detect Changed. + */ +int pciehp_slot_reset(struct pcie_device *dev) +{ + struct controller *ctrl = get_service_data(dev); + + if (ctrl->state != ON_STATE) + return 0; + + pcie_capability_write_word(dev->port, PCI_EXP_SLTSTA, + PCI_EXP_SLTSTA_DLLSC); + + if (!pciehp_check_link_active(ctrl)) + pciehp_request(ctrl, PCI_EXP_SLTSTA_DLLSC); + + return 0; +} + /* * pciehp has a 1:1 bus:slot relationship so we ultimately want a secondary * bus reset of the bridge, but at the same time we want to ensure that it is diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c index 9e3b27744305..bd7557ca4910 100644 --- a/drivers/pci/hotplug/shpchp_hpc.c +++ b/drivers/pci/hotplug/shpchp_hpc.c @@ -295,7 +295,7 @@ static int shpc_write_cmd(struct slot *slot, u8 t_slot, u8 cmd) mutex_lock(&slot->ctrl->cmd_lock); if (!shpc_poll_ctrl_busy(ctrl)) { - /* After 1 sec and and the controller is still busy */ + /* After 1 sec and the controller is still busy */ ctrl_err(ctrl, "Controller is still busy after 1 sec\n"); retval = -EBUSY; goto out; diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index dafdc652fcd0..1d7a7c5b5307 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -164,13 +164,15 @@ static ssize_t sriov_vf_total_msix_show(struct device *dev, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + struct pci_driver *pdrv; u32 vf_total_msix = 0; device_lock(dev); - if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix) + pdrv = to_pci_driver(dev->driver); + if (!pdrv || !pdrv->sriov_get_vf_total_msix) goto unlock; - vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev); + vf_total_msix = pdrv->sriov_get_vf_total_msix(pdev); unlock: device_unlock(dev); return sysfs_emit(buf, "%u\n", vf_total_msix); @@ -183,23 +185,24 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev, { struct pci_dev *vf_dev = to_pci_dev(dev); struct pci_dev *pdev = pci_physfn(vf_dev); - int val, ret; + struct pci_driver *pdrv; + int val, ret = 0; - ret = kstrtoint(buf, 0, &val); - if (ret) - return ret; + if (kstrtoint(buf, 0, &val) < 0) + return -EINVAL; if (val < 0) return -EINVAL; device_lock(&pdev->dev); - if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) { + pdrv = to_pci_driver(dev->driver); + if (!pdrv || !pdrv->sriov_set_msix_vec_count) { ret = -EOPNOTSUPP; goto err_pdev; } device_lock(&vf_dev->dev); - if (vf_dev->driver) { + if (to_pci_driver(vf_dev->dev.driver)) { /* * A driver is already attached to this VF and has configured * itself based on the current MSI-X vector count. 
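[A recurring conversion in these hunks replaces the removed pci_dev->driver pointer with to_pci_driver(dev->driver) from the driver core. The dispatch idiom, as a minimal sketch (the wrapper name is hypothetical; the callback member is real):]

    static int foo_disable_sriov(struct pci_dev *pdev)
    {
            struct pci_driver *pdrv;
            int ret = -EOPNOTSUPP;

            /* The device lock keeps the driver from unbinding under us */
            device_lock(&pdev->dev);
            pdrv = to_pci_driver(pdev->dev.driver);
            if (pdrv && pdrv->sriov_configure)
                    ret = pdrv->sriov_configure(pdev, 0);
            device_unlock(&pdev->dev);

            return ret;
    }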
Changing @@ -209,7 +212,7 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev, goto err_dev; } - ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val); + ret = pdrv->sriov_set_msix_vec_count(vf_dev, val); err_dev: device_unlock(&vf_dev->dev); @@ -376,12 +379,12 @@ static ssize_t sriov_numvfs_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - int ret; + struct pci_driver *pdrv; + int ret = 0; u16 num_vfs; - ret = kstrtou16(buf, 0, &num_vfs); - if (ret < 0) - return ret; + if (kstrtou16(buf, 0, &num_vfs) < 0) + return -EINVAL; if (num_vfs > pci_sriov_get_totalvfs(pdev)) return -ERANGE; @@ -392,14 +395,15 @@ static ssize_t sriov_numvfs_store(struct device *dev, goto exit; /* is PF driver loaded */ - if (!pdev->driver) { + pdrv = to_pci_driver(dev->driver); + if (!pdrv) { pci_info(pdev, "no driver bound to device; cannot configure SR-IOV\n"); ret = -ENOENT; goto exit; } /* is PF driver loaded w/callback */ - if (!pdev->driver->sriov_configure) { + if (!pdrv->sriov_configure) { pci_info(pdev, "driver does not support SR-IOV configuration via sysfs\n"); ret = -ENOENT; goto exit; @@ -407,7 +411,7 @@ static ssize_t sriov_numvfs_store(struct device *dev, if (num_vfs == 0) { /* disable VFs */ - ret = pdev->driver->sriov_configure(pdev, 0); + ret = pdrv->sriov_configure(pdev, 0); goto exit; } @@ -419,7 +423,7 @@ static ssize_t sriov_numvfs_store(struct device *dev, goto exit; } - ret = pdev->driver->sriov_configure(pdev, num_vfs); + ret = pdrv->sriov_configure(pdev, num_vfs); if (ret < 0) goto exit; diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 4b4792940e86..12e296d634eb 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -582,7 +582,8 @@ err: return ret; } -static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) +static void __iomem *msix_map_region(struct pci_dev *dev, + unsigned int nr_entries) { resource_size_t phys_addr; u32 table_offset; diff --git a/drivers/pci/of.c b/drivers/pci/of.c index d84381ce82b5..0b1237cff239 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -423,7 +423,7 @@ failed: */ static int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args *out_irq) { - struct device_node *dn, *ppnode; + struct device_node *dn, *ppnode = NULL; struct pci_dev *ppdev; __be32 laddr[3]; u8 pin; @@ -452,8 +452,14 @@ static int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args * if (pin == 0) return -ENODEV; + /* Local interrupt-map in the device node? Use it! */ + if (of_get_property(dn, "interrupt-map", NULL)) { + pin = pci_swizzle_interrupt_pin(pdev, pin); + ppnode = dn; + } + /* Now we walk up the PCI tree */ - for (;;) { + while (!ppnode) { /* Get the pci_dev of our parent */ ppdev = pdev->bus->self; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 50cdde3e9a8b..8d47cb7218d1 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -874,7 +874,7 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, int i; for_each_sg(sg, s, nents, i) { - s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset; + s->dma_address = sg_phys(s) + p2p_pgmap->bus_offset; sg_dma_len(s) = s->length; } @@ -943,7 +943,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); * * Parses an attribute value to decide whether to enable p2pdma. * The value can select a PCI device (using its full BDF device - * name) or a boolean (in any format strtobool() accepts). A false + * name) or a boolean (in any format kstrtobool() accepts). 
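[For reference, a minimal sketch of the kstrtobool() usage the comment above describes; foo_set_enabled() is a hypothetical helper:]

    static ssize_t foo_enable_store(struct device *dev,
                                    struct device_attribute *attr,
                                    const char *buf, size_t count)
    {
            bool enable;

            /* Strings starting with "1"/"y"/"on" parse as true, those with
             * "0"/"n"/"off" as false; anything else fails with -EINVAL. */
            if (kstrtobool(buf, &enable) < 0)
                    return -EINVAL;

            foo_set_enabled(dev, enable);
            return count;
    }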
A false * value disables p2pdma, a true value expects the caller * to automatically find a compatible device and specifying a PCI device * expects the caller to use the specific provider. @@ -975,11 +975,11 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) { /* * If the user enters a PCI device that doesn't exist - * like "0000:01:00.1", we don't want strtobool to think + * like "0000:01:00.1", we don't want kstrtobool to think * it's a '0' when it's clearly not what the user wanted. * So we require 0's and 1's to be exactly one character. */ - } else if (!strtobool(page, use_p2pdma)) { + } else if (!kstrtobool(page, use_p2pdma)) { return 0; } diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index fdaf86a888b7..db97cddfc85e 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -431,8 +431,21 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where, /* Clear the W1C bits */ new &= ~((value << shift) & (behavior[reg / 4].w1c & mask)); + /* Save the new value with the cleared W1C bits into the cfgspace */ cfgspace[reg / 4] = cpu_to_le32(new); + /* + * Clear the W1C bits not specified by the write mask, so that the + * write_op() does not clear them. + */ + new &= ~(behavior[reg / 4].w1c & ~mask); + + /* + * Set the W1C bits specified by the write mask, so that write_op() + * knows about that they are to be cleared. + */ + new |= (value << shift) & (behavior[reg / 4].w1c & mask); + if (write_op) write_op(bridge, reg, old, new, mask); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index aa71a0c904f3..b442f6927ec0 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -348,12 +348,10 @@ static long local_pci_probe(void *_ddi) * its remove routine. */ pm_runtime_get_sync(dev); - pci_dev->driver = pci_drv; rc = pci_drv->probe(pci_dev, ddi->id); if (!rc) return rc; if (rc < 0) { - pci_dev->driver = NULL; pm_runtime_put_sync(dev); return rc; } @@ -419,14 +417,13 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, * @pci_dev: PCI device being probed * * returns 0 on success, else error. - * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. 
*/ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) { const struct pci_device_id *id; int error = 0; - if (!pci_dev->driver && drv->probe) { + if (drv->probe) { error = -ENODEV; id = pci_match_device(drv, pci_dev); @@ -486,18 +483,15 @@ static int pci_device_probe(struct device *dev) static void pci_device_remove(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); - if (drv) { - if (drv->remove) { - pm_runtime_get_sync(dev); - drv->remove(pci_dev); - pm_runtime_put_noidle(dev); - } - pcibios_free_irq(pci_dev); - pci_dev->driver = NULL; - pci_iov_remove(pci_dev); + if (drv->remove) { + pm_runtime_get_sync(dev); + drv->remove(pci_dev); + pm_runtime_put_noidle(dev); } + pcibios_free_irq(pci_dev); + pci_iov_remove(pci_dev); /* Undo the runtime PM settings in local_pci_probe() */ pm_runtime_put_sync(dev); @@ -524,7 +518,7 @@ static void pci_device_remove(struct device *dev) static void pci_device_shutdown(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); pm_runtime_resume(dev); @@ -605,7 +599,7 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev) { int retval; - /* if the device was enabled before suspend, reenable */ + /* if the device was enabled before suspend, re-enable */ retval = pci_reenable_device(pci_dev); /* * if the device was busmaster before the suspend, make it busmaster @@ -620,7 +614,7 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev) static int pci_legacy_suspend(struct device *dev, pm_message_t state) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); if (drv && drv->suspend) { pci_power_t prev = pci_dev->current_state; @@ -661,7 +655,7 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state) static int pci_legacy_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(dev->driver); pci_fixup_device(pci_fixup_resume, pci_dev); @@ -680,7 +674,7 @@ static void pci_pm_default_suspend(struct pci_dev *pci_dev) static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) { - struct pci_driver *drv = pci_dev->driver; + struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver); bool ret = drv && (drv->suspend || drv->resume); /* @@ -1273,11 +1267,11 @@ static int pci_pm_runtime_suspend(struct device *dev) int error; /* - * If pci_dev->driver is not set (unbound), we leave the device in D0, - * but it may go to D3cold when the bridge above it runtime suspends. - * Save its config space in case that happens. + * If the device has no driver, we leave it in D0, but it may go to + * D3cold when the bridge above it runtime suspends. Save its + * config space in case that happens. 
*/ - if (!pci_dev->driver) { + if (!to_pci_driver(dev->driver)) { pci_save_state(pci_dev); return 0; } @@ -1334,7 +1328,7 @@ static int pci_pm_runtime_resume(struct device *dev) */ pci_restore_standard_config(pci_dev); - if (!pci_dev->driver) + if (!to_pci_driver(dev->driver)) return 0; pci_fixup_device(pci_fixup_resume_early, pci_dev); @@ -1353,14 +1347,13 @@ static int pci_pm_runtime_resume(struct device *dev) static int pci_pm_runtime_idle(struct device *dev) { - struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* - * If pci_dev->driver is not set (unbound), the device should - * always remain in D0 regardless of the runtime PM status + * If the device has no driver, it should always remain in D0 + * regardless of the runtime PM status */ - if (!pci_dev->driver) + if (!to_pci_driver(dev->driver)) return 0; if (!pm) @@ -1467,8 +1460,10 @@ static struct pci_driver pci_compat_driver = { */ struct pci_driver *pci_dev_driver(const struct pci_dev *dev) { - if (dev->driver) - return dev->driver; + struct pci_driver *drv = to_pci_driver(dev->dev.driver); + + if (drv) + return drv; else { int i; for (i = 0; i <= PCI_ROM_RESOURCE; i++) @@ -1571,7 +1566,7 @@ static int pci_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH) +#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index f807b92afa6c..cfe2f85af09e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "pci.h" @@ -49,7 +50,28 @@ pci_config_attr(subsystem_vendor, "0x%04x\n"); pci_config_attr(subsystem_device, "0x%04x\n"); pci_config_attr(revision, "0x%02x\n"); pci_config_attr(class, "0x%06x\n"); -pci_config_attr(irq, "%u\n"); + +static ssize_t irq_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + +#ifdef CONFIG_PCI_MSI + /* + * For MSI, show the first MSI IRQ; for all other cases including + * MSI-X, show the legacy INTx IRQ. 
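[On the driver side, the IRQ this attribute reports in the MSI case is the one a driver obtains for vector 0. A minimal probe sketch under that assumption; the foo names are hypothetical and the handler body is omitted:]

    static irqreturn_t foo_isr(int irq, void *data);    /* body omitted */

    static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    {
            int nvec, irq;

            nvec = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
            if (nvec < 0)
                    return nvec;

            /* With MSI enabled, this is the IRQ the sysfs "irq" file shows */
            irq = pci_irq_vector(pdev, 0);
            return devm_request_irq(&pdev->dev, irq, foo_isr, 0, "foo", pdev);
    }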
+ */ + if (pdev->msi_enabled) { + struct msi_desc *desc = first_pci_msi_entry(pdev); + + return sysfs_emit(buf, "%u\n", desc->irq); + } +#endif + + return sysfs_emit(buf, "%u\n", pdev->irq); +} +static DEVICE_ATTR_RO(irq); static ssize_t broken_parity_status_show(struct device *dev, struct device_attribute *attr, @@ -275,15 +297,15 @@ static ssize_t enable_store(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); unsigned long val; - ssize_t result = kstrtoul(buf, 0, &val); - - if (result < 0) - return result; + ssize_t result = 0; /* this can crash the machine when done on the "wrong" device */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + device_lock(dev); if (dev->driver) result = -EBUSY; @@ -314,14 +336,13 @@ static ssize_t numa_node_store(struct device *dev, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - int node, ret; + int node; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = kstrtoint(buf, 0, &node); - if (ret) - return ret; + if (kstrtoint(buf, 0, &node) < 0) + return -EINVAL; if ((node < 0 && node != NUMA_NO_NODE) || node >= MAX_NUMNODES) return -EINVAL; @@ -380,12 +401,12 @@ static ssize_t msi_bus_store(struct device *dev, struct device_attribute *attr, struct pci_bus *subordinate = pdev->subordinate; unsigned long val; - if (kstrtoul(buf, 0, &val) < 0) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + /* * "no_msi" and "bus_flags" only affect what happens when a driver * requests MSI or MSI-X. They don't affect any drivers that have @@ -1341,10 +1362,10 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr, { struct pci_dev *pdev = to_pci_dev(dev); unsigned long val; - ssize_t result = kstrtoul(buf, 0, &val); + ssize_t result; - if (result < 0) - return result; + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; if (val != 1) return -EINVAL; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 39c65b5f92c1..da75c422ba85 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -269,7 +269,7 @@ static int pci_dev_str_match_path(struct pci_dev *dev, const char *path, const char **endptr) { int ret; - int seg, bus, slot, func; + unsigned int seg, bus, slot, func; char *wpath, *p; char end; @@ -1439,6 +1439,24 @@ static int pci_save_pcie_state(struct pci_dev *dev) return 0; } +void pci_bridge_reconfigure_ltr(struct pci_dev *dev) +{ +#ifdef CONFIG_PCIEASPM + struct pci_dev *bridge; + u32 ctl; + + bridge = pci_upstream_bridge(dev); + if (bridge && bridge->ltr_path) { + pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl); + if (!(ctl & PCI_EXP_DEVCTL2_LTR_EN)) { + pci_dbg(bridge, "re-enabling LTR\n"); + pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + } + } +#endif +} + static void pci_restore_pcie_state(struct pci_dev *dev) { int i = 0; @@ -1449,6 +1467,13 @@ static void pci_restore_pcie_state(struct pci_dev *dev) if (!save_state) return; + /* + * Downstream ports reset the LTR enable bit when link goes down. + * Check and re-configure the bit here before restoring device. + * PCIe r5.0, sec 7.5.3.16. 
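[pcie_capability_set_word(), used by pci_bridge_reconfigure_ltr() above, is a read-modify-write helper over the PCIe capability; spelled out, it is roughly:]

    u16 ctl;

    pcie_capability_read_word(bridge, PCI_EXP_DEVCTL2, &ctl);
    ctl |= PCI_EXP_DEVCTL2_LTR_EN;              /* set only the LTR bit */
    pcie_capability_write_word(bridge, PCI_EXP_DEVCTL2, ctl);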
+ */ + pci_bridge_reconfigure_ltr(dev); + cap = (u16 *)&save_state->cap.data[0]; pcie_capability_write_word(dev, PCI_EXP_DEVCTL, cap[i++]); pcie_capability_write_word(dev, PCI_EXP_LNKCTL, cap[i++]); @@ -2053,14 +2078,14 @@ void pcim_pin_device(struct pci_dev *pdev) EXPORT_SYMBOL(pcim_pin_device); /* - * pcibios_add_device - provide arch specific hooks when adding device dev + * pcibios_device_add - provide arch specific hooks when adding device dev * @dev: the PCI device being added * * Permits the platform to provide architecture specific functionality when * devices are added. This is the default implementation. Architecture * implementations can override this. */ -int __weak pcibios_add_device(struct pci_dev *dev) +int __weak pcibios_device_add(struct pci_dev *dev) { return 0; } @@ -2180,6 +2205,7 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) } EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state); +#ifdef CONFIG_PCIEAER void pcie_clear_device_status(struct pci_dev *dev) { u16 sta; @@ -2187,6 +2213,7 @@ void pcie_clear_device_status(struct pci_dev *dev) pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); } +#endif /** * pcie_clear_root_pme_status - Clear root port PME interrupt status. @@ -3697,6 +3724,14 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) struct pci_dev *bridge; u32 cap, ctl2; + /* + * Per PCIe r5.0, sec 9.3.5.10, the AtomicOp Requester Enable bit + * in Device Control 2 is reserved in VFs and the PF value applies + * to all associated VFs. + */ + if (dev->is_virtfn) + return -EINVAL; + if (!pci_is_pcie(dev)) return -EINVAL; @@ -5068,13 +5103,14 @@ EXPORT_SYMBOL_GPL(pci_dev_unlock); static void pci_dev_save_and_disable(struct pci_dev *dev) { + struct pci_driver *drv = to_pci_driver(dev->dev.driver); const struct pci_error_handlers *err_handler = - dev->driver ? dev->driver->err_handler : NULL; + drv ? drv->err_handler : NULL; /* - * dev->driver->err_handler->reset_prepare() is protected against - * races with ->remove() by the device lock, which must be held by - * the caller. + * drv->err_handler->reset_prepare() is protected against races + * with ->remove() by the device lock, which must be held by the + * caller. */ if (err_handler && err_handler->reset_prepare) err_handler->reset_prepare(dev); @@ -5099,15 +5135,15 @@ static void pci_dev_save_and_disable(struct pci_dev *dev) static void pci_dev_restore(struct pci_dev *dev) { + struct pci_driver *drv = to_pci_driver(dev->dev.driver); const struct pci_error_handlers *err_handler = - dev->driver ? dev->driver->err_handler : NULL; + drv ? drv->err_handler : NULL; pci_restore_state(dev); /* - * dev->driver->err_handler->reset_done() is protected against - * races with ->remove() by the device lock, which must be held by - * the caller. + * drv->err_handler->reset_done() is protected against races with + * ->remove() by the device lock, which must be held by the caller. */ if (err_handler && err_handler->reset_done) err_handler->reset_done(dev); @@ -5268,7 +5304,7 @@ const struct attribute_group pci_dev_reset_method_attr_group = { */ int __pci_reset_function_locked(struct pci_dev *dev) { - int i, m, rc = -ENOTTY; + int i, m, rc; might_sleep(); @@ -6304,11 +6340,12 @@ EXPORT_SYMBOL_GPL(pci_pr3_present); * cannot be left as a userspace activity). DMA aliases should therefore * be configured via quirks, such as the PCI fixup header quirk. 
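[As the comment above notes, DMA aliases are meant to be set up from quirks such as the header fixup; a minimal sketch with made-up vendor/device IDs:]

    static void quirk_foo_dma_alias(struct pci_dev *dev)
    {
            /* The device also masters DMA as function 1 of its own slot */
            pci_add_dma_alias(dev, PCI_DEVFN(PCI_SLOT(dev->devfn), 1), 1);
    }
    DECLARE_PCI_FIXUP_HEADER(0x1234, 0x5678, quirk_foo_dma_alias);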
*/ -void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from, unsigned nr_devfns) +void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from, + unsigned int nr_devfns) { int devfn_to; - nr_devfns = min(nr_devfns, (unsigned) MAX_NR_DEVFNS - devfn_from); + nr_devfns = min(nr_devfns, (unsigned int)MAX_NR_DEVFNS - devfn_from); devfn_to = devfn_from + nr_devfns - 1; if (!dev->dma_alias_mask) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index bd39098c57da..3d60cabde1a1 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -86,6 +86,7 @@ void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev); +void pci_bridge_reconfigure_ltr(struct pci_dev *dev); static inline void pci_wakeup_event(struct pci_dev *dev) { diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index b2980db88cc0..5783a2f79e6a 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -2,12 +2,12 @@ # # Makefile for PCI Express features and port driver -pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o rcec.o +pcieportdrv-y := portdrv_core.o portdrv_pci.o rcec.o obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o obj-$(CONFIG_PCIEASPM) += aspm.o -obj-$(CONFIG_PCIEAER) += aer.o +obj-$(CONFIG_PCIEAER) += aer.o err.o obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 9784fdcf3006..9fa1f97e5b27 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -57,7 +57,7 @@ struct aer_stats { * "as seen by this device". Note that this may mean that if an * end point is causing problems, the AER counters may increment * at its link partner (e.g. root port) because the errors will be - * "seen" by the link partner and not the the problematic end point + * "seen" by the link partner and not the problematic end point * itself (which may report all counters as 0 as it never saw any * problems). 
*/ diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 013a47f587ce..52c74682601a 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1219,7 +1219,7 @@ static ssize_t aspm_attr_store_common(struct device *dev, struct pcie_link_state *link = pcie_aspm_get_link(pdev); bool state_enable; - if (strtobool(buf, &state_enable) < 0) + if (kstrtobool(buf, &state_enable) < 0) return -EINVAL; down_read(&pci_bus_sem); @@ -1276,7 +1276,7 @@ static ssize_t clkpm_store(struct device *dev, struct pcie_link_state *link = pcie_aspm_get_link(pdev); bool state_enable; - if (strtobool(buf, &state_enable) < 0) + if (kstrtobool(buf, &state_enable) < 0) return -EINVAL; down_read(&pci_bus_sem); diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index b576aa890c76..356b9317297e 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -49,14 +49,16 @@ static int report_error_detected(struct pci_dev *dev, pci_channel_state_t state, enum pci_ers_result *result) { + struct pci_driver *pdrv; pci_ers_result_t vote; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); + pdrv = to_pci_driver(dev->dev.driver); if (!pci_dev_set_io_state(dev, state) || - !dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->error_detected) { + !pdrv || + !pdrv->err_handler || + !pdrv->err_handler->error_detected) { /* * If any device in the subtree does not have an error_detected * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent @@ -70,7 +72,7 @@ static int report_error_detected(struct pci_dev *dev, vote = PCI_ERS_RESULT_NONE; } } else { - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; vote = err_handler->error_detected(dev, state); } pci_uevent_ers(dev, vote); @@ -91,16 +93,18 @@ static int report_normal_detected(struct pci_dev *dev, void *data) static int report_mmio_enabled(struct pci_dev *dev, void *data) { + struct pci_driver *pdrv; pci_ers_result_t vote, *result = data; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - if (!dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->mmio_enabled) + pdrv = to_pci_driver(dev->dev.driver); + if (!pdrv || + !pdrv->err_handler || + !pdrv->err_handler->mmio_enabled) goto out; - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; vote = err_handler->mmio_enabled(dev); *result = merge_result(*result, vote); out: @@ -110,16 +114,18 @@ out: static int report_slot_reset(struct pci_dev *dev, void *data) { + struct pci_driver *pdrv; pci_ers_result_t vote, *result = data; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); - if (!dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->slot_reset) + pdrv = to_pci_driver(dev->dev.driver); + if (!pdrv || + !pdrv->err_handler || + !pdrv->err_handler->slot_reset) goto out; - err_handler = dev->driver->err_handler; + err_handler = pdrv->err_handler; vote = err_handler->slot_reset(dev); *result = merge_result(*result, vote); out: @@ -129,16 +135,18 @@ out: static int report_resume(struct pci_dev *dev, void *data) { + struct pci_driver *pdrv; const struct pci_error_handlers *err_handler; device_lock(&dev->dev); + pdrv = to_pci_driver(dev->dev.driver); if (!pci_dev_set_io_state(dev, pci_channel_io_normal) || - !dev->driver || - !dev->driver->err_handler || - !dev->driver->err_handler->resume) + !pdrv || + !pdrv->err_handler || + !pdrv->err_handler->resume) goto out; - err_handler = dev->driver->err_handler; + err_handler = 
pdrv->err_handler; err_handler->resume(dev); out: pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 2ff5724b8f13..0ef4bf5f811d 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -85,8 +85,7 @@ struct pcie_port_service_driver { int (*runtime_suspend)(struct pcie_device *dev); int (*runtime_resume)(struct pcie_device *dev); - /* Device driver may resume normal operations */ - void (*error_resume)(struct pci_dev *dev); + int (*slot_reset)(struct pcie_device *dev); int port_type; /* Type of the port this driver can handle */ u32 service; /* Port service this device represents */ @@ -110,6 +109,7 @@ void pcie_port_service_unregister(struct pcie_port_service_driver *new); extern struct bus_type pcie_port_bus_type; int pcie_port_device_register(struct pci_dev *dev); +int pcie_port_device_iter(struct device *dev, void *data); #ifdef CONFIG_PM int pcie_port_device_suspend(struct device *dev); int pcie_port_device_resume_noirq(struct device *dev); @@ -118,8 +118,6 @@ int pcie_port_device_runtime_suspend(struct device *dev); int pcie_port_device_runtime_resume(struct device *dev); #endif void pcie_port_device_remove(struct pci_dev *dev); -int __must_check pcie_port_bus_register(void); -void pcie_port_bus_unregister(void); struct pci_dev; diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 3ee63968deaa..bda630889f95 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -166,9 +166,6 @@ static int pcie_init_service_irqs(struct pci_dev *dev, int *irqs, int mask) { int ret, i; - for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++) - irqs[i] = -1; - /* * If we support PME but can't use MSI/MSI-X for it, we have to * fall back to INTx or other interrupts, e.g., a system shared @@ -317,8 +314,10 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq) */ int pcie_port_device_register(struct pci_dev *dev) { - int status, capabilities, i, nr_service; - int irqs[PCIE_PORT_DEVICE_MAXSERVICES]; + int status, capabilities, irq_services, i, nr_service; + int irqs[PCIE_PORT_DEVICE_MAXSERVICES] = { + [0 ... PCIE_PORT_DEVICE_MAXSERVICES-1] = -1 + }; /* Enable PCI Express port device */ status = pci_enable_device(dev); @@ -331,18 +330,32 @@ int pcie_port_device_register(struct pci_dev *dev) return 0; pci_set_master(dev); - /* - * Initialize service irqs. Don't use service devices that - * require interrupts if there is no way to generate them. - * However, some drivers may have a polling mode (e.g. pciehp_poll_mode) - * that can be used in the absence of irqs. Allow them to determine - * if that is to be used. - */ - status = pcie_init_service_irqs(dev, irqs, capabilities); - if (status) { - capabilities &= PCIE_PORT_SERVICE_HP; - if (!capabilities) - goto error_disable; + + irq_services = 0; + if (IS_ENABLED(CONFIG_PCIE_PME)) + irq_services |= PCIE_PORT_SERVICE_PME; + if (IS_ENABLED(CONFIG_PCIEAER)) + irq_services |= PCIE_PORT_SERVICE_AER; + if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + irq_services |= PCIE_PORT_SERVICE_HP; + if (IS_ENABLED(CONFIG_PCIE_DPC)) + irq_services |= PCIE_PORT_SERVICE_DPC; + irq_services &= capabilities; + + if (irq_services) { + /* + * Initialize service IRQs. Don't use service devices that + * require interrupts if there is no way to generate them. + * However, some drivers may have a polling mode (e.g. + * pciehp_poll_mode) that can be used in the absence of IRQs. + * Allow them to determine if that is to be used. 
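[Two constructs in the hunk above are worth isolating: the GNU range designator that pre-fills the IRQ array, and IS_ENABLED() collapsing to a compile-time constant. A sketch with hypothetical service flags:]

    #define MAX_SERVICES    4               /* hypothetical */
    #define FOO_SERVICE     0x1             /* hypothetical */
    #define BAR_SERVICE     0x2             /* hypothetical */

    /* GNU range designator: every service slot starts out as "no IRQ" */
    int foo_irqs[MAX_SERVICES] = { [0 ... MAX_SERVICES - 1] = -1 };

    u32 foo_irq_services(u32 capabilities)
    {
            u32 mask = 0;

            /* IS_ENABLED(CONFIG_x) is 1 for =y or =m and 0 otherwise, so
             * branches for services that are not built in are discarded */
            if (IS_ENABLED(CONFIG_FOO))
                    mask |= FOO_SERVICE;
            if (IS_ENABLED(CONFIG_BAR))
                    mask |= BAR_SERVICE;

            return mask & capabilities;
    }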
+ */ + status = pcie_init_service_irqs(dev, irqs, irq_services); + if (status) { + irq_services &= PCIE_PORT_SERVICE_HP; + if (!irq_services) + goto error_disable; + } } /* Allocate child services if any */ @@ -367,24 +380,24 @@ error_disable: return status; } -#ifdef CONFIG_PM -typedef int (*pcie_pm_callback_t)(struct pcie_device *); +typedef int (*pcie_callback_t)(struct pcie_device *); -static int pm_iter(struct device *dev, void *data) +int pcie_port_device_iter(struct device *dev, void *data) { struct pcie_port_service_driver *service_driver; size_t offset = *(size_t *)data; - pcie_pm_callback_t cb; + pcie_callback_t cb; if ((dev->bus == &pcie_port_bus_type) && dev->driver) { service_driver = to_service_driver(dev->driver); - cb = *(pcie_pm_callback_t *)((void *)service_driver + offset); + cb = *(pcie_callback_t *)((void *)service_driver + offset); if (cb) return cb(to_pcie_device(dev)); } return 0; } +#ifdef CONFIG_PM /** * pcie_port_device_suspend - suspend port services associated with a PCIe port * @dev: PCI Express port to handle @@ -392,13 +405,13 @@ static int pm_iter(struct device *dev, void *data) int pcie_port_device_suspend(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, suspend); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } int pcie_port_device_resume_noirq(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume_noirq); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } /** @@ -408,7 +421,7 @@ int pcie_port_device_resume_noirq(struct device *dev) int pcie_port_device_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, resume); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } /** @@ -418,7 +431,7 @@ int pcie_port_device_resume(struct device *dev) int pcie_port_device_runtime_suspend(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_suspend); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } /** @@ -428,7 +441,7 @@ int pcie_port_device_runtime_suspend(struct device *dev) int pcie_port_device_runtime_resume(struct device *dev) { size_t off = offsetof(struct pcie_port_service_driver, runtime_resume); - return device_for_each_child(dev, &off, pm_iter); + return device_for_each_child(dev, &off, pcie_port_device_iter); } #endif /* PM */ diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index c7ff1eea225a..35eca6277a96 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -160,6 +160,9 @@ static pci_ers_result_t pcie_portdrv_error_detected(struct pci_dev *dev, static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) { + size_t off = offsetof(struct pcie_port_service_driver, slot_reset); + device_for_each_child(&dev->dev, &off, pcie_port_device_iter); + pci_restore_state(dev); pci_save_state(dev); return PCI_ERS_RESULT_RECOVERED; @@ -170,29 +173,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev) return PCI_ERS_RESULT_RECOVERED; } -static int resume_iter(struct device *device, void *data) -{ - struct pcie_device *pcie_device; - struct pcie_port_service_driver *driver; - - if (device->bus == &pcie_port_bus_type && device->driver) { - driver = to_service_driver(device->driver); - if (driver 
&& driver->error_resume) { - pcie_device = to_pcie_device(device); - - /* Forward error message to service drivers */ - driver->error_resume(pcie_device->port); - } - } - - return 0; -} - -static void pcie_portdrv_err_resume(struct pci_dev *dev) -{ - device_for_each_child(&dev->dev, NULL, resume_iter); -} - /* * LINUX Device Driver Model */ @@ -210,7 +190,6 @@ static const struct pci_error_handlers pcie_portdrv_err_handler = { .error_detected = pcie_portdrv_error_detected, .slot_reset = pcie_portdrv_slot_reset, .mmio_enabled = pcie_portdrv_mmio_enabled, - .resume = pcie_portdrv_err_resume, }; static struct pci_driver pcie_portdriver = { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d9fc02a71baa..087d3658f75c 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -883,11 +883,11 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus) static int pci_register_host_bridge(struct pci_host_bridge *bridge) { struct device *parent = bridge->dev.parent; - struct resource_entry *window, *n; + struct resource_entry *window, *next, *n; struct pci_bus *bus, *b; - resource_size_t offset; + resource_size_t offset, next_offset; LIST_HEAD(resources); - struct resource *res; + struct resource *res, *next_res; char addr[64], *fmt; const char *name; int err; @@ -970,11 +970,34 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE) dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n"); - /* Add initial resources to the bus */ + /* Coalesce contiguous windows */ resource_list_for_each_entry_safe(window, n, &resources) { - list_move_tail(&window->node, &bridge->windows); + if (list_is_last(&window->node, &resources)) + break; + + next = list_next_entry(window, node); offset = window->offset; res = window->res; + next_offset = next->offset; + next_res = next->res; + + if (res->flags != next_res->flags || offset != next_offset) + continue; + + if (res->end + 1 == next_res->start) { + next_res->start = res->start; + res->flags = res->start = res->end = 0; + } + } + + /* Add initial resources to the bus */ + resource_list_for_each_entry_safe(window, n, &resources) { + offset = window->offset; + res = window->res; + if (!res->end) + continue; + + list_move_tail(&window->node, &bridge->windows); if (res->flags & IORESOURCE_BUS) pci_bus_insert_busn_res(bus, bus->number, res->end); @@ -2168,9 +2191,21 @@ static void pci_configure_ltr(struct pci_dev *dev) * Complex and all intermediate Switches indicate support for LTR. * PCIe r4.0, sec 6.18. */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - ((bridge = pci_upstream_bridge(dev)) && - bridge->ltr_path)) { + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) { + pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); + dev->ltr_path = 1; + return; + } + + /* + * If we're configuring a hot-added device, LTR was likely + * disabled in the upstream bridge, so re-enable it before enabling + * it in the new device. + */ + bridge = pci_upstream_bridge(dev); + if (bridge && bridge->ltr_path) { + pci_bridge_reconfigure_ltr(dev); pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_LTR_EN); dev->ltr_path = 1; @@ -2450,7 +2485,7 @@ static struct irq_domain *pci_dev_msi_domain(struct pci_dev *dev) struct irq_domain *d; /* - * If a domain has been set through the pcibios_add_device() + * If a domain has been set through the pcibios_device_add() * callback, then this is the one (platform code knows best). 
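[pcibios_device_add(), named in the comment above, is the renamed weak hook from earlier in this series. Overriding it works by the usual weak-symbol rule, sketched here with a hypothetical per-arch MSI-domain helper; the two definitions live in different files:]

    /* Generic default (drivers/pci/pci.c), as in the hunk earlier: */
    int __weak pcibios_device_add(struct pci_dev *dev)
    {
            return 0;
    }

    /* Arch override: a strong definition anywhere in arch code wins
     * over the __weak default at link time. */
    int pcibios_device_add(struct pci_dev *dev)
    {
            dev_set_msi_domain(&dev->dev, foo_arch_msi_domain(dev));
            return 0;
    }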
*/ d = dev_get_msi_domain(&dev->dev); @@ -2518,7 +2553,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) list_add_tail(&dev->bus_list, &bus->devices); up_write(&pci_bus_sem); - ret = pcibios_add_device(dev); + ret = pcibios_device_add(dev); WARN_ON(ret < 0); /* Set up MSI IRQ domain */ @@ -2550,11 +2585,12 @@ struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn) } EXPORT_SYMBOL(pci_scan_single_device); -static unsigned next_fn(struct pci_bus *bus, struct pci_dev *dev, unsigned fn) +static unsigned int next_fn(struct pci_bus *bus, struct pci_dev *dev, + unsigned int fn) { int pos; u16 cap = 0; - unsigned next_fn; + unsigned int next_fn; if (pci_ari_enabled(bus)) { if (!dev) @@ -2613,7 +2649,7 @@ static int only_one_child(struct pci_bus *bus) */ int pci_scan_slot(struct pci_bus *bus, int devfn) { - unsigned fn, nr = 0; + unsigned int fn, nr = 0; struct pci_dev *dev; if (only_one_child(bus) && (devfn > 0)) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index af30ab6c81e2..1f936704cfe2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -501,7 +501,7 @@ static void quirk_s3_64M(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_868, quirk_s3_64M); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_968, quirk_s3_64M); -static void quirk_io(struct pci_dev *dev, int pos, unsigned size, +static void quirk_io(struct pci_dev *dev, int pos, unsigned int size, const char *name) { u32 region; @@ -552,7 +552,7 @@ static void quirk_cs5536_vsa(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, quirk_cs5536_vsa); static void quirk_io_region(struct pci_dev *dev, int port, - unsigned size, int nr, const char *name) + unsigned int size, int nr, const char *name) { u16 region; struct pci_bus_region bus_region; @@ -666,7 +666,7 @@ static void piix4_io_quirk(struct pci_dev *dev, const char *name, unsigned int p base = devres & 0xffff; size = 16; for (;;) { - unsigned bit = size >> 1; + unsigned int bit = size >> 1; if ((bit & mask) == bit) break; size = bit; @@ -692,7 +692,7 @@ static void piix4_mem_quirk(struct pci_dev *dev, const char *name, unsigned int mask = (devres & 0x3f) << 16; size = 128 << 16; for (;;) { - unsigned bit = size >> 1; + unsigned int bit = size >> 1; if ((bit & mask) == bit) break; size = bit; @@ -806,7 +806,7 @@ static void ich6_lpc_acpi_gpio(struct pci_dev *dev) "ICH6 GPIO"); } -static void ich6_lpc_generic_decode(struct pci_dev *dev, unsigned reg, +static void ich6_lpc_generic_decode(struct pci_dev *dev, unsigned int reg, const char *name, int dynsize) { u32 val; @@ -850,7 +850,7 @@ static void quirk_ich6_lpc(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, quirk_ich6_lpc); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, quirk_ich6_lpc); -static void ich7_lpc_generic_decode(struct pci_dev *dev, unsigned reg, +static void ich7_lpc_generic_decode(struct pci_dev *dev, unsigned int reg, const char *name) { u32 val; @@ -2700,7 +2700,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, * then the device can't use INTx interrupts. Tegra's PCIe root ports don't * generate MSI interrupts for PME and AER events instead only INTx interrupts * are generated. 
Though Tegra's PCIe root ports can generate MSI interrupts - * for other events, since PCIe specificiation doesn't support using a mix of + * for other events, since PCIe specification doesn't support using a mix of * INTx and MSI/MSI-X, it is required to disable MSI interrupts to avoid port * service drivers registering their respective ISRs for MSIs. */ @@ -3612,6 +3612,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0033, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0034, quirk_no_bus_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset); /* * Root port on some Cavium CN8xxx chips do not successfully complete a bus @@ -5819,3 +5820,58 @@ static void apex_pci_fixup_class(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_CLASS_HEADER(0x1ac1, 0x089a, PCI_CLASS_NOT_DEFINED, 8, apex_pci_fixup_class); + +/* + * Pericom PI7C9X2G404/PI7C9X2G304/PI7C9X2G303 switch erratum E5 - + * ACS P2P Request Redirect is not functional + * + * When ACS P2P Request Redirect is enabled and bandwidth is not balanced + * between upstream and downstream ports, packets are queued in an internal + * buffer until a CplD (completion) packet arrives. The workaround is to use + * the switch in store and forward mode. + */ +#define PI7C9X2Gxxx_MODE_REG 0x74 +#define PI7C9X2Gxxx_STORE_FORWARD_MODE BIT(0) +static void pci_fixup_pericom_acs_store_forward(struct pci_dev *pdev) +{ + struct pci_dev *upstream; + u16 val; + + /* Downstream ports only */ + if (pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM) + return; + + /* Check for ACS P2P Request Redirect use */ + if (!pdev->acs_cap) + return; + pci_read_config_word(pdev, pdev->acs_cap + PCI_ACS_CTRL, &val); + if (!(val & PCI_ACS_RR)) + return; + + upstream = pci_upstream_bridge(pdev); + if (!upstream) + return; + + pci_read_config_word(upstream, PI7C9X2Gxxx_MODE_REG, &val); + if (!(val & PI7C9X2Gxxx_STORE_FORWARD_MODE)) { + pci_info(upstream, "Setting PI7C9X2Gxxx store-forward mode to avoid ACS erratum\n"); + pci_write_config_word(upstream, PI7C9X2Gxxx_MODE_REG, val | + PI7C9X2Gxxx_STORE_FORWARD_MODE); + } +} +/* + * Apply the fixup on enable and on resume, in order to apply it whenever + * the ACS configuration changes or the switch mode is reset + */ +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2404, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2404, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2304, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2304, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2303, + pci_fixup_pericom_acs_store_forward); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2303, + pci_fixup_pericom_acs_store_forward); diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index 8fc9a4e911e3..e18d3a4383ba 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -85,7 +85,7 @@ static size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, { void __iomem *image; int last_image; - unsigned length; + unsigned int length; image = rom; do { diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 2ce636937c6e..547396ec50b5 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1525,7 +1525,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus, { struct
pci_dev *dev = bus->self; struct resource *r; - unsigned old_flags = 0; + unsigned int old_flags = 0; struct resource *b_res; int idx = 1; diff --git a/drivers/pci/setup-irq.c b/drivers/pci/setup-irq.c index 7129494754dd..cc7d26b015f3 100644 --- a/drivers/pci/setup-irq.c +++ b/drivers/pci/setup-irq.c @@ -8,7 +8,6 @@ * David Miller (davem@redhat.com) */ - #include #include #include @@ -28,25 +27,26 @@ void pci_assign_irq(struct pci_dev *dev) return; } - /* If this device is not on the primary bus, we need to figure out - which interrupt pin it will come in on. We know which slot it - will come in on 'cos that slot is where the bridge is. Each - time the interrupt line passes through a PCI-PCI bridge we must - apply the swizzle function. */ - + /* + * If this device is not on the primary bus, we need to figure out + * which interrupt pin it will come in on. We know which slot it + * will come in on because that slot is where the bridge is. Each + * time the interrupt line passes through a PCI-PCI bridge we must + * apply the swizzle function. + */ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); /* Cope with illegal. */ if (pin > 4) pin = 1; if (pin) { - /* Follow the chain of bridges, swizzling as we go. */ + /* Follow the chain of bridges, swizzling as we go. */ if (hbrg->swizzle_irq) slot = (*(hbrg->swizzle_irq))(dev, &pin); /* - * If a swizzling function is not used map_irq must - * ignore slot + * If a swizzling function is not used, map_irq() must + * ignore slot. */ irq = (*(hbrg->map_irq))(dev, slot, pin); if (irq == -1) @@ -56,7 +56,9 @@ void pci_assign_irq(struct pci_dev *dev) pci_dbg(dev, "assign IRQ: got %d\n", dev->irq); - /* Always tell the device, so the driver knows what is - the real IRQ to use; the device does not use it. */ + /* + * Always tell the device, so the driver knows what is the real IRQ + * to use; the device does not use it. + */ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 0b301f8be9ed..38c2b036fb8e 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -45,6 +45,7 @@ enum mrpc_state { MRPC_QUEUED, MRPC_RUNNING, MRPC_DONE, + MRPC_IO_ERROR, }; struct switchtec_user { @@ -66,6 +67,19 @@ struct switchtec_user { int event_cnt; }; +/* + * The MMIO reads to the device_id register should always return the device ID + * of the device, otherwise the firmware is probably stuck or unreachable + * due to a firmware reset which clears PCI state including the BARs and Memory + * Space Enable bits. 
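+ * For example, if a firmware reset has cleared the Memory Space Enable
+ * bit, the ioread32() below would typically complete as all ones
+ * (0xffffffff), which can never match a 16-bit PCI device ID, so the
+ * comparison fails. (Illustrative note: the all-ones value is the
+ * conventional result of an aborted MMIO read, an assumption rather than
+ * a documented guarantee of this device.)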
+ */ +static int is_firmware_running(struct switchtec_dev *stdev) +{ + u32 device = ioread32(&stdev->mmio_sys_info->device_id); + + return stdev->pdev->device == device; +} + static struct switchtec_user *stuser_create(struct switchtec_dev *stdev) { struct switchtec_user *stuser; @@ -113,6 +127,7 @@ static void stuser_set_state(struct switchtec_user *stuser, [MRPC_QUEUED] = "QUEUED", [MRPC_RUNNING] = "RUNNING", [MRPC_DONE] = "DONE", + [MRPC_IO_ERROR] = "IO_ERROR", }; stuser->state = state; @@ -184,9 +199,26 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser) return 0; } +static void mrpc_cleanup_cmd(struct switchtec_dev *stdev) +{ + /* requires the mrpc_mutex to already be held when called */ + + struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next, + struct switchtec_user, list); + + stuser->cmd_done = true; + wake_up_interruptible(&stuser->cmd_comp); + list_del_init(&stuser->list); + stuser_put(stuser); + stdev->mrpc_busy = 0; + + mrpc_cmd_submit(stdev); +} + static void mrpc_complete_cmd(struct switchtec_dev *stdev) { /* requires the mrpc_mutex to already be held when called */ + struct switchtec_user *stuser; if (list_empty(&stdev->mrpc_queue)) @@ -206,7 +238,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev) stuser_set_state(stuser, MRPC_DONE); stuser->return_code = 0; - if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE) + if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE && + stuser->status != SWITCHTEC_MRPC_STATUS_ERROR) goto out; if (stdev->dma_mrpc) @@ -223,13 +256,7 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev) memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data, stuser->read_len); out: - stuser->cmd_done = true; - wake_up_interruptible(&stuser->cmd_comp); - list_del_init(&stuser->list); - stuser_put(stuser); - stdev->mrpc_busy = 0; - - mrpc_cmd_submit(stdev); + mrpc_cleanup_cmd(stdev); } static void mrpc_event_work(struct work_struct *work) @@ -246,6 +273,23 @@ static void mrpc_event_work(struct work_struct *work) mutex_unlock(&stdev->mrpc_mutex); } +static void mrpc_error_complete_cmd(struct switchtec_dev *stdev) +{ + /* requires the mrpc_mutex to already be held when called */ + + struct switchtec_user *stuser; + + if (list_empty(&stdev->mrpc_queue)) + return; + + stuser = list_entry(stdev->mrpc_queue.next, + struct switchtec_user, list); + + stuser_set_state(stuser, MRPC_IO_ERROR); + + mrpc_cleanup_cmd(stdev); +} + static void mrpc_timeout_work(struct work_struct *work) { struct switchtec_dev *stdev; @@ -257,6 +301,11 @@ static void mrpc_timeout_work(struct work_struct *work) mutex_lock(&stdev->mrpc_mutex); + if (!is_firmware_running(stdev)) { + mrpc_error_complete_cmd(stdev); + goto out; + } + if (stdev->dma_mrpc) status = stdev->dma_mrpc->status; else @@ -327,7 +376,7 @@ static ssize_t field ## _show(struct device *dev, \ return io_string_show(buf, &si->gen4.field, \ sizeof(si->gen4.field)); \ else \ - return -ENOTSUPP; \ + return -EOPNOTSUPP; \ } \ \ static DEVICE_ATTR_RO(field) @@ -544,6 +593,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data, if (rc) return rc; + if (stuser->state == MRPC_IO_ERROR) { + mutex_unlock(&stdev->mrpc_mutex); + return -EIO; + } + if (stuser->state != MRPC_DONE) { mutex_unlock(&stdev->mrpc_mutex); return -EBADE; @@ -569,7 +623,8 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data, out: mutex_unlock(&stdev->mrpc_mutex); - if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE) + if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE || + stuser->status == 
SWITCHTEC_MRPC_STATUS_ERROR) return size; else if (stuser->status == SWITCHTEC_MRPC_STATUS_INTERRUPTED) return -ENXIO; @@ -613,7 +668,7 @@ static int ioctl_flash_info(struct switchtec_dev *stdev, info.flash_length = ioread32(&fi->gen4.flash_length); info.num_partitions = SWITCHTEC_NUM_PARTITIONS_GEN4; } else { - return -ENOTSUPP; + return -EOPNOTSUPP; } if (copy_to_user(uinfo, &info, sizeof(info))) @@ -821,7 +876,7 @@ static int ioctl_flash_part_info(struct switchtec_dev *stdev, if (ret) return ret; } else { - return -ENOTSUPP; + return -EOPNOTSUPP; } if (copy_to_user(uinfo, &info, sizeof(info))) @@ -969,6 +1024,9 @@ static int event_ctl(struct switchtec_dev *stdev, return PTR_ERR(reg); hdr = ioread32(reg); + if (hdr & SWITCHTEC_EVENT_NOT_SUPP) + return -EOPNOTSUPP; + for (i = 0; i < ARRAY_SIZE(ctl->data); i++) ctl->data[i] = ioread32(®[i + 1]); @@ -1041,7 +1099,7 @@ static int ioctl_event_ctl(struct switchtec_dev *stdev, for (ctl.index = 0; ctl.index < nr_idxs; ctl.index++) { ctl.flags = event_flags; ret = event_ctl(stdev, &ctl); - if (ret < 0) + if (ret < 0 && ret != -EOPNOTSUPP) return ret; } } else { @@ -1078,7 +1136,7 @@ static int ioctl_pff_to_port(struct switchtec_dev *stdev, break; } - reg = ioread32(&pcfg->vep_pff_inst_id); + reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF; if (reg == p.pff) { p.port = SWITCHTEC_IOCTL_PFF_VEP; break; @@ -1124,7 +1182,7 @@ static int ioctl_port_to_pff(struct switchtec_dev *stdev, p.pff = ioread32(&pcfg->usp_pff_inst_id); break; case SWITCHTEC_IOCTL_PFF_VEP: - p.pff = ioread32(&pcfg->vep_pff_inst_id); + p.pff = ioread32(&pcfg->vep_pff_inst_id) & 0xFF; break; default: if (p.port > ARRAY_SIZE(pcfg->dsp_pff_inst_id)) @@ -1348,6 +1406,9 @@ static int mask_event(struct switchtec_dev *stdev, int eid, int idx) hdr_reg = event_regs[eid].map_reg(stdev, off, idx); hdr = ioread32(hdr_reg); + if (hdr & SWITCHTEC_EVENT_NOT_SUPP) + return 0; + if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ)) return 0; @@ -1498,7 +1559,7 @@ static void init_pff(struct switchtec_dev *stdev) if (reg < stdev->pff_csr_count) stdev->pff_local[reg] = 1; - reg = ioread32(&pcfg->vep_pff_inst_id); + reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF; if (reg < stdev->pff_csr_count) stdev->pff_local[reg] = 1; @@ -1556,7 +1617,7 @@ static int switchtec_init_pci(struct switchtec_dev *stdev, else if (stdev->gen == SWITCHTEC_GEN4) part_id = &stdev->mmio_sys_info->gen4.partition_id; else - return -ENOTSUPP; + return -EOPNOTSUPP; stdev->partition = ioread8(part_id); stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count); diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 4be24890132e..a4fc4d0690fe 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -57,10 +57,7 @@ static size_t pci_vpd_size(struct pci_dev *dev) size_t off = 0, size; unsigned char tag, header[1+2]; /* 1 byte tag, 2 bytes length */ - /* Otherwise the following reads would fail. 
*/ - dev->vpd.len = PCI_VPD_MAX_SIZE; - - while (pci_read_vpd(dev, off, 1, header) == 1) { + while (pci_read_vpd_any(dev, off, 1, header) == 1) { size = 0; if (off == 0 && (header[0] == 0x00 || header[0] == 0xff)) @@ -68,7 +65,7 @@ static size_t pci_vpd_size(struct pci_dev *dev) if (header[0] & PCI_VPD_LRDT) { /* Large Resource Data Type Tag */ - if (pci_read_vpd(dev, off + 1, 2, &header[1]) != 2) { + if (pci_read_vpd_any(dev, off + 1, 2, &header[1]) != 2) { pci_warn(dev, "failed VPD read at offset %zu\n", off + 1); return off ?: PCI_VPD_SZ_INVALID; @@ -99,14 +96,14 @@ error: return off ?: PCI_VPD_SZ_INVALID; } -static bool pci_vpd_available(struct pci_dev *dev) +static bool pci_vpd_available(struct pci_dev *dev, bool check_size) { struct pci_vpd *vpd = &dev->vpd; if (!vpd->cap) return false; - if (vpd->len == 0) { + if (vpd->len == 0 && check_size) { vpd->len = pci_vpd_size(dev); if (vpd->len == PCI_VPD_SZ_INVALID) { vpd->cap = 0; @@ -156,24 +153,27 @@ static int pci_vpd_wait(struct pci_dev *dev, bool set) } static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, - void *arg) + void *arg, bool check_size) { struct pci_vpd *vpd = &dev->vpd; + unsigned int max_len; int ret = 0; loff_t end = pos + count; u8 *buf = arg; - if (!pci_vpd_available(dev)) + if (!pci_vpd_available(dev, check_size)) return -ENODEV; if (pos < 0) return -EINVAL; - if (pos > vpd->len) + max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE; + + if (pos >= max_len) return 0; - if (end > vpd->len) { - end = vpd->len; + if (end > max_len) { + end = max_len; count = end - pos; } @@ -217,20 +217,23 @@ static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count, } static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count, - const void *arg) + const void *arg, bool check_size) { struct pci_vpd *vpd = &dev->vpd; + unsigned int max_len; const u8 *buf = arg; loff_t end = pos + count; int ret = 0; - if (!pci_vpd_available(dev)) + if (!pci_vpd_available(dev, check_size)) return -ENODEV; if (pos < 0 || (pos & 3) || (count & 3)) return -EINVAL; - if (end > vpd->len) + max_len = check_size ? 
vpd->len : PCI_VPD_MAX_SIZE; + + if (end > max_len) return -EINVAL; if (mutex_lock_killable(&vpd->lock)) @@ -313,7 +316,7 @@ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size) void *buf; int cnt; - if (!pci_vpd_available(dev)) + if (!pci_vpd_available(dev, true)) return ERR_PTR(-ENODEV); len = dev->vpd.len; @@ -381,6 +384,24 @@ static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, return -ENOENT; } +static ssize_t __pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf, + bool check_size) +{ + ssize_t ret; + + if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) { + dev = pci_get_func0_dev(dev); + if (!dev) + return -ENODEV; + + ret = pci_vpd_read(dev, pos, count, buf, check_size); + pci_dev_put(dev); + return ret; + } + + return pci_vpd_read(dev, pos, count, buf, check_size); +} + /** * pci_read_vpd - Read one entry from Vital Product Data * @dev: PCI device struct @@ -389,6 +410,20 @@ static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, * @buf: pointer to where to store result */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf) +{ + return __pci_read_vpd(dev, pos, count, buf, true); +} +EXPORT_SYMBOL(pci_read_vpd); + +/* Same, but allow to access any address */ +ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf) +{ + return __pci_read_vpd(dev, pos, count, buf, false); +} +EXPORT_SYMBOL(pci_read_vpd_any); + +static ssize_t __pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, + const void *buf, bool check_size) { ssize_t ret; @@ -397,14 +432,13 @@ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf) if (!dev) return -ENODEV; - ret = pci_vpd_read(dev, pos, count, buf); + ret = pci_vpd_write(dev, pos, count, buf, check_size); pci_dev_put(dev); return ret; } - return pci_vpd_read(dev, pos, count, buf); + return pci_vpd_write(dev, pos, count, buf, check_size); } -EXPORT_SYMBOL(pci_read_vpd); /** * pci_write_vpd - Write entry to Vital Product Data @@ -415,22 +449,17 @@ EXPORT_SYMBOL(pci_read_vpd); */ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf) { - ssize_t ret; - - if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) { - dev = pci_get_func0_dev(dev); - if (!dev) - return -ENODEV; - - ret = pci_vpd_write(dev, pos, count, buf); - pci_dev_put(dev); - return ret; - } - - return pci_vpd_write(dev, pos, count, buf); + return __pci_write_vpd(dev, pos, count, buf, true); } EXPORT_SYMBOL(pci_write_vpd); +/* Same, but allow to access any address */ +ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf) +{ + return __pci_write_vpd(dev, pos, count, buf, false); +} +EXPORT_SYMBOL(pci_write_vpd_any); + int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, const char *kw, unsigned int *size) { diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 2156c632524d..d858d25b6cab 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -588,61 +588,43 @@ static pci_ers_result_t pcifront_common_process(int cmd, struct pcifront_device *pdev, pci_channel_state_t state) { - pci_ers_result_t result; struct pci_driver *pdrv; int bus = pdev->sh_info->aer_op.bus; int devfn = pdev->sh_info->aer_op.devfn; int domain = pdev->sh_info->aer_op.domain; struct pci_dev *pcidev; - int flag = 0; dev_dbg(&pdev->xdev->dev, "pcifront AER process: cmd %x (bus:%x, devfn%x)", cmd, bus, devfn); - result = PCI_ERS_RESULT_NONE; pcidev = pci_get_domain_bus_and_slot(domain, 
bus, devfn); - if (!pcidev || !pcidev->driver) { + if (!pcidev || !pcidev->dev.driver) { dev_err(&pdev->xdev->dev, "device or AER driver is NULL\n"); pci_dev_put(pcidev); - return result; + return PCI_ERS_RESULT_NONE; } - pdrv = pcidev->driver; + pdrv = to_pci_driver(pcidev->dev.driver); - if (pdrv) { - if (pdrv->err_handler && pdrv->err_handler->error_detected) { - pci_dbg(pcidev, "trying to call AER service\n"); - if (pcidev) { - flag = 1; - switch (cmd) { - case XEN_PCI_OP_aer_detected: - result = pdrv->err_handler-> - error_detected(pcidev, state); - break; - case XEN_PCI_OP_aer_mmio: - result = pdrv->err_handler-> - mmio_enabled(pcidev); - break; - case XEN_PCI_OP_aer_slotreset: - result = pdrv->err_handler-> - slot_reset(pcidev); - break; - case XEN_PCI_OP_aer_resume: - pdrv->err_handler->resume(pcidev); - break; - default: - dev_err(&pdev->xdev->dev, - "bad request in aer recovery " - "operation!\n"); - - } - } + if (pdrv->err_handler && pdrv->err_handler->error_detected) { + pci_dbg(pcidev, "trying to call AER service\n"); + switch (cmd) { + case XEN_PCI_OP_aer_detected: + return pdrv->err_handler->error_detected(pcidev, state); + case XEN_PCI_OP_aer_mmio: + return pdrv->err_handler->mmio_enabled(pcidev); + case XEN_PCI_OP_aer_slotreset: + return pdrv->err_handler->slot_reset(pcidev); + case XEN_PCI_OP_aer_resume: + pdrv->err_handler->resume(pcidev); + return PCI_ERS_RESULT_NONE; + default: + dev_err(&pdev->xdev->dev, + "bad request in aer recovery operation!\n"); } } - if (!flag) - result = PCI_ERS_RESULT_NONE; - return result; + return PCI_ERS_RESULT_NONE; } diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 94331d999d27..7df466e22282 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -965,6 +965,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) struct rio_transfer_io *transfer; enum dma_data_direction dir; int i, ret = 0; + size_t size; if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction)))) return -EFAULT; @@ -976,13 +977,14 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) priv->md->properties.transfer_mode) == 0) return -ENODEV; - transfer = vmalloc(array_size(sizeof(*transfer), transaction.count)); + size = array_size(sizeof(*transfer), transaction.count); + transfer = vmalloc(size); if (!transfer) return -ENOMEM; if (unlikely(copy_from_user(transfer, (void __user *)(uintptr_t)transaction.block, - array_size(sizeof(*transfer), transaction.count)))) { + size))) { ret = -EFAULT; goto out_free; } @@ -994,8 +996,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) transaction.sync, dir, &transfer[i]); if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block, - transfer, - array_size(sizeof(*transfer), transaction.count)))) + transfer, size))) ret = -EFAULT; out_free: diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 3a6f3af240fa..a7a33ebf4bbe 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -34,7 +34,7 @@ int dasd_gendisk_alloc(struct dasd_block *block) { struct gendisk *gdp; struct dasd_device *base; - int len; + int len, rc; /* Make sure the minor for this device exists. 
*/ base = block->base; @@ -80,7 +80,13 @@ int dasd_gendisk_alloc(struct dasd_block *block) dasd_add_link_to_gendisk(gdp, base); block->gdp = gdp; set_capacity(block->gdp, 0); - device_add_disk(&base->cdev->dev, block->gdp, NULL); + + rc = device_add_disk(&base->cdev->dev, block->gdp, NULL); + if (rc) { + dasd_gendisk_free(block); + return rc; + } + return 0; } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 59e513d34b0f..27ab888b44d0 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -696,7 +696,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } get_device(&dev_info->dev); - device_add_disk(&dev_info->dev, dev_info->gd, NULL); + rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL); + if (rc) + goto out_dax; switch (dev_info->segment_type) { case SEG_TYPE_SR: @@ -712,6 +714,10 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char rc = count; goto out; +out_dax: + put_device(&dev_info->dev); + kill_dax(dev_info->dax_dev); + put_dax(dev_info->dax_dev); put_dev: list_del(&dev_info->lh); blk_cleanup_disk(dev_info->gd); diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 88cba6212ee2..61ecdcb2cc6a 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -495,9 +495,14 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) /* 512 byte sectors */ set_capacity(bdev->gendisk, scmdev->size >> 9); - device_add_disk(&scmdev->dev, bdev->gendisk, NULL); + ret = device_add_disk(&scmdev->dev, bdev->gendisk, NULL); + if (ret) + goto out_cleanup_disk; + return 0; +out_cleanup_disk: + blk_cleanup_disk(bdev->gendisk); out_tag: blk_mq_free_tag_set(&bdev->tag_set); out: diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 2cf7fe131ece..f0763e36b861 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -163,7 +163,7 @@ static inline void sclp_trace_req(int prio, char *id, struct sclp_req *req, summary.timeout = (u16)req->queue_timeout; summary.start_count = (u16)req->start_count; - sclp_trace(prio, id, (u32)(addr_t)sccb, summary.b, err); + sclp_trace(prio, id, __pa(sccb), summary.b, err); } static inline void sclp_trace_register(int prio, char *id, u32 a, u64 b, @@ -502,7 +502,7 @@ sclp_add_request(struct sclp_req *req) } /* RQAD: Request was added (a=sccb, b=caller) */ - sclp_trace(2, "RQAD", (u32)(addr_t)req->sccb, _RET_IP_, false); + sclp_trace(2, "RQAD", __pa(req->sccb), _RET_IP_, false); req->status = SCLP_REQ_QUEUED; req->start_count = 0; @@ -617,15 +617,15 @@ __sclp_find_req(u32 sccb) list_for_each(l, &sclp_req_queue) { req = list_entry(l, struct sclp_req, list); - if (sccb == (u32) (addr_t) req->sccb) - return req; + if (sccb == __pa(req->sccb)) + return req; } return NULL; } static bool ok_response(u32 sccb_int, sclp_cmdw_t cmd) { - struct sccb_header *sccb = (struct sccb_header *)(addr_t)sccb_int; + struct sccb_header *sccb = (struct sccb_header *)__va(sccb_int); struct evbuf_header *evbuf; u16 response; @@ -664,7 +664,7 @@ static void sclp_interrupt_handler(struct ext_code ext_code, /* INT: Interrupt received (a=intparm, b=cmd) */ sclp_trace_sccb(0, "INT", param32, active_cmd, active_cmd, - (struct sccb_header *)(addr_t)finished_sccb, + (struct sccb_header *)__va(finished_sccb), !ok_response(finished_sccb, active_cmd)); if (finished_sccb) { @@ -1110,7 +1110,7 @@ static void sclp_check_handler(struct ext_code ext_code, /* Is this the interrupt we are waiting for? 
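 * (The interrupt parameter carries the physical address of the SCCB,
 * hence the __pa() conversion in the comparison below; a plain cast of
 * the virtual sclp_init_sccb pointer would break once kernel virtual and
 * physical addresses differ.)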
*/ if (finished_sccb == 0) return; - if (finished_sccb != (u32) (addr_t) sclp_init_sccb) + if (finished_sccb != __pa(sclp_init_sccb)) panic("sclp: unsolicited interrupt for buffer at 0x%x\n", finished_sccb); spin_lock(&sclp_lock); diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 5e434108aae6..8a30e77db469 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -333,7 +333,7 @@ static inline int sclp_service_call(sclp_cmdw_t command, void *sccb) "2:\n" EX_TABLE(0b, 2b) EX_TABLE(1b, 2b) - : "+&d" (cc) : "d" (command), "a" ((unsigned long)sccb) + : "+&d" (cc) : "d" (command), "a" (__pa(sccb)) : "cc", "memory"); if (cc == 4) return -EINVAL; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index f3d5c7f4c13d..b64feab62caa 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - memblock_free_early((unsigned long)sccb, length); + memblock_phys_free((unsigned long)sccb, length); return rc; } @@ -155,6 +155,11 @@ static void __init sclp_early_console_detect(struct init_sccb *sccb) sclp.has_linemode = 1; } +void __init sclp_early_adjust_va(void) +{ + sclp_early_sccb = __va((unsigned long)sclp_early_sccb); +} + void __init sclp_early_detect(void) { void *sccb = sclp_early_sccb; diff --git a/drivers/s390/char/sclp_ftp.c b/drivers/s390/char/sclp_ftp.c index 1e9de99dcd02..ec5a0e2b9255 100644 --- a/drivers/s390/char/sclp_ftp.c +++ b/drivers/s390/char/sclp_ftp.c @@ -31,6 +31,8 @@ static u64 sclp_ftp_length; /** * sclp_ftp_txcb() - Diagnostic Test FTP services SCLP command callback + * @req: sclp request + * @data: pointer to struct completion */ static void sclp_ftp_txcb(struct sclp_req *req, void *data) { @@ -45,6 +47,7 @@ static void sclp_ftp_txcb(struct sclp_req *req, void *data) /** * sclp_ftp_rxcb() - Diagnostic Test FTP services receiver event callback + * @evbuf: pointer to Diagnostic Test (ET7) event buffer */ static void sclp_ftp_rxcb(struct evbuf_header *evbuf) { diff --git a/drivers/s390/char/sclp_sd.c b/drivers/s390/char/sclp_sd.c index 1e244f78f192..25c2d760f6e6 100644 --- a/drivers/s390/char/sclp_sd.c +++ b/drivers/s390/char/sclp_sd.c @@ -122,6 +122,7 @@ static void sclp_sd_listener_remove(struct sclp_sd_listener *listener) /** * sclp_sd_listener_init() - Initialize a Store Data response listener + * @listener: Response listener to initialize * @id: Event ID to listen for * * Initialize a listener for asynchronous Store Data responses. This listener @@ -193,7 +194,7 @@ static int sclp_sd_sync(unsigned long page, u8 eq, u8 di, u64 sat, u64 sa, struct sclp_sd_evbuf *evbuf; int rc; - sclp_sd_listener_init(&listener, (u32) (addr_t) sccb); + sclp_sd_listener_init(&listener, __pa(sccb)); sclp_sd_listener_add(&listener); /* Prepare SCCB */ @@ -403,6 +404,7 @@ static int sclp_sd_file_update(struct sclp_sd_file *sd_file) /** * sclp_sd_file_update_async() - Wrapper for asynchronous update call * @data: Object to update + * @cookie: Unused */ static void sclp_sd_file_update_async(void *data, async_cookie_t cookie) { @@ -414,6 +416,9 @@ static void sclp_sd_file_update_async(void *data, async_cookie_t cookie) /** * reload_store() - Store function for "reload" sysfs attribute * @kobj: Kobject of sclp_sd_file object + * @attr: Reload attribute + * @buf: Data written to sysfs attribute + * @count: Count of bytes written * * Initiate a reload of the data associated with an sclp_sd_file object. 
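 *
 * Writing to the attribute only triggers the update; the refreshed
 * content is then read back through the file's "data" binary attribute
 * (see data_read() below).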
*/ @@ -441,8 +446,10 @@ static struct kobj_type sclp_sd_file_ktype = { }; /** - * data_read() - Read function for "read" sysfs attribute + * data_read() - Read function for "data" sysfs attribute + * @file: Open file pointer * @kobj: Kobject of sclp_sd_file object + * @attr: Data attribute * @buffer: Target buffer * @off: Requested file offset * @size: Requested number of bytes diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 29a6a0099f83..7bc4e4a10937 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -768,6 +768,8 @@ out_driver: } __initcall(sclp_vt220_tty_init); +#ifdef CONFIG_SCLP_VT220_CONSOLE + static void __sclp_vt220_flush_buffer(void) { unsigned long flags; @@ -784,8 +786,6 @@ static void __sclp_vt220_flush_buffer(void) spin_unlock_irqrestore(&sclp_vt220_lock, flags); } -#ifdef CONFIG_SCLP_VT220_CONSOLE - static void sclp_vt220_con_write(struct console *con, const char *buf, unsigned int count) { diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 44461928aab8..2bc55ccf3f23 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -792,10 +792,13 @@ static int __unset_online(struct device *dev, void *data) { struct idset *set = data; struct subchannel *sch = to_subchannel(dev); - struct ccw_device *cdev = sch_get_cdev(sch); + struct ccw_device *cdev; - if (cdev && cdev->online) - idset_sch_del(set, sch->schid); + if (sch->st == SUBCHANNEL_TYPE_IO) { + cdev = sch_get_cdev(sch); + if (cdev && cdev->online) + idset_sch_del(set, sch->schid); + } return 0; } diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 8d14569823d7..07a17613fab5 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1322,6 +1322,7 @@ static int purge_fn(struct device *dev, void *data) { struct ccw_device *cdev = to_ccwdev(dev); struct ccw_dev_id *id = &cdev->private->dev_id; + struct subchannel *sch = to_subchannel(cdev->dev.parent); spin_lock_irq(cdev->ccwlock); if (is_blacklisted(id->ssid, id->devno) && @@ -1330,6 +1331,7 @@ static int purge_fn(struct device *dev, void *data) CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", id->ssid, id->devno); ccw_device_sched_todo(cdev, CDEV_TODO_UNREG); + css_sched_sch_todo(sch, SCH_TODO_UNREG); atomic_set(&cdev->private->onoff, 0); } spin_unlock_irq(cdev->ccwlock); diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index 0fe7b2f2e7f5..c533d1dadc6b 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -825,13 +825,23 @@ EXPORT_SYMBOL_GPL(ccw_device_get_chid); */ void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size) { - return cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); + void *addr; + + if (!get_device(&cdev->dev)) + return NULL; + addr = cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size); + if (IS_ERR_OR_NULL(addr)) + put_device(&cdev->dev); + return addr; } EXPORT_SYMBOL(ccw_device_dma_zalloc); void ccw_device_dma_free(struct ccw_device *cdev, void *cpu_addr, size_t size) { + if (!cpu_addr) + return; cio_gp_dma_free(cdev->private->dma_pool, cpu_addr, size); + put_device(&cdev->dev); } EXPORT_SYMBOL(ccw_device_dma_free); diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index d9b804943d19..1986243f9cd3 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -61,6 +61,10 @@ static char *aqm_str; module_param_named(aqmask, aqm_str, charp, 0440); MODULE_PARM_DESC(aqmask, "AP bus domain mask."); +static int 
ap_useirq = 1; +module_param_named(useirq, ap_useirq, int, 0440); +MODULE_PARM_DESC(useirq, "Use interrupt if available, default is 1 (on)."); + atomic_t ap_max_msg_size = ATOMIC_INIT(AP_DEFAULT_MAX_MSG_SIZE); EXPORT_SYMBOL(ap_max_msg_size); @@ -725,7 +729,7 @@ static void ap_check_bindings_complete(void) if (bound == apqns) { if (!completion_done(&ap_init_apqn_bindings_complete)) { complete_all(&ap_init_apqn_bindings_complete); - AP_DBF(DBF_INFO, "%s complete\n", __func__); + AP_DBF_INFO("%s complete\n", __func__); } ap_send_bindings_complete_uevent(); } @@ -786,9 +790,12 @@ static int __ap_revise_reserved(struct device *dev, void *dummy) drvres = to_ap_drv(dev->driver)->flags & AP_DRIVER_FLAG_DEFAULT; if (!!devres != !!drvres) { - AP_DBF_DBG("reprobing queue=%02x.%04x\n", - card, queue); + AP_DBF_DBG("%s reprobing queue=%02x.%04x\n", + __func__, card, queue); rc = device_reprobe(dev); + if (rc) + AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n", + __func__, card, queue); } } @@ -1118,7 +1125,8 @@ static ssize_t ap_domain_store(struct bus_type *bus, ap_domain_index = domain; spin_unlock_bh(&ap_domain_lock); - AP_DBF_INFO("stored new default domain=%d\n", domain); + AP_DBF_INFO("%s stored new default domain=%d\n", + __func__, domain); return count; } @@ -1433,8 +1441,9 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) /* < CEX2A is not supported */ if (rawtype < AP_DEVICE_TYPE_CEX2A) { - AP_DBF_WARN("get_comp_type queue=%02x.%04x unsupported type %d\n", - AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype); + AP_DBF_WARN("%s queue=%02x.%04x unsupported type %d\n", + __func__, AP_QID_CARD(qid), + AP_QID_QUEUE(qid), rawtype); return 0; } /* up to CEX7 known and fully supported */ @@ -1458,11 +1467,12 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) comp_type = apinfo.cat; } if (!comp_type) - AP_DBF_WARN("get_comp_type queue=%02x.%04x unable to map type %d\n", - AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype); + AP_DBF_WARN("%s queue=%02x.%04x unable to map type %d\n", + __func__, AP_QID_CARD(qid), + AP_QID_QUEUE(qid), rawtype); else if (comp_type != rawtype) - AP_DBF_INFO("get_comp_type queue=%02x.%04x map type %d to %d\n", - AP_QID_CARD(qid), AP_QID_QUEUE(qid), + AP_DBF_INFO("%s queue=%02x.%04x map type %d to %d\n", + __func__, AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype, comp_type); return comp_type; } @@ -1535,7 +1545,7 @@ static inline void ap_scan_domains(struct ap_card *ac) aq = dev ? 
to_ap_queue(dev) : NULL; if (!ap_test_config_usage_domain(dom)) { if (dev) { - AP_DBF_INFO("%s(%d,%d) not in config any more, rm queue device\n", + AP_DBF_INFO("%s(%d,%d) not in config anymore, rm queue dev\n", __func__, ac->id, dom); device_unregister(dev); put_device(dev); @@ -1545,9 +1555,8 @@ static inline void ap_scan_domains(struct ap_card *ac) /* domain is valid, get info from this APQN */ if (!ap_queue_info(qid, &type, &func, &depth, &ml, &decfg)) { if (aq) { - AP_DBF_INFO( - "%s(%d,%d) ap_queue_info() not successful, rm queue device\n", - __func__, ac->id, dom); + AP_DBF_INFO("%s(%d,%d) queue_info() failed, rm queue dev\n", + __func__, ac->id, dom); device_unregister(dev); put_device(dev); } @@ -1577,10 +1586,10 @@ static inline void ap_scan_domains(struct ap_card *ac) /* get it and thus adjust reference counter */ get_device(dev); if (decfg) - AP_DBF_INFO("%s(%d,%d) new (decfg) queue device created\n", + AP_DBF_INFO("%s(%d,%d) new (decfg) queue dev created\n", __func__, ac->id, dom); else - AP_DBF_INFO("%s(%d,%d) new queue device created\n", + AP_DBF_INFO("%s(%d,%d) new queue dev created\n", __func__, ac->id, dom); goto put_dev_and_continue; } @@ -1594,7 +1603,7 @@ static inline void ap_scan_domains(struct ap_card *ac) aq->last_err_rc = AP_RESPONSE_DECONFIGURED; } spin_unlock_bh(&aq->lock); - AP_DBF_INFO("%s(%d,%d) queue device config off\n", + AP_DBF_INFO("%s(%d,%d) queue dev config off\n", __func__, ac->id, dom); ap_send_config_uevent(&aq->ap_dev, aq->config); /* 'receive' pending messages with -EAGAIN */ @@ -1609,7 +1618,7 @@ static inline void ap_scan_domains(struct ap_card *ac) aq->sm_state = AP_SM_STATE_RESET_START; } spin_unlock_bh(&aq->lock); - AP_DBF_INFO("%s(%d,%d) queue device config on\n", + AP_DBF_INFO("%s(%d,%d) queue dev config on\n", __func__, ac->id, dom); ap_send_config_uevent(&aq->ap_dev, aq->config); goto put_dev_and_continue; @@ -1621,7 +1630,7 @@ static inline void ap_scan_domains(struct ap_card *ac) ap_flush_queue(aq); /* re-init (with reset) the queue device */ ap_queue_init_state(aq); - AP_DBF_INFO("%s(%d,%d) queue device reinit enforced\n", + AP_DBF_INFO("%s(%d,%d) queue dev reinit enforced\n", __func__, ac->id, dom); goto put_dev_and_continue; } @@ -1653,7 +1662,7 @@ static inline void ap_scan_adapter(int ap) /* Adapter not in configuration ? 
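 * (That is, the adapter's bit is off in the AP configuration. If a card
 * device still exists for this adapter, it is removed below together
 * with all of its queue devices via ap_scan_rm_card_dev_and_queue_devs().)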
*/ if (!ap_test_config_card_id(ap)) { if (ac) { - AP_DBF_INFO("%s(%d) ap not in config any more, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) ap not in config any more, rm card and queue devs\n", __func__, ap); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); @@ -1678,9 +1687,8 @@ static inline void ap_scan_adapter(int ap) if (dom > ap_max_domain_id) { /* Could not find a valid APQN for this adapter */ if (ac) { - AP_DBF_INFO( - "%s(%d) no type info (no APQN found), rm card and queue devices\n", - __func__, ap); + AP_DBF_INFO("%s(%d) no type info (no APQN found), rm card and queue devs\n", + __func__, ap); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); } else { @@ -1692,7 +1700,7 @@ static inline void ap_scan_adapter(int ap) if (!type) { /* No adapter type info available, an unusable adapter */ if (ac) { - AP_DBF_INFO("%s(%d) no valid type (0) info, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) no valid type (0) info, rm card and queue devs\n", __func__, ap); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); @@ -1706,13 +1714,13 @@ static inline void ap_scan_adapter(int ap) if (ac) { /* Check APQN against existing card device for changes */ if (ac->raw_hwtype != type) { - AP_DBF_INFO("%s(%d) hwtype %d changed, rm card and queue devices\n", + AP_DBF_INFO("%s(%d) hwtype %d changed, rm card and queue devs\n", __func__, ap, type); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); ac = NULL; } else if (ac->functions != func) { - AP_DBF_INFO("%s(%d) functions 0x%08x changed, rm card and queue devices\n", - __func__, ap, type); + AP_DBF_INFO("%s(%d) functions 0x%08x changed, rm card and queue devs\n", + __func__, ap, func); ap_scan_rm_card_dev_and_queue_devs(ac); put_device(dev); @@ -1720,13 +1728,13 @@ } else { if (decfg && ac->config) { ac->config = false; - AP_DBF_INFO("%s(%d) card device config off\n", + AP_DBF_INFO("%s(%d) card dev config off\n", __func__, ap); ap_send_config_uevent(&ac->ap_dev, ac->config); } if (!decfg && !ac->config) { ac->config = true; - AP_DBF_INFO("%s(%d) card device config on\n", + AP_DBF_INFO("%s(%d) card dev config on\n", __func__, ap); ap_send_config_uevent(&ac->ap_dev, ac->config); } @@ -1756,7 +1764,8 @@ if (ac->maxmsgsize > atomic_read(&ap_max_msg_size)) { atomic_set(&ap_max_msg_size, ac->maxmsgsize); AP_DBF_INFO("%s(%d) ap_max_msg_size update to %d byte\n", - __func__, ap, atomic_read(&ap_max_msg_size)); + __func__, ap, + atomic_read(&ap_max_msg_size)); } /* Register the new card device with AP bus */ rc = device_register(dev); @@ -1769,10 +1778,10 @@ /* get it and thus adjust reference counter */ get_device(dev); if (decfg) - AP_DBF_INFO("%s(%d) new (decfg) card device type=%d func=0x%08x created\n", + AP_DBF_INFO("%s(%d) new (decfg) card dev type=%d func=0x%08x created\n", __func__, ap, type, func); else - AP_DBF_INFO("%s(%d) new card device type=%d func=0x%08x created\n", + AP_DBF_INFO("%s(%d) new card dev type=%d func=0x%08x created\n", __func__, ap, type, func); } @@ -1810,12 +1819,12 @@ static void ap_scan_bus(struct work_struct *unused) if (dev) put_device(dev); else - AP_DBF_INFO("no queue device with default domain %d available\n", - ap_domain_index); + AP_DBF_INFO("%s no queue device with default domain %d available\n", + __func__, ap_domain_index); } if (atomic64_inc_return(&ap_scan_bus_count) == 1) { - AP_DBF(DBF_DEBUG, "%s init scan complete\n", __func__); + AP_DBF_DBG("%s init scan complete\n",
__func__); ap_send_init_scan_done_uevent(); ap_check_bindings_complete(); } @@ -1830,7 +1839,7 @@ static void ap_config_timeout(struct timer_list *unused) static int __init ap_debug_init(void) { - ap_dbf_info = debug_register("ap", 1, 1, + ap_dbf_info = debug_register("ap", 2, 1, DBF_MAX_SPRINTF_ARGS * sizeof(long)); debug_register_view(ap_dbf_info, &debug_sprintf_view); debug_set_level(ap_dbf_info, DBF_ERR); @@ -1897,7 +1906,7 @@ static int __init ap_module_init(void) } /* enable interrupts if available */ - if (ap_interrupts_available()) { + if (ap_interrupts_available() && ap_useirq) { rc = register_adapter_interrupt(&ap_airq); ap_irq_flag = (rc == 0); } diff --git a/drivers/s390/crypto/ap_debug.h b/drivers/s390/crypto/ap_debug.h index 34b0350d0b1a..c083ce88a9a6 100644 --- a/drivers/s390/crypto/ap_debug.h +++ b/drivers/s390/crypto/ap_debug.h @@ -16,7 +16,7 @@ #define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO) #define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO) -#define DBF_MAX_SPRINTF_ARGS 5 +#define DBF_MAX_SPRINTF_ARGS 6 #define AP_DBF(...) \ debug_sprintf_event(ap_dbf_info, ##__VA_ARGS__) diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 9ea48bf0ee40..1901449768dd 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -157,6 +157,8 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq) switch (status.response_code) { case AP_RESPONSE_NORMAL: aq->queue_count = max_t(int, 0, aq->queue_count - 1); + if (!status.queue_empty && !aq->queue_count) + aq->queue_count++; if (aq->queue_count > 0) mod_timer(&aq->timeout, jiffies + aq->request_timeout); @@ -246,6 +248,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq) if (aq->requestq_count <= 0) return AP_SM_WAIT_NONE; + /* Start the next request on the queue. */ ap_msg = list_entry(aq->requestq.next, struct ap_message, list); #ifdef CONFIG_ZCRYPT_DEBUG @@ -279,7 +282,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq) aq->sm_state = AP_SM_STATE_RESET_WAIT; return AP_SM_WAIT_TIMEOUT; case AP_RESPONSE_INVALID_DOMAIN: - AP_DBF(DBF_WARN, "AP_RESPONSE_INVALID_DOMAIN on NQAP\n"); + AP_DBF_WARN("%s RESPONSE_INVALID_DOMAIN on NQAP\n", __func__); fallthrough; case AP_RESPONSE_MESSAGE_TOO_BIG: case AP_RESPONSE_REQ_FAC_NOT_INST: @@ -571,8 +574,8 @@ static ssize_t reset_store(struct device *dev, ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL)); spin_unlock_bh(&aq->lock); - AP_DBF(DBF_INFO, "reset queue=%02x.%04x triggered by user\n", - AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid)); + AP_DBF_INFO("%s reset queue=%02x.%04x triggered by user\n", + __func__, AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid)); return count; } diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index 4d2556bc7fe5..03311a476366 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -42,10 +42,13 @@ static struct ap_device_id ap_queue_ids[] = { MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids); /** - * vfio_ap_queue_dev_probe: + * vfio_ap_queue_dev_probe: Allocate a vfio_ap_queue structure and associate it + * with the device as driver_data. * - * Allocate a vfio_ap_queue structure and associate it - * with the device as driver_data. + * @apdev: the AP device being probed + * + * Return: returns 0 if the probe succeeded; otherwise, returns -ENOMEM if + * storage could not be allocated for a vfio_ap_queue object. 
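+ *
+ * A minimal sketch of what the probe boils down to (editorial
+ * illustration only, using the stock driver-core helper):
+ *
+ *	struct vfio_ap_queue *q = kzalloc(sizeof(*q), GFP_KERNEL);
+ *
+ *	if (!q)
+ *		return -ENOMEM;
+ *	dev_set_drvdata(&apdev->device, q);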
*/ static int vfio_ap_queue_dev_probe(struct ap_device *apdev) { @@ -61,10 +64,11 @@ static int vfio_ap_queue_dev_probe(struct ap_device *apdev) } /** - * vfio_ap_queue_dev_remove: + * vfio_ap_queue_dev_remove: Free the associated vfio_ap_queue structure. * - * Takes the matrix lock to avoid actions on this device while removing - * Free the associated vfio_ap_queue structure + * @apdev: the AP device being removed + * + * Takes the matrix lock to avoid actions on this device while doing the remove. */ static void vfio_ap_queue_dev_remove(struct ap_device *apdev) { diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 2341425f6967..abc0b9b88386 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -187,6 +187,8 @@ end_free: * vfio_ap_irq_enable - Enable Interruption for a APQN * * @q: the vfio_ap_queue holding AQIC parameters + * @isc: the guest ISC to register with the GIB interface + * @nib: the notification indicator byte to pin. * * Pin the NIB saved in *q * Register the guest ISC to GIB interface and retrieve the @@ -738,7 +740,6 @@ vfio_ap_mdev_verify_queues_reserved_for_apqi(struct ap_matrix_mdev *matrix_mdev, * assign_domain_store - parses the APQI from @buf and sets the * corresponding bit in the mediated matrix device's AQM * - * * @dev: the matrix device * @attr: the mediated matrix device's assign_domain attribute * @buf: a buffer containing the AP queue index (APQI) of the domain to @@ -866,7 +867,6 @@ static DEVICE_ATTR_WO(unassign_domain); * assign_control_domain_store - parses the domain ID from @buf and sets * the corresponding bit in the mediated matrix device's ADM * - * * @dev: the matrix device * @attr: the mediated matrix device's assign_control_domain attribute * @buf: a buffer containing the domain ID to be assigned @@ -1142,6 +1142,7 @@ static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb, * by @matrix_mdev. * * @matrix_mdev: a matrix mediated device + * @kvm: the pointer to the kvm structure being unset. * * Note: The matrix_dev->lock must be taken prior to calling * this function; however, the lock will be temporarily released while the diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 77760e2b546f..648fcaf8104a 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -26,16 +26,18 @@ #define VFIO_AP_DRV_NAME "vfio_ap" /** - * ap_matrix_dev - the AP matrix device structure + * struct ap_matrix_dev - Contains the data for the matrix device. + * * @device: generic device structure associated with the AP matrix device * @available_instances: number of mediated matrix devices that can be created * @info: the struct containing the output from the PQAP(QCI) instruction - * mdev_list: the list of mediated matrix devices created - * lock: mutex for locking the AP matrix device. This lock will be + * @mdev_list: the list of mediated matrix devices created + * @lock: mutex for locking the AP matrix device. This lock will be * taken every time we fiddle with state managed by the vfio_ap * driver, be it using @mdev_list or writing the state of a * single ap_matrix_mdev device. It's quite coarse but we don't * expect much contention. 
+ * @vfio_ap_drv: the vfio_ap device driver */ struct ap_matrix_dev { struct device device; @@ -49,17 +51,19 @@ struct ap_matrix_dev { extern struct ap_matrix_dev *matrix_dev; /** - * The AP matrix is comprised of three bit masks identifying the adapters, - * queues (domains) and control domains that belong to an AP matrix. The bits i - * each mask, from least significant to most significant bit, correspond to IDs - * 0 to 255. When a bit is set, the corresponding ID belongs to the matrix. + * struct ap_matrix - matrix of adapters, domains and control domains * * @apm_max: max adapter number in @apm - * @apm identifies the AP adapters in the matrix + * @apm: identifies the AP adapters in the matrix * @aqm_max: max domain number in @aqm - * @aqm identifies the AP queues (domains) in the matrix + * @aqm: identifies the AP queues (domains) in the matrix * @adm_max: max domain number in @adm - * @adm identifies the AP control domains in the matrix + * @adm: identifies the AP control domains in the matrix + * + * The AP matrix is comprised of three bit masks identifying the adapters, + * queues (domains) and control domains that belong to an AP matrix. The bits in + * each mask, from left to right, correspond to IDs 0 to 255. When a bit is set, + * the corresponding ID belongs to the matrix. */ struct ap_matrix { unsigned long apm_max; @@ -71,13 +75,20 @@ struct ap_matrix { }; /** - * struct ap_matrix_mdev - the mediated matrix device structure - * @list: allows the ap_matrix_mdev struct to be added to a list + * struct ap_matrix_mdev - Contains the data associated with a matrix mediated + * device. + * @vdev: the vfio device + * @node: allows the ap_matrix_mdev struct to be added to a list * @matrix: the adapters, usage domains and control domains assigned to the * mediated matrix device. * @group_notifier: notifier block used for specifying callback function for * handling the VFIO_GROUP_NOTIFY_SET_KVM event + * @iommu_notifier: notifier block used for specifying callback function for + * handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP event + * @kvm: the struct holding guest's state + * @pqap_hook: the function pointer to the interception handler for the + * PQAP(AQIC) instruction.
+ * @mdev: the mediated device */ struct ap_matrix_mdev { struct vfio_device vdev; @@ -90,6 +101,14 @@ struct ap_matrix_mdev { struct mdev_device *mdev; }; +/** + * struct vfio_ap_queue - contains the data associated with a queue bound to the + * vfio_ap device driver + * @matrix_mdev: the matrix mediated device + * @saved_pfn: the guest PFN pinned for the guest + * @apqn: the APQN of the AP queue device + * @saved_isc: the guest ISC registered with the GIB interface + */ struct vfio_ap_queue { struct ap_matrix_mdev *matrix_mdev; unsigned long saved_pfn; diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 356318746dd1..4c3dcc435e83 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -82,8 +82,8 @@ static inline int zcrypt_process_rescan(void) atomic_set(&zcrypt_rescan_req, 0); atomic_inc(&zcrypt_rescan_count); ap_bus_force_rescan(); - ZCRYPT_DBF(DBF_INFO, "rescan count=%07d\n", - atomic_inc_return(&zcrypt_rescan_count)); + ZCRYPT_DBF_INFO("%s rescan count=%07d\n", __func__, + atomic_inc_return(&zcrypt_rescan_count)); return 1; } return 0; @@ -341,8 +341,8 @@ static void zcdn_device_release(struct device *dev) { struct zcdn_device *zcdndev = to_zcdn_dev(dev); - ZCRYPT_DBF(DBF_INFO, "releasing zcdn device %d:%d\n", - MAJOR(dev->devt), MINOR(dev->devt)); + ZCRYPT_DBF_INFO("%s releasing zcdn device %d:%d\n", + __func__, MAJOR(dev->devt), MINOR(dev->devt)); kfree(zcdndev); } @@ -407,8 +407,8 @@ static int zcdn_create(const char *name) goto unlockout; } - ZCRYPT_DBF(DBF_INFO, "created zcdn device %d:%d\n", - MAJOR(devt), MINOR(devt)); + ZCRYPT_DBF_INFO("%s created zcdn device %d:%d\n", + __func__, MAJOR(devt), MINOR(devt)); unlockout: mutex_unlock(&ap_perms_mutex); @@ -550,9 +550,8 @@ static inline int zcrypt_check_ioctl(struct ap_perms *perms, } if (rc) - ZCRYPT_DBF(DBF_WARN, - "ioctl check failed: ioctlnr=0x%04x rc=%d\n", - ioctlnr, rc); + ZCRYPT_DBF_WARN("%s ioctl check failed: ioctlnr=0x%04x rc=%d\n", + __func__, ioctlnr, rc); return rc; } @@ -1446,7 +1445,7 @@ static int icarsamodexpo_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) { - ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc); + ZCRYPT_DBF_DBG("ioctl ICARSAMODEXPO rc=%d\n", rc); return rc; } return put_user(mex.outputdatalength, &umex->outputdatalength); @@ -1491,7 +1490,7 @@ static int icarsacrt_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) { - ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc); + ZCRYPT_DBF_DBG("ioctl ICARSACRT rc=%d\n", rc); return rc; } return put_user(crt.outputdatalength, &ucrt->outputdatalength); @@ -1509,12 +1508,12 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg) return -EFAULT; #ifdef CONFIG_ZCRYPT_DEBUG - if (xcRB.status & (1U << 31)) { + if ((xcRB.status & 0x8000FFFF) == 0x80004649 /* 'FI' */) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; tr.fi.cmd = (u16)(xcRB.status >> 16); } - xcRB.status &= 0x0000FFFF; + xcRB.status = 0; #endif do { @@ -1536,8 +1535,8 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) - ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n", - rc, xcRB.status); + ZCRYPT_DBF_DBG("ioctl ZSENDCPRB rc=%d status=0x%x\n", + rc, xcRB.status); if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB))) return -EFAULT; return rc; @@ -1582,7 
+1581,7 @@ static int zsendep11cprb_ioctl(struct ap_perms *perms, unsigned long arg) if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX) rc = -EIO; if (rc) - ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc); + ZCRYPT_DBF_DBG("ioctl ZSENDEP11CPRB rc=%d\n", rc); if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb))) return -EFAULT; return rc; @@ -1709,7 +1708,7 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, } /* unknown ioctl number */ default: - ZCRYPT_DBF(DBF_DEBUG, "unknown ioctl 0x%08x\n", cmd); + ZCRYPT_DBF_DBG("unknown ioctl 0x%08x\n", cmd); return -ENOIOCTLCMD; } } @@ -2048,16 +2047,14 @@ int zcrypt_wait_api_operational(void) break; case -ETIME: /* timeout */ - ZCRYPT_DBF(DBF_WARN, - "%s ap_wait_init_apqn_bindings_complete() returned with ETIME\n", - __func__); + ZCRYPT_DBF_WARN("%s ap_wait_init_apqn_bindings_complete()=ETIME\n", + __func__); zcrypt_wait_api_state = -ETIME; break; default: /* other failure */ - ZCRYPT_DBF(DBF_DEBUG, - "%s ap_wait_init_apqn_bindings_complete() failure rc=%d\n", - __func__, rc); + ZCRYPT_DBF_DBG("%s ap_wait_init_apqn_bindings_complete()=%d\n", + __func__, rc); break; } break; @@ -2079,7 +2076,7 @@ EXPORT_SYMBOL(zcrypt_wait_api_operational); int __init zcrypt_debug_init(void) { - zcrypt_dbf_info = debug_register("zcrypt", 1, 1, + zcrypt_dbf_info = debug_register("zcrypt", 2, 1, DBF_MAX_SPRINTF_ARGS * sizeof(long)); debug_register_view(zcrypt_dbf_info, &debug_sprintf_view); debug_set_level(zcrypt_dbf_info, DBF_ERR); diff --git a/drivers/s390/crypto/zcrypt_card.c b/drivers/s390/crypto/zcrypt_card.c index ef11d2a0ca6c..3e259befd30a 100644 --- a/drivers/s390/crypto/zcrypt_card.c +++ b/drivers/s390/crypto/zcrypt_card.c @@ -76,7 +76,7 @@ static ssize_t online_store(struct device *dev, zc->online = online; id = zc->card->id; - ZCRYPT_DBF(DBF_INFO, "card=%02x online=%d\n", id, online); + ZCRYPT_DBF_INFO("%s card=%02x online=%d\n", __func__, id, online); ap_send_online_uevent(&ac->ap_dev, online); @@ -189,7 +189,8 @@ int zcrypt_card_register(struct zcrypt_card *zc) zc->online = 1; - ZCRYPT_DBF(DBF_INFO, "card=%02x register online=1\n", zc->card->id); + ZCRYPT_DBF_INFO("%s card=%02x register online=1\n", + __func__, zc->card->id); rc = sysfs_create_group(&zc->card->ap_dev.device.kobj, &zcrypt_card_attr_group); @@ -211,7 +212,8 @@ EXPORT_SYMBOL(zcrypt_card_register); */ void zcrypt_card_unregister(struct zcrypt_card *zc) { - ZCRYPT_DBF(DBF_INFO, "card=%02x unregister\n", zc->card->id); + ZCRYPT_DBF_INFO("%s card=%02x unregister\n", + __func__, zc->card->id); spin_lock(&zcrypt_list_lock); list_del_init(&zc->list); diff --git a/drivers/s390/crypto/zcrypt_debug.h b/drivers/s390/crypto/zcrypt_debug.h index 3225489a1c41..5cf88aabd64b 100644 --- a/drivers/s390/crypto/zcrypt_debug.h +++ b/drivers/s390/crypto/zcrypt_debug.h @@ -17,7 +17,7 @@ #define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO) #define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO) -#define DBF_MAX_SPRINTF_ARGS 5 +#define DBF_MAX_SPRINTF_ARGS 6 #define ZCRYPT_DBF(...) 
\ debug_sprintf_event(zcrypt_dbf_info, ##__VA_ARGS__) diff --git a/drivers/s390/crypto/zcrypt_error.h b/drivers/s390/crypto/zcrypt_error.h index 39e626e3a379..8b0ce600b749 100644 --- a/drivers/s390/crypto/zcrypt_error.h +++ b/drivers/s390/crypto/zcrypt_error.h @@ -98,9 +98,8 @@ static inline int convert_error(struct zcrypt_queue *zq, case REP88_ERROR_MESSAGE_MALFORMD: /* 0x22 */ case REP88_ERROR_KEY_TYPE: /* 0x34 */ /* RY indicates malformed request */ - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x => rc=EINVAL\n", - card, queue, ehdr->reply_code); + ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EINVAL\n", + __func__, card, queue, ehdr->reply_code); return -EINVAL; case REP82_ERROR_MACHINE_FAILURE: /* 0x10 */ case REP82_ERROR_MESSAGE_TYPE: /* 0x20 */ @@ -119,19 +118,18 @@ static inline int convert_error(struct zcrypt_queue *zq, } __packed * head = reply->msg; unsigned int apfs = *((u32 *)head->fmt2.apfs); - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x apfs=0x%x => bus rescan, rc=EAGAIN\n", - card, queue, ehdr->reply_code, apfs); + ZCRYPT_DBF_WARN( + "%s dev=%02x.%04x RY=0x%02x apfs=0x%x => bus rescan, rc=EAGAIN\n", + __func__, card, queue, ehdr->reply_code, apfs); } else - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x => bus rescan, rc=EAGAIN\n", - card, queue, ehdr->reply_code); + ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => bus rescan, rc=EAGAIN\n", + __func__, card, queue, + ehdr->reply_code); return -EAGAIN; default: /* Assume request is valid and a retry will be worth it */ - ZCRYPT_DBF(DBF_WARN, - "dev=%02x.%04x RY=0x%02x => rc=EAGAIN\n", - card, queue, ehdr->reply_code); + ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EAGAIN\n", + __func__, card, queue, ehdr->reply_code); return -EAGAIN; } } diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c index 99937f3e1d49..f42e8c511184 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.c +++ b/drivers/s390/crypto/zcrypt_msgtype50.c @@ -369,12 +369,10 @@ static int convert_type80(struct zcrypt_queue *zq, zq->online = 0; pr_err("Crypto dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n", AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - t80h->code); - ZCRYPT_DBF_ERR("dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - t80h->code); + AP_QID_QUEUE(zq->queue->qid), t80h->code); + ZCRYPT_DBF_ERR("%s dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), t80h->code); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -409,10 +407,10 @@ static int convert_response_cex2a(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) rtype); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) rtype); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) rtype); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index bc5a8c31ba73..8582dd0d6969 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -649,8 +649,8 @@ static int convert_type86_ica(struct zcrypt_queue *zq, (service_rc == 8 && service_rs == 72) || (service_rc == 8 && 
service_rs == 770) || (service_rc == 12 && service_rs == 769)) { - ZCRYPT_DBF_WARN("dev=%02x.%04x rc/rs=%d/%d => rc=EINVAL\n", - AP_QID_CARD(zq->queue->qid), + ZCRYPT_DBF_WARN("%s dev=%02x.%04x rc/rs=%d/%d => rc=EINVAL\n", + __func__, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) service_rc, (int) service_rs); return -EINVAL; @@ -660,8 +660,8 @@ static int convert_type86_ica(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) service_rc, (int) service_rs); - ZCRYPT_DBF_ERR("dev=%02x.%04x rc/rs=%d/%d => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), + ZCRYPT_DBF_ERR("%s dev=%02x.%04x rc/rs=%d/%d => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) service_rc, (int) service_rs); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); @@ -806,10 +806,10 @@ static int convert_response_ica(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -841,10 +841,10 @@ static int convert_response_xcrb(bool userspace, struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -871,10 +871,10 @@ static int convert_response_ep11_xcrb(bool userspace, struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } @@ -902,10 +902,10 @@ static int convert_response_rng(struct zcrypt_queue *zq, AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); - ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - (int) msg->hdr.type); + ZCRYPT_DBF_ERR( + "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type); ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c index 398bde237e37..1552a850a52e 100644 --- a/drivers/s390/crypto/zcrypt_queue.c +++ b/drivers/s390/crypto/zcrypt_queue.c @@ -65,10 +65,9 @@ static ssize_t online_store(struct device *dev, return 
-EINVAL; zq->online = online; - ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x online=%d\n", - AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - online); + ZCRYPT_DBF_INFO("%s queue=%02x.%04x online=%d\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid), online); ap_send_online_uevent(&aq->ap_dev, online); @@ -175,8 +174,9 @@ int zcrypt_queue_register(struct zcrypt_queue *zq) zq->zcard = zc; zq->online = 1; /* New devices are online by default. */ - ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x register online=1\n", - AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid)); + ZCRYPT_DBF_INFO("%s queue=%02x.%04x register online=1\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid)); list_add_tail(&zq->list, &zc->zqueues); spin_unlock(&zcrypt_list_lock); @@ -215,8 +215,9 @@ void zcrypt_queue_unregister(struct zcrypt_queue *zq) { struct zcrypt_card *zc; - ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x unregister\n", - AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid)); + ZCRYPT_DBF_INFO("%s queue=%02x.%04x unregister\n", + __func__, AP_QID_CARD(zq->queue->qid), + AP_QID_QUEUE(zq->queue->qid)); zc = zq->zcard; spin_lock(&zcrypt_list_lock); diff --git a/drivers/ssb/pcihost_wrapper.c b/drivers/ssb/pcihost_wrapper.c index 410215c16920..dd70fd41c77d 100644 --- a/drivers/ssb/pcihost_wrapper.c +++ b/drivers/ssb/pcihost_wrapper.c @@ -69,7 +69,6 @@ static int ssb_pcihost_probe(struct pci_dev *dev, { struct ssb_bus *ssb; int err = -ENOMEM; - const char *name; u32 val; ssb = kzalloc(sizeof(*ssb), GFP_KERNEL); @@ -78,10 +77,7 @@ static int ssb_pcihost_probe(struct pci_dev *dev, err = pci_enable_device(dev); if (err) goto err_kfree_ssb; - name = dev_name(&dev->dev); - if (dev->driver && dev->driver->name) - name = dev->driver->name; - err = pci_request_regions(dev, name); + err = pci_request_regions(dev, dev_driver_string(&dev->dev)); if (err) goto err_pci_disable; pci_set_master(dev); diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index e03627ad4460..59af251e7576 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -86,8 +86,6 @@ source "drivers/staging/vc04_services/Kconfig" source "drivers/staging/pi433/Kconfig" -source "drivers/staging/mt7621-pci/Kconfig" - source "drivers/staging/mt7621-dma/Kconfig" source "drivers/staging/ralink-gdma/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index c7f8d8d8dd11..76f413470bc8 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_KS7010) += ks7010/ obj-$(CONFIG_GREYBUS) += greybus/ obj-$(CONFIG_BCM2835_VCHIQ) += vc04_services/ obj-$(CONFIG_PI433) += pi433/ -obj-$(CONFIG_PCI_MT7621) += mt7621-pci/ obj-$(CONFIG_SOC_MT7621) += mt7621-dma/ obj-$(CONFIG_DMA_RALINK) += ralink-gdma/ obj-$(CONFIG_SOC_MT7621) += mt7621-dts/ diff --git a/drivers/staging/mt7621-pci/Kconfig b/drivers/staging/mt7621-pci/Kconfig deleted file mode 100644 index ce58042f2f21..000000000000 --- a/drivers/staging/mt7621-pci/Kconfig +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config PCI_MT7621 - tristate "MediaTek MT7621 PCI Controller" - depends on RALINK - select PCI_DRIVERS_GENERIC - help - This selects a driver for the MediaTek MT7621 PCI Controller. 
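[Editor's note] The zcrypt conversion earlier in this series follows one pattern throughout: the old ZCRYPT_DBF(level, fmt, ...) calls become level-specific ZCRYPT_DBF_{ERR,WARN,INFO} macros, and every format string gains a leading "%s" fed with __func__ so each trace record names the function that emitted it. Below is a minimal stand-alone sketch of that pattern; fprintf stands in for the s390 debug feature, and MYDRV_DBF_WARN/convert_error_demo are hypothetical names, not the kernel API.

#include <stdio.h>

/* Hypothetical stand-in for the s390 DBF backend (assumption, not kernel code). */
#define MYDRV_DBF_WARN(fmt, ...) \
	fprintf(stderr, "mydrv warn: " fmt, ##__VA_ARGS__)

static int convert_error_demo(int card, int queue, int reply_code)
{
	/* The caller passes __func__ so the record names its origin. */
	MYDRV_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EINVAL\n",
		       __func__, card, queue, reply_code);
	return -22; /* -EINVAL */
}

int main(void)
{
	return convert_error_demo(0x01, 0x0004, 0x22) == -22 ? 0 : 1;
}

Embedding __func__ inside the macro body would also work; passing it explicitly, as the series does, keeps the format string and its arguments visibly paired at each call site.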
- diff --git a/drivers/staging/mt7621-pci/Makefile b/drivers/staging/mt7621-pci/Makefile deleted file mode 100644 index f4e651cf7ce3..000000000000 --- a/drivers/staging/mt7621-pci/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PCI_MT7621) += pci-mt7621.o diff --git a/drivers/staging/mt7621-pci/TODO b/drivers/staging/mt7621-pci/TODO deleted file mode 100644 index d674a9ac85c1..000000000000 --- a/drivers/staging/mt7621-pci/TODO +++ /dev/null @@ -1,4 +0,0 @@ - -- general code review and cleanup - -Cc: NeilBrown diff --git a/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt b/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt deleted file mode 100644 index 327a68267309..000000000000 --- a/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt +++ /dev/null @@ -1,104 +0,0 @@ -MediaTek MT7621 PCIe controller - -Required properties: -- compatible: "mediatek,mt7621-pci" -- device_type: Must be "pci" -- reg: Base addresses and lengths of the PCIe subsys and root ports. -- bus-range: Range of bus numbers associated with this controller. -- #address-cells: Address representation for root ports (must be 3) -- pinctrl-names : The pin control state names. -- pinctrl-0: The "default" pinctrl state. -- #size-cells: Size representation for root ports (must be 2) -- ranges: Ranges for the PCI memory and I/O regions. -- #interrupt-cells: Must be 1 -- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties. - Please refer to the standard PCI bus binding document for a more detailed - explanation. -- status: either "disabled" or "okay". -- resets: Must contain an entry for each entry in reset-names. - See ../reset/reset.txt for details. -- reset-names: Must be "pcie0", "pcie1", "pcieN"... based on the number of - root ports. -- clocks: Must contain an entry for each entry in clock-names. - See ../clocks/clock-bindings.txt for details. -- clock-names: Must be "pcie0", "pcie1", "pcieN"... based on the number of - root ports. -- reset-gpios: GPIO specs for the reset pins. - -In addition, the device tree node must have sub-nodes describing each PCIe port -interface, having the following mandatory properties: - -Required properties: -- reg: Only the first four bytes are used to refer to the correct bus number - and device number. -- #address-cells: Must be 3 -- #size-cells: Must be 2 -- ranges: Sub-ranges distributed from the PCIe controller node. An empty - property is sufficient. -- bus-range: Range of bus numbers associated with this port. 
- -Example for MT7621: - - pcie: pcie@1e140000 { - compatible = "mediatek,mt7621-pci"; - reg = <0x1e140000 0x100 /* host-pci bridge registers */ - 0x1e142000 0x100 /* pcie port 0 RC control registers */ - 0x1e143000 0x100 /* pcie port 1 RC control registers */ - 0x1e144000 0x100>; /* pcie port 2 RC control registers */ - - #address-cells = <3>; - #size-cells = <2>; - - pinctrl-names = "default"; - pinctrl-0 = <&pcie_pins>; - - device_type = "pci"; - - bus-range = <0 255>; - ranges = < - 0x02000000 0 0x00000000 0x60000000 0 0x10000000 /* pci memory */ - 0x01000000 0 0x00000000 0x1e160000 0 0x00010000 /* io space */ - >; - - #interrupt-cells = <1>; - interrupt-map-mask = <0xF0000 0 0 1>; - interrupt-map = <0x10000 0 0 1 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>, - <0x20000 0 0 1 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>, - <0x30000 0 0 1 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>; - - status = "disabled"; - - resets = <&rstctrl 24 &rstctrl 25 &rstctrl 26>; - reset-names = "pcie0", "pcie1", "pcie2"; - clocks = <&clkctrl 24 &clkctrl 25 &clkctrl 26>; - clock-names = "pcie0", "pcie1", "pcie2"; - - reset-gpios = <&gpio 19 GPIO_ACTIVE_LOW>, - <&gpio 8 GPIO_ACTIVE_LOW>, - <&gpio 7 GPIO_ACTIVE_LOW>; - - pcie@0,0 { - reg = <0x0000 0 0 0 0>; - #address-cells = <3>; - #size-cells = <2>; - ranges; - bus-range = <0x00 0xff>; - }; - - pcie@1,0 { - reg = <0x0800 0 0 0 0>; - #address-cells = <3>; - #size-cells = <2>; - ranges; - bus-range = <0x00 0xff>; - }; - - pcie@2,0 { - reg = <0x1000 0 0 0 0>; - #address-cells = <3>; - #size-cells = <2>; - ranges; - bus-range = <0x00 0xff>; - }; - }; - diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c index be4ecbabdd58..933d77ad0a64 100644 --- a/drivers/usb/early/xhci-dbc.c +++ b/drivers/usb/early/xhci-dbc.c @@ -185,7 +185,7 @@ static void __init xdbc_free_ring(struct xdbc_ring *ring) if (!seg) return; - memblock_free(seg->dma, PAGE_SIZE); + memblock_phys_free(seg->dma, PAGE_SIZE); ring->segment = NULL; } @@ -665,10 +665,10 @@ int __init early_xdbc_setup_hardware(void) xdbc_free_ring(&xdbc.in_ring); if (xdbc.table_dma) - memblock_free(xdbc.table_dma, PAGE_SIZE); + memblock_phys_free(xdbc.table_dma, PAGE_SIZE); if (xdbc.out_dma) - memblock_free(xdbc.out_dma, PAGE_SIZE); + memblock_phys_free(xdbc.out_dma, PAGE_SIZE); xdbc.table_base = NULL; xdbc.out_buf = NULL; @@ -987,8 +987,8 @@ free_and_quit: xdbc_free_ring(&xdbc.evt_ring); xdbc_free_ring(&xdbc.out_ring); xdbc_free_ring(&xdbc.in_ring); - memblock_free(xdbc.table_dma, PAGE_SIZE); - memblock_free(xdbc.out_dma, PAGE_SIZE); + memblock_phys_free(xdbc.table_dma, PAGE_SIZE); + memblock_phys_free(xdbc.out_dma, PAGE_SIZE); writel(0, &xdbc.xdbc_reg->control); early_iounmap(xdbc.xhci_base, xdbc.xhci_length); diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 1d8a4c089a85..92adf6107864 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -111,7 +111,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) struct xhci_driver_data *driver_data; const struct pci_device_id *id; - id = pci_match_id(pdev->driver->id_table, pdev); + id = pci_match_id(to_pci_driver(pdev->dev.driver)->id_table, pdev); if (id && id->driver_data) { driver_data = (struct xhci_driver_data *)id->driver_data; diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 8fcf94cd2c96..10607be76a88 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -108,7 +108,7 @@ config VIRTIO_MEM default m depends on X86_64 depends on VIRTIO - depends on 
MEMORY_HOTPLUG_SPARSE + depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on CONTIG_ALLOC help diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index cbdff8979980..47aebd98f52f 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -241,7 +241,7 @@ retry: */ rc = xen_swiotlb_fixup(start, nslabs); if (rc) { - memblock_free(__pa(start), PAGE_ALIGN(bytes)); + memblock_free(start, PAGE_ALIGN(bytes)); if (nslabs > 1024 && repeat--) { /* Min is 2MB */ nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9fa930dfd78d..dca42aa87d30 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -38,7 +38,6 @@ #include #include "cifs_spnego.h" #include "fscache.h" -#include "smb2pdu.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e916470468ea..abff31dcd005 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -20,6 +20,7 @@ #include #include #include +#include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ @@ -776,7 +777,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) static inline void revert_current_mid_from_hdr(struct TCP_Server_Info *server, - const struct smb2_sync_hdr *shdr) + const struct smb2_hdr *shdr) { unsigned int num = le16_to_cpu(shdr->CreditCharge); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c3b94c1e4591..0abbff4e4135 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -677,7 +677,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) static unsigned int smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + struct smb2_hdr *shdr = (struct smb2_hdr *)buffer; /* * SMB1 does not use credits. 
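[Editor's note] The connect.c hunk above is representative of the whole smb2_sync_hdr to smb2_hdr migration in this series: the header is an on-wire structure, so multi-byte fields are little-endian and must pass through le16_to_cpu()/le32_to_cpu()/le64_to_cpu() before use. A self-contained sketch of why the helper matters; wire_hdr is a simplified stand-in with the same leading field order, not the real 64-byte SMB2 header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wire_hdr {                 /* simplified; field offsets 0/4/6/8/12/14 */
	uint32_t ProtocolId;
	uint16_t StructureSize;
	uint16_t CreditCharge;
	uint32_t Status;
	uint16_t Command;
	uint16_t CreditRequest;   /* little-endian on the wire */
};

static uint16_t demo_le16_to_cpu(uint16_t wire)
{
	const uint8_t *p = (const uint8_t *)&wire;
	return (uint16_t)(p[0] | (p[1] << 8));  /* same result on BE and LE hosts */
}

int main(void)
{
	unsigned char buf[16] = { 0 };
	buf[14] = 0x82;           /* CreditRequest = 0x0182 in LE byte order */
	buf[15] = 0x01;
	struct wire_hdr h;
	memcpy(&h, buf, sizeof(h));       /* cast-and-copy, as the driver does */
	printf("credits granted: %u\n", demo_le16_to_cpu(h.CreditRequest));
	return 0;
}

On a little-endian host the helper compiles to a plain load; on a big-endian host it is the byte swap that keeps the granted-credit count (386 here) correct.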
@@ -794,7 +794,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) */ } - kfree(server->hostname); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -878,7 +877,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) static void smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + struct smb2_hdr *shdr = (struct smb2_hdr *)buffer; int scredits, in_flight; /* @@ -1235,6 +1234,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context * if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) return 0; + if (strcasecmp(server->hostname, ctx->server_hostname)) + return 0; + if (!match_address(server, addr, (struct sockaddr *)&ctx->srcaddr)) return 0; @@ -1336,6 +1338,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + kfree(server->hostname); task = xchg(&server->tsk, NULL); if (task) @@ -1361,14 +1364,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx) goto out_err; } + tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL); + if (!tcp_ses->hostname) { + rc = -ENOMEM; + goto out_err; + } + tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); - tcp_ses->hostname = extract_hostname(ctx->UNC); - if (IS_ERR(tcp_ses->hostname)) { - rc = PTR_ERR(tcp_ses->hostname); - goto out_err_crypto_release; - } tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; @@ -1497,8 +1501,7 @@ out_err_crypto_release: out_err: if (tcp_ses) { - if (!IS_ERR(tcp_ses->hostname)) - kfree(tcp_ses->hostname); + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); @@ -2646,11 +2649,12 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = 0; if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); + trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); sock_release(socket); server->ssocket = NULL; return rc; } - + trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr); if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 3109def8e199..38d96a480745 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -116,6 +116,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("nosharesock", Opt_nosharesock), fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), + fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), @@ -318,6 +319,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(mount_options); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); @@ -456,6 +458,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (!pos) return -EINVAL; + /* record the server hostname */ + ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL); + if (!ctx->server_hostname) + return -ENOMEM; + /* skip past delimiter */ ++pos; @@ -1383,6 +1390,13 @@ static int smb3_fs_context_parse_param(struct fs_context 
*fc, } } break; + case Opt_tcp_nodelay: + /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */ + if (result.negated) + ctx->sockopt_tcp_nodelay = false; + else + ctx->sockopt_tcp_nodelay = true; + break; case Opt_domainauto: ctx->domainauto = true; break; @@ -1496,6 +1510,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree(ctx->server_hostname); + ctx->server_hostname = NULL; kfree(ctx->UNC); ctx->UNC = NULL; kfree(ctx->source); diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index a42ba71d7a81..b2d22cf9cb18 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -98,6 +98,7 @@ enum cifs_param { Opt_nosharesock, Opt_persistent, Opt_resilient, + Opt_tcp_nodelay, Opt_domainauto, Opt_rdma, Opt_modesid, @@ -166,6 +167,7 @@ struct smb3_fs_context { char *password; char *domainname; char *source; + char *server_hostname; char *UNC; char *nodename; char *iocharset; /* local code page for mapping to and from Unicode */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bb1185fff8cc..ba2c3e897b29 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -152,7 +152,7 @@ cifs_buf_get(void) * SMB2 header is bigger than CIFS one - no problems to clean some * more bytes for CIFS. */ - size_t buf_size = sizeof(struct smb2_sync_hdr); + size_t buf_size = sizeof(struct smb2_hdr); /* * We could use negotiated size instead of max_msgsize - diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 181514b8770d..194799ddd382 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status) int map_smb2_to_linux_error(char *buf, bool log_err) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; unsigned int i; int rc = -EIO; __le32 smb2err = shdr->Status; if (smb2err == 0) { - trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); + trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId)); return 0; } @@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err) cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n", __le32_to_cpu(smb2err), rc); - trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), - le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc); + trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId), + le32_to_cpu(smb2err), rc); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 29b5554f6263..cdcdef32759e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -8,7 +8,6 @@ * */ #include -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -19,7 +18,7 @@ #include "nterr.h" static int -check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid) +check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid) { __u64 wire_mid = le64_to_cpu(shdr->MessageId); @@ -81,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; -#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp)) +#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp)) -static __u32 
get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, +static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen) { __u16 neg_count; @@ -135,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, int smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; + struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; __u64 mid; __u32 clc_len; /* calculated length */ int command; - int pdu_size = sizeof(struct smb2_sync_pdu); - int hdr_size = sizeof(struct smb2_sync_hdr); + int pdu_size = sizeof(struct smb2_pdu); + int hdr_size = sizeof(struct smb2_hdr); /* * Add function to do table lookup of StructureSize by command @@ -155,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { - if (ses->Suid == thdr->SessionId) + if (ses->Suid == le64_to_cpu(thdr->SessionId)) break; } spin_unlock(&cifs_tcp_ses_lock); @@ -296,7 +295,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { * area and the offset to it (from the beginning of the smb are also returned. */ char * -smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) +smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) { *off = 0; *len = 0; @@ -401,8 +400,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf; - struct smb2_sync_hdr *shdr = &pdu->sync_hdr; + struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_hdr *shdr = &pdu->hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ /* Structure Size has already been checked to make sure it is 64 */ @@ -669,7 +668,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for oplock break\n"); - if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) + if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) return false; if (rsp->StructureSize != @@ -816,25 +815,25 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *sync_hdr = mid->resp_buf; + struct smb2_hdr *hdr = mid->resp_buf; struct smb2_create_rsp *rsp = mid->resp_buf; struct cifs_tcon *tcon; int rc; - if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE || - sync_hdr->Status != STATUS_SUCCESS) + if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE || + hdr->Status != STATUS_SUCCESS) return 0; - tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId, - sync_hdr->TreeId); + tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId), + le32_to_cpu(hdr->Id.SyncId.TreeId)); if (!tcon) return -ENOENT; rc = __smb2_handle_cancelled_cmd(tcon, - le16_to_cpu(sync_hdr->Command), - le64_to_cpu(sync_hdr->MessageId), - rsp->PersistentFileId, - rsp->VolatileFileId); + le16_to_cpu(hdr->Command), + le64_to_cpu(hdr->MessageId), + le64_to_cpu(rsp->PersistentFileId), + le64_to_cpu(rsp->VolatileFileId)); if (rc) cifs_put_tcon(tcon); @@ -856,10 +855,10 @@ 
smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) { int i, rc; struct sdesc *d; - struct smb2_sync_hdr *hdr; + struct smb2_hdr *hdr; struct TCP_Server_Info *server = cifs_ses_server(ses); - hdr = (struct smb2_sync_hdr *)iov[0].iov_base; + hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ if (hdr->Command == SMB2_NEGOTIATE) goto ok; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bda606dc72b1..7acf71defea7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -325,7 +325,7 @@ static struct mid_q_entry * __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue) { struct mid_q_entry *mid; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(shdr->MessageId); if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { @@ -367,11 +367,11 @@ static void smb2_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, - shdr->ProcessId); + shdr->Id.SyncId.ProcessId); cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, server->ops->calc_smb_size(buf, server)); #endif @@ -885,10 +885,10 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, atomic_inc(&tcon->num_remote_opens); o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - oparms.fid->persistent_fid = o_rsp->PersistentFileId; - oparms.fid->volatile_fid = o_rsp->VolatileFileId; + oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId); + oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId); #ifdef CONFIG_CIFS_DEBUG2 - oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); + oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ tcon->crfid.tcon = tcon; @@ -2391,12 +2391,12 @@ again: /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { + if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); goto qdf_free; } - fid->persistent_fid = op_rsp->PersistentFileId; - fid->volatile_fid = op_rsp->VolatileFileId; + fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId); + fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId); /* Anything else than ENODATA means a genuine error */ if (rc && rc != -ENODATA) { @@ -2410,7 +2410,7 @@ again: atomic_inc(&tcon->num_remote_opens); qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; - if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, fid->persistent_fid, tcon->tid, tcon->ses->Suid, 0, 0); srch_inf->endOfSearch = true; @@ -2462,7 +2462,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; int scredits, in_flight; if (shdr->Status != STATUS_PENDING) @@ -2489,13 +2489,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) static bool smb2_is_session_expired(char *buf) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct 
smb2_hdr *shdr = (struct smb2_hdr *)buf; if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED && shdr->Status != STATUS_USER_SESSION_DELETED) return false; - trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId, + trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); cifs_dbg(FYI, "Session expired or deleted\n"); @@ -2506,7 +2507,7 @@ smb2_is_session_expired(char *buf) static bool smb2_is_status_io_timeout(char *buf) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; if (shdr->Status == STATUS_IO_TIMEOUT) return true; @@ -2517,7 +2518,7 @@ smb2_is_status_io_timeout(char *buf) static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct list_head *tmp, *tmp1; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -2530,7 +2531,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) ses = list_entry(tmp, struct cifs_ses, smb_ses_list); list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); - if (tcon->tid == shdr->TreeId) { + if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { tcon->need_reconnect = true; spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", @@ -2558,9 +2559,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, void smb2_set_related(struct smb_rqst *rqst) { - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { cifs_dbg(FYI, "shdr NULL in smb2_set_related\n"); return; @@ -2573,13 +2574,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) { - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; unsigned long len = smb_rqst_len(server, rqst); int i, num_padding; - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n"); return; @@ -3124,7 +3125,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype, rsp_iov); create_rsp = rsp_iov[0].iov_base; - if (create_rsp && create_rsp->sync_hdr.Status) + if (create_rsp && create_rsp->hdr.Status) err_iov = rsp_iov[0]; ioctl_rsp = rsp_iov[1].iov_base; @@ -4369,8 +4370,8 @@ static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, struct smb_rqst *old_rq, __le16 cipher_type) { - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)old_rq->rq_iov[0].iov_base; memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -4496,7 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); + rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? 
"en" : "de"); @@ -4788,7 +4789,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int cur_page_idx; unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct bio_vec *bvec = NULL; struct iov_iter iter; struct kvec iov; @@ -5117,7 +5118,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, { int ret, length; char *buf = server->smallbuf; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; unsigned int pdu_length = server->pdu_size; unsigned int buf_size; struct mid_q_entry *mid_entry; @@ -5147,7 +5148,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, next_is_large = server->large_buf; one_more: - shdr = (struct smb2_sync_hdr *)buf; + shdr = (struct smb2_hdr *)buf; if (shdr->NextCommand) { if (next_is_large) next_buffer = (char *)cifs_buf_get(); @@ -5213,7 +5214,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); if (pdu_length < sizeof(struct smb2_transform_hdr) + - sizeof(struct smb2_sync_hdr)) { + sizeof(struct smb2_hdr)) { cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); cifs_reconnect(server); @@ -5246,7 +5247,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) static int smb2_next_header(char *buf) { - struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf; if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) @@ -5788,7 +5789,7 @@ struct smb_version_values smb20_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5809,7 +5810,7 @@ struct smb_version_values smb21_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5830,7 +5831,7 @@ struct smb_version_values smb3any_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5851,7 +5852,7 @@ struct smb_version_values smbdefault_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5872,7 +5873,7 @@ struct smb_version_values smb30_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = 
sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5893,7 +5894,7 @@ struct smb_version_values smb302_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5914,7 +5915,7 @@ struct smb_version_values smb311_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7829c590eeac..d2ecb2ea37c0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -23,7 +23,6 @@ #include #include #include -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" @@ -84,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) } static void -smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, +smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, const struct cifs_tcon *tcon, struct TCP_Server_Info *server) { @@ -104,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, } else { shdr->CreditRequest = cpu_to_le16(2); } - shdr->ProcessId = cpu_to_le32((__u16)current->tgid); + shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) goto out; @@ -115,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, shdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ - shdr->TreeId = tcon->tid; + shdr->Id.SyncId.TreeId = cpu_to_le32(tcon->tid); /* Uid is not converted */ if (tcon->ses) - shdr->SessionId = tcon->ses->Suid; + shdr->SessionId = cpu_to_le64(tcon->ses->Suid); /* * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have @@ -331,7 +330,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, unsigned int *total_len) { - struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf; + struct smb2_pdu *spdu = (struct smb2_pdu *)buf; /* lookup word count ie StructureSize from table */ __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)]; @@ -341,10 +340,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, */ memset(buf, 0, 256); - smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server); + smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server); spdu->StructureSize2 = cpu_to_le16(parmsize); - *total_len = parmsize + sizeof(struct smb2_sync_hdr); + *total_len = parmsize + sizeof(struct smb2_hdr); } /* @@ -367,7 +366,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, } fill_small_buf(smb2_command, tcon, server, - (struct smb2_sync_hdr *)(*request_buf), + (struct smb2_hdr *)(*request_buf), total_len); if (tcon != NULL) { @@ -414,8 +413,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; pneg_ctxt->DataLength = cpu_to_le16(38); pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); - pneg_ctxt->SaltLength = 
cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE); - get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; } @@ -857,7 +856,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc) return rc; - req->sync_hdr.SessionId = 0; + req->hdr.SessionId = 0; memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); @@ -1018,7 +1017,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, - (struct smb2_sync_hdr *)rsp); + (struct smb2_hdr *)rsp); /* * See MS-SMB2 section 2.2.4: if no blob, client picks default which * for us will be @@ -1250,23 +1249,23 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) return rc; if (sess_data->ses->binding) { - req->sync_hdr.SessionId = sess_data->ses->Suid; - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid); + req->hdr.Flags |= SMB2_FLAGS_SIGNED; req->PreviousSessionId = 0; req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; } else { /* First session, not a reauthenticate */ - req->sync_hdr.SessionId = 0; + req->hdr.SessionId = 0; /* * if reconnect, we need to send previous sess id * otherwise it is 0 */ - req->PreviousSessionId = sess_data->previous_session; + req->PreviousSessionId = cpu_to_le64(sess_data->previous_session); req->Flags = 0; /* MBZ */ } /* enough to enable echos and oplocks and one max size write */ - req->sync_hdr.CreditRequest = cpu_to_le16(130); + req->hdr.CreditRequest = cpu_to_le16(130); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -1425,7 +1424,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; /* keep session id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1501,7 +1500,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* If true, rc here is expected and not an error */ if (sess_data->buf0_type != CIFS_NO_BUFFER && - rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) rc = 0; if (rc) @@ -1523,7 +1522,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* keep existing ses id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1558,7 +1557,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) goto out; req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; - req->sync_hdr.SessionId = ses->Suid; + req->hdr.SessionId = cpu_to_le64(ses->Suid); rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, sess_data->nls_cp); @@ -1584,7 +1583,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) /* keep existing ses id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1715,12 +1714,12 @@ SMB2_logoff(const unsigned int xid, struct 
cifs_ses *ses) return rc; /* since no tcon, smb2_init can not do this, so do here */ - req->sync_hdr.SessionId = ses->Suid; + req->hdr.SessionId = cpu_to_le64(ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) flags |= CIFS_TRANSFORM_REQ; else if (server->sign) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; flags |= CIFS_NO_RSP_BUF; @@ -1828,14 +1827,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && ((ses->user_name != NULL) || (ses->sectype == Kerberos))) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = 2; /* Need 64 for max size write so ask for more in case not there yet */ - req->sync_hdr.CreditRequest = cpu_to_le16(64); + req->hdr.CreditRequest = cpu_to_le16(64); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -1871,7 +1870,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; - tcon->tid = rsp->sync_hdr.TreeId; + tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && @@ -1892,9 +1891,8 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - } goto tcon_exit; } @@ -2608,7 +2606,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, utf16_path); @@ -2672,11 +2670,13 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, } rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; - trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, + trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId), + tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); - SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); + SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId), + le64_to_cpu(rsp->VolatileFileId)); /* Eventually save off posix specific response info and timestaps */ @@ -2740,7 +2740,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, path); @@ -2943,16 +2943,17 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } goto creat_exit; } else - trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, + trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId), + tcon->tid, ses->Suid, oparms->create_options, oparms->desired_access); atomic_inc(&tcon->num_remote_opens); - oparms->fid->persistent_fid = rsp->PersistentFileId; - oparms->fid->volatile_fid = rsp->VolatileFileId; + oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId); + oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId); 
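/*
 * [Editor's aside on the fid conversions in the hunks above, hedged and
 * reconstructed from the declarations visible in this series]: the shared
 * smbfs_common header declares PersistentFileId/VolatileFileId as __le64,
 * while struct cifs_fid keeps host-endian __u64 values. Hence the rule
 * applied throughout:
 *
 *	fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId);  // wire -> cpu
 *	req->PersistentFileId = cpu_to_le64(fid->persistent_fid);  // cpu -> wire
 *
 * On little-endian x86 both helpers compile away, so a missed conversion
 * only shows up on big-endian hosts such as s390x.
 */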
oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 - oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId); + oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ if (buf) { @@ -3052,7 +3053,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, * response size smaller. */ req->MaxOutputResponse = cpu_to_le32(max_response_size); - req->sync_hdr.CreditCharge = + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size), SMB2_MAX_BUFFER_SIZE)); if (is_fsctl) @@ -3062,7 +3063,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; return 0; } @@ -3236,8 +3237,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); if (query_attrs) req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; else @@ -3600,8 +3601,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); /* See note 354 of MS-SMB2, 64K max */ req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); @@ -3687,7 +3688,7 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED || mid->mid_state == MID_RESPONSE_MALFORMED) { - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; } @@ -3787,7 +3788,7 @@ SMB2_echo(struct TCP_Server_Info *server) if (rc) return rc; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); iov[0].iov_len = total_len; iov[0].iov_base = (char *)req; @@ -3823,8 +3824,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -3890,8 +3891,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, unsigned int remaining_bytes, int request_type) { int rc = -EACCES; - struct smb2_read_plain_req *req = NULL; - struct smb2_sync_hdr *shdr; + struct smb2_read_req *req = NULL; + struct smb2_hdr *shdr; struct TCP_Server_Info *server = io_parms->server; rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server, @@ -3902,11 +3903,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (server == NULL) return -ECONNABORTED; - shdr = &req->sync_hdr; - shdr->ProcessId = cpu_to_le32(io_parms->pid); + shdr = &req->hdr; + shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = io_parms->persistent_fid; - req->VolatileFileId = io_parms->volatile_fid; + req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); + req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); req->ReadChannelInfoOffset = 0; /* reserved */ req->ReadChannelInfoLength = 0; /* reserved */ req->Channel = 0; /* reserved */ 
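[Editor's note] Many hunks here replace shdr->ProcessId and shdr->TreeId with shdr->Id.SyncId.ProcessId and a le32 TreeId. The cause is the new shared header layout, which, as best reconstructed from the accesses in this series (field names per MS-SMB2), overlays the synchronous PID/TID pair with the 64-bit AsyncId used when SMB2_FLAGS_ASYNC_COMMAND is set. A simplified, compilable sketch using stdint types in place of __le32/__le64:

#include <stdint.h>

struct smb2_hdr_id_demo {
	union {
		struct {
			uint32_t ProcessId;  /* __le32 in the real header */
			uint32_t TreeId;     /* __le32 in the real header */
		} SyncId;                    /* synchronous requests */
		uint64_t AsyncId;            /* __le64, async interim responses */
	} Id;
};

int main(void)
{
	struct smb2_hdr_id_demo h = { .Id.AsyncId = 0 };
	h.Id.SyncId.ProcessId = 0x1234;
	h.Id.SyncId.TreeId = 0x5678;
	/* The same 8 bytes viewed two ways: a sync PID/TID pair or an async id. */
	return h.Id.AsyncId != 0 ? 0 : 1;
}

The union is why tcon->tid assignments in this series now go through le32_to_cpu(shdr->Id.SyncId.TreeId) rather than reading a formerly "opaque" field directly.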
@@ -3940,7 +3941,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; req->ReadChannelInfoOffset = - cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer)); + cpu_to_le16(offsetof(struct smb2_read_req, Buffer)); req->ReadChannelInfoLength = cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; @@ -3964,10 +3965,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. */ - shdr->SessionId = 0xFFFFFFFFFFFFFFFF; - shdr->TreeId = 0xFFFFFFFF; - req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; + shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF); + req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); } } if (remaining_bytes > io_parms->length) @@ -3985,8 +3986,8 @@ smb2_readv_callback(struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = rdata->server; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rdata->iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1, @@ -4072,7 +4073,7 @@ smb2_async_readv(struct cifs_readdata *rdata) { int rc, flags = 0; char *buf; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1 }; @@ -4105,7 +4106,7 @@ smb2_async_readv(struct cifs_readdata *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; - shdr = (struct smb2_sync_hdr *)buf; + shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, @@ -4144,7 +4145,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, { struct smb_rqst rqst; int resp_buftype, rc; - struct smb2_read_plain_req *req = NULL; + struct smb2_read_req *req = NULL; struct smb2_read_rsp *rsp = NULL; struct kvec iov[1]; struct kvec rsp_iov; @@ -4178,19 +4179,22 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); - trace_smb3_read_err(xid, req->PersistentFileId, + trace_smb3_read_err(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); } else - trace_smb3_read_done(xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, 0); + trace_smb3_read_done(xid, + le64_to_cpu(req->PersistentFileId), + io_parms->tcon->tid, ses->Suid, + io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); cifs_small_buf_release(req); return rc == -ENODATA ? 
0 : rc; } else - trace_smb3_read_done(xid, req->PersistentFileId, + trace_smb3_read_done(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); @@ -4238,7 +4242,7 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; wdata->result = smb2_check_receive(mid, server, 0); if (wdata->result != 0) @@ -4264,7 +4268,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; fallthrough; default: @@ -4311,7 +4315,7 @@ smb2_async_writev(struct cifs_writedata *wdata, { int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = wdata->server; struct kvec iov[1]; @@ -4329,11 +4333,11 @@ smb2_async_writev(struct cifs_writedata *wdata, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - shdr = (struct smb2_sync_hdr *)req; - shdr->ProcessId = cpu_to_le32(wdata->cfile->pid); + shdr = (struct smb2_hdr *)req; + shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); - req->PersistentFileId = wdata->cfile->fid.persistent_fid; - req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid); + req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid); req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4430,7 +4434,8 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata, flags, &wdata->credits); if (rc) { - trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, + trace_smb3_write_err(0 /* no xid */, + le64_to_cpu(req->PersistentFileId), tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, rc); kref_put(&wdata->refcount, release); @@ -4481,10 +4486,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = io_parms->persistent_fid; - req->VolatileFileId = io_parms->volatile_fid; + req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); + req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4512,7 +4517,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { - trace_smb3_write_err(xid, req->PersistentFileId, + trace_smb3_write_err(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, io_parms->length, rc); @@ -4520,10 +4526,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); - trace_smb3_write_done(xid, req->PersistentFileId, - io_parms->tcon->tid, - io_parms->tcon->ses->Suid, - io_parms->offset, *nbytes); + trace_smb3_write_done(xid, + le64_to_cpu(req->PersistentFileId), + 
io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, *nbytes); } cifs_small_buf_release(req); @@ -4866,7 +4873,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -ENODATA && - rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + rsp->hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, 0); srch_inf->endOfSearch = true; @@ -4914,7 +4921,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->sync_hdr.ProcessId = cpu_to_le32(pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid); req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; @@ -5074,7 +5081,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFid = volatile_fid; req->PersistentFid = persistent_fid; req->OplockLevel = oplock_level; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); flags |= CIFS_NO_RSP_BUF; @@ -5376,7 +5383,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.ProcessId = cpu_to_le32(pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid); req->LockCount = cpu_to_le16(num_lock); req->PersistentFileId = persist_fid; @@ -5452,7 +5459,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); req->StructureSize = cpu_to_le16(36); total_len += 12; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f32c99c9ba13..33cfd0a1adf1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -14,156 +14,12 @@ #include #include "cifsacl.h" -/* - * Note that, due to trying to use names similar to the protocol specifications, - * there are many mixed case field names in the structures below. Although - * this does not match typical Linux kernel style, it is necessary to be - * able to match against the protocol specfication. - * - * SMB2 commands - * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses - * (ie no useful data other than the SMB error code itself) and are marked such. - * Knowing this helps avoid response buffer allocations and copy in some cases. 
- */ - -/* List of commands in host endian */ -#define SMB2_NEGOTIATE_HE 0x0000 -#define SMB2_SESSION_SETUP_HE 0x0001 -#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ -#define SMB2_TREE_CONNECT_HE 0x0003 -#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ -#define SMB2_CREATE_HE 0x0005 -#define SMB2_CLOSE_HE 0x0006 -#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ -#define SMB2_READ_HE 0x0008 -#define SMB2_WRITE_HE 0x0009 -#define SMB2_LOCK_HE 0x000A -#define SMB2_IOCTL_HE 0x000B -#define SMB2_CANCEL_HE 0x000C -#define SMB2_ECHO_HE 0x000D -#define SMB2_QUERY_DIRECTORY_HE 0x000E -#define SMB2_CHANGE_NOTIFY_HE 0x000F -#define SMB2_QUERY_INFO_HE 0x0010 -#define SMB2_SET_INFO_HE 0x0011 -#define SMB2_OPLOCK_BREAK_HE 0x0012 - -/* The same list in little endian */ -#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) -#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) -#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) -#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) -#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) -#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) -#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) -#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) -#define SMB2_READ cpu_to_le16(SMB2_READ_HE) -#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) -#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) -#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) -#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) -#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) -#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) -#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) -#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) -#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) -#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) - -#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) - -#define NUMBER_OF_SMB2_COMMANDS 0x0013 - /* 52 transform hdr + 64 hdr + 88 create rsp */ #define SMB2_TRANSFORM_HEADER_SIZE 52 #define MAX_SMB2_HDR_SIZE 204 -#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) -#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) -#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) - -/* - * SMB2 Header Definition - * - * "MBZ" : Must be Zero - * "BB" : BugBug, Something to check/review/analyze later - * "PDU" : "Protocol Data Unit" (ie a network "frame") - * - */ - -#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) - -struct smb2_sync_hdr { - __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ - __le16 StructureSize; /* 64 */ - __le16 CreditCharge; /* MBZ */ - __le32 Status; /* Error from server */ - __le16 Command; - __le16 CreditRequest; /* CreditResponse */ - __le32 Flags; - __le32 NextCommand; - __le64 MessageId; - __le32 ProcessId; - __u32 TreeId; /* opaque - so do not make little endian */ - __u64 SessionId; /* opaque - so do not make little endian */ - __u8 Signature[16]; -} __packed; - /* The total header size for SMB2 read and write */ -#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr)) - -struct smb2_sync_pdu { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize2; /* size of wct area (varies, request specific) */ -} __packed; - -#define SMB3_AES_CCM_NONCE 11 -#define SMB3_AES_GCM_NONCE 12 - -/* Transform flags (for 3.0 dialect this flag indicates CCM */ -#define TRANSFORM_FLAG_ENCRYPTED 0x0001 -struct smb2_transform_hdr { - __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ - __u8 Signature[16]; - __u8 Nonce[16]; - __le32 OriginalMessageSize; - __u16 Reserved1; - __le16 Flags; /* EncryptionAlgorithm for 
3.0, enc enabled for 3.1.1 */ - __u64 SessionId; -} __packed; - -/* See MS-SMB2 2.2.42 */ -struct smb2_compression_transform_hdr_unchained { - __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ - __le32 OriginalCompressedSegmentSize; - __le16 CompressionAlgorithm; - __le16 Flags; - __le16 Length; /* if chained it is length, else offset */ -} __packed; - -/* See MS-SMB2 2.2.42.1 */ -#define SMB2_COMPRESSION_FLAG_NONE 0x0000 -#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 - -struct compression_payload_header { - __le16 CompressionAlgorithm; - __le16 Flags; - __le32 Length; /* length of compressed playload including field below if present */ - /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ -} __packed; - -/* See MS-SMB2 2.2.42.2 */ -struct smb2_compression_transform_hdr_chained { - __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ - __le32 OriginalCompressedSegmentSize; - /* struct compression_payload_header[] */ -} __packed; - -/* See MS-SMB2 2.2.42.2.2 */ -struct compression_pattern_payload_v1 { - __le16 Pattern; - __le16 Reserved1; - __le16 Reserved2; - __le32 Repetitions; -} __packed; +#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr)) /* See MS-SMB2 2.2.43 */ struct smb2_rdma_transform { @@ -189,17 +45,6 @@ struct smb2_rdma_crypto_transform { /* followed by padding */ } __packed; -/* - * SMB2 flag definitions - */ -#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) -#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) -#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) -#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) -#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ -#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) -#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ - /* * Definitions for SMB2 Protocol Data Units (network frames) * @@ -214,7 +59,7 @@ struct smb2_rdma_crypto_transform { #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; __le16 Reserved; /* MBZ */ __le32 ByteCount; /* even if zero, at least one byte follows */ @@ -270,530 +115,6 @@ struct share_redirect_error_context_rsp { /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */ } __packed; -#define SMB2_CLIENT_GUID_SIZE 16 - -struct smb2_negotiate_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 DialectCount; - __le16 SecurityMode; - __le16 Reserved; /* MBZ */ - __le32 Capabilities; - __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ - __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ - __le16 NegotiateContextCount; /* SMB3.1.1 only. 
MBZ earlier */ - __le16 Reserved2; - __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ -} __packed; - -/* Dialects */ -#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ -#define SMB20_PROT_ID 0x0202 -#define SMB21_PROT_ID 0x0210 -#define SMB30_PROT_ID 0x0300 -#define SMB302_PROT_ID 0x0302 -#define SMB311_PROT_ID 0x0311 -#define BAD_PROT_ID 0xFFFF - -/* SecurityMode flags */ -#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 -#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 -#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 - -/* Capabilities flags */ -#define SMB2_GLOBAL_CAP_DFS 0x00000001 -#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ -/* Internal types */ -#define SMB2_NT_FIND 0x00100000 -#define SMB2_LARGE_FILES 0x00200000 - - -/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */ -#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) -#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) -#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) -#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) -#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) -#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) -#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) -#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) - -struct smb2_neg_context { - __le16 ContextType; - __le16 DataLength; - __le32 Reserved; - /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */ -} __packed; - -#define SMB311_LINUX_CLIENT_SALT_SIZE 32 -/* Hash Algorithm Types */ -#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) -#define SMB2_PREAUTH_HASH_SIZE 64 - -/* - * SaltLength that the server send can be zero, so the only three required - * fields (all __le16) end up six bytes total, so the minimum context data len - * in the response is six bytes which accounts for - * - * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm. 
- */ -#define MIN_PREAUTH_CTXT_DATA_LEN 6 - -struct smb2_preauth_neg_context { - __le16 ContextType; /* 1 */ - __le16 DataLength; - __le32 Reserved; - __le16 HashAlgorithmCount; /* 1 */ - __le16 SaltLength; - __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ - __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE]; -} __packed; - -/* Encryption Algorithms Ciphers */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) -#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) -/* we currently do not request AES256_CCM since presumably GCM faster */ -#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) -#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) - -/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ -#define MIN_ENCRYPT_CTXT_DATA_LEN 4 -struct smb2_encryption_neg_context { - __le16 ContextType; /* 2 */ - __le16 DataLength; - __le32 Reserved; - /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ - __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ - __le16 Ciphers[3]; -} __packed; - -/* See MS-SMB2 2.2.3.1.3 */ -#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000) -#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) -#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) -#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) -/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ -#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ - -/* Compression Flags */ -#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) -#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001) - -struct smb2_compression_capabilities_context { - __le16 ContextType; /* 3 */ - __le16 DataLength; - __u32 Reserved; - __le16 CompressionAlgorithmCount; - __u16 Padding; - __u32 Flags; - __le16 CompressionAlgorithms[3]; - __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */ - /* Check if pad needed */ -} __packed; - -/* - * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4. 
- * Its struct simply contains NetName, an array of Unicode characters - */ -struct smb2_netname_neg_context { - __le16 ContextType; /* 5 */ - __le16 DataLength; - __le32 Reserved; - __le16 NetName[]; /* hostname of target converted to UCS-2 */ -} __packed; - -/* - * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 - * and 2.2.4.1.5 - */ - -/* Flags */ -#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 - -struct smb2_transport_capabilities_context { - __le16 ContextType; /* 6 */ - __le16 DataLength; - __u32 Reserved; - __le32 Flags; - __u32 Pad; -} __packed; - -/* - * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 - * and 2.2.4.1.6 - */ - -/* RDMA Transform IDs */ -#define SMB2_RDMA_TRANSFORM_NONE 0x0000 -#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 -#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 - -struct smb2_rdma_transform_capabilities_context { - __le16 ContextType; /* 7 */ - __le16 DataLength; - __u32 Reserved; - __le16 TransformCount; - __u16 Reserved1; - __u32 Reserved2; - __le16 RDMATransformIds[]; -} __packed; - -/* - * For signing capabilities context see MS-SMB2 2.2.3.1.7 - * and 2.2.4.1.7 - */ - -/* Signing algorithms */ -#define SIGNING_ALG_HMAC_SHA256 0 -#define SIGNING_ALG_AES_CMAC 1 -#define SIGNING_ALG_AES_GMAC 2 - -struct smb2_signing_capabilities { - __le16 ContextType; /* 8 */ - __le16 DataLength; - __u32 Reserved; - __le16 SigningAlgorithmCount; - __le16 SigningAlgorithms[]; - /* Followed by padding to 8 byte boundary (required by some servers) */ -} __packed; - -#define POSIX_CTXT_DATA_LEN 16 -struct smb2_posix_neg_context { - __le16 ContextType; /* 0x100 */ - __le16 DataLength; - __le32 Reserved; - __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ -} __packed; - -struct smb2_negotiate_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 65 */ - __le16 SecurityMode; - __le16 DialectRevision; - __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ - __u8 ServerGUID[16]; - __le32 Capabilities; - __le32 MaxTransactSize; - __le32 MaxReadSize; - __le32 MaxWriteSize; - __le64 SystemTime; /* MBZ */ - __le64 ServerStartTime; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -/* Flags */ -#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 -#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 - -struct smb2_sess_setup_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 25 */ - __u8 Flags; - __u8 SecurityMode; - __le32 Capabilities; - __le32 Channel; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __u64 PreviousSessionId; - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -/* Currently defined SessionFlags */ -#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 -#define SMB2_SESSION_FLAG_IS_NULL 0x0002 -#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 -struct smb2_sess_setup_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 SessionFlags; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -struct smb2_logoff_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_logoff_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -/* Flags/Reserved for 
SMB3.1.1 */ -#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001) -#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002) -#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004) - -struct smb2_tree_connect_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */ - __le16 PathOffset; - __le16 PathLength; - __u8 Buffer[1]; /* variable length */ -} __packed; - -/* See MS-SMB2 section 2.2.9.2 */ -/* Context Types */ -#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000 -#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001) - -struct tree_connect_contexts { - __le16 ContextType; - __le16 DataLength; - __le32 Reserved; - __u8 Data[]; -} __packed; - -/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */ -struct smb3_blob_data { - __le16 BlobSize; - __u8 BlobData[]; -} __packed; - -/* Valid values for Attr */ -#define SE_GROUP_MANDATORY 0x00000001 -#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002 -#define SE_GROUP_ENABLED 0x00000004 -#define SE_GROUP_OWNER 0x00000008 -#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010 -#define SE_GROUP_INTEGRITY 0x00000020 -#define SE_GROUP_INTEGRITY_ENABLED 0x00000040 -#define SE_GROUP_RESOURCE 0x20000000 -#define SE_GROUP_LOGON_ID 0xC0000000 - -/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ - -struct sid_array_data { - __le16 SidAttrCount; - /* SidAttrList - array of sid_attr_data structs */ -} __packed; - -struct luid_attr_data { - -} __packed; - -/* - * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5 - * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA - */ - -struct privilege_array_data { - __le16 PrivilegeCount; - /* array of privilege_data structs */ -} __packed; - -struct remoted_identity_tcon_context { - __le16 TicketType; /* must be 0x0001 */ - __le16 TicketSize; /* total size of this struct */ - __le16 User; /* offset to SID_ATTR_DATA struct with user info */ - __le16 UserName; /* offset to null terminated Unicode username string */ - __le16 Domain; /* offset to null terminated Unicode domain name */ - __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */ - __le16 RestrictedGroups; /* similar to above */ - __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */ - __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */ - __le16 Owner; /* offset to BLOB_DATA struct */ - __le16 DefaultDacl; /* offset to BLOB_DATA struct */ - __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */ - __le16 UserClaims; /* offset to BLOB_DATA struct */ - __le16 DeviceClaims; /* offset to BLOB_DATA struct */ - __u8 TicketInfo[]; /* variable length buf - remoted identity data */ -} __packed; - -struct smb2_tree_connect_req_extension { - __le32 TreeConnectContextOffset; - __le16 TreeConnectContextCount; - __u8 Reserved[10]; - __u8 PathName[]; /* variable sized array */ - /* followed by array of TreeConnectContexts */ -} __packed; - -struct smb2_tree_connect_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 16 */ - __u8 ShareType; /* see below */ - __u8 Reserved; - __le32 ShareFlags; /* see below */ - __le32 Capabilities; /* see below */ - __le32 MaximalAccess; -} __packed; - -/* Possible ShareType values */ -#define SMB2_SHARE_TYPE_DISK 0x01 -#define SMB2_SHARE_TYPE_PIPE 0x02 -#define SMB2_SHARE_TYPE_PRINT 0x03 - -/* - * Possible ShareFlags - exactly one and only one of the first 4 
caching flags - * must be set (any of the remaining, SHI1005, flags may be set individually - * or in combination. - */ -#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 -#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 -#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 -#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 -#define SHI1005_FLAGS_DFS 0x00000001 -#define SHI1005_FLAGS_DFS_ROOT 0x00000002 -#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 -#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 -#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 -#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 -#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 -#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 -#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 -#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ -#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0014FF33 - -/* Possible share capabilities */ -#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ -#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ -#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ -#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ -#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ -#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */ - -struct smb2_tree_disconnect_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_tree_disconnect_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -/* File Attrubutes */ -#define FILE_ATTRIBUTE_READONLY 0x00000001 -#define FILE_ATTRIBUTE_HIDDEN 0x00000002 -#define FILE_ATTRIBUTE_SYSTEM 0x00000004 -#define FILE_ATTRIBUTE_DIRECTORY 0x00000010 -#define FILE_ATTRIBUTE_ARCHIVE 0x00000020 -#define FILE_ATTRIBUTE_NORMAL 0x00000080 -#define FILE_ATTRIBUTE_TEMPORARY 0x00000100 -#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200 -#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400 -#define FILE_ATTRIBUTE_COMPRESSED 0x00000800 -#define FILE_ATTRIBUTE_OFFLINE 0x00001000 -#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 -#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 -#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 -#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 - -/* Oplock levels */ -#define SMB2_OPLOCK_LEVEL_NONE 0x00 -#define SMB2_OPLOCK_LEVEL_II 0x01 -#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 -#define SMB2_OPLOCK_LEVEL_BATCH 0x09 -#define SMB2_OPLOCK_LEVEL_LEASE 0xFF -/* Non-spec internal type */ -#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 - -/* Desired Access Flags */ -#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) -#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) -#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) -#define FILE_READ_EA_LE cpu_to_le32(0x00000008) -#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) -#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) -#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) -#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) -#define FILE_DELETE_LE cpu_to_le32(0x00010000) -#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) -#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) -#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) -#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) -#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) -#define FILE_MAXIMAL_ACCESS_LE 
cpu_to_le32(0x02000000) -#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) -#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) -#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) -#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) - -/* ShareAccess Flags */ -#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) -#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) -#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) -#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) - -/* CreateDisposition Flags */ -#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) -#define FILE_OPEN_LE cpu_to_le32(0x00000001) -#define FILE_CREATE_LE cpu_to_le32(0x00000002) -#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) -#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) -#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) - -/* CreateOptions Flags */ -#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) -/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ -#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) -#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) -#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008) -#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) -#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020) -#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) -#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) -#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) -#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) -#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) -#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) -#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) -#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) -#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000) -#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) -#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) -#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000) - -#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ - | FILE_READ_ATTRIBUTES_LE) -#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ - | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) -#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) - -/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */ -#define IL_ANONYMOUS cpu_to_le32(0x00000000) -#define IL_IDENTIFICATION cpu_to_le32(0x00000001) -#define IL_IMPERSONATION cpu_to_le32(0x00000002) -#define IL_DELEGATE cpu_to_le32(0x00000003) - -/* Create Context Values */ -#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ -#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ -#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" -#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" -#define SMB2_CREATE_ALLOCATION_SIZE "AISi" -#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" -#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" -#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" -#define SMB2_CREATE_REQUEST_LEASE "RqLs" -#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" -#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 -#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010 -#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 -#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C - -/* Flag (SMB3 open response) values */ -#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 - /* * Maximum number of iovs we need for an open/create request. 
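
A note on the _LE-suffixed constants above: each has cpu_to_le16()/cpu_to_le32() applied at compile time, so callers can OR them directly into wire-format request fields without any per-use byte swapping. A minimal sketch of the idiom, reusing the names defined above (the helper itself is hypothetical, not part of this patch):

	/* hypothetical helper: request read+write access at open time */
	static void set_rw_access(struct smb2_create_req *req)
	{
		/*
		 * DesiredAccess is a wire-order __le32 field. Because the
		 * constants are pre-swapped, a plain OR yields a correct
		 * little-endian value on any host byte order;
		 * FILE_READ_RIGHTS_LE is itself (FILE_READ_DATA_LE |
		 * FILE_READ_EA_LE | FILE_READ_ATTRIBUTES_LE) per the
		 * defines above.
		 */
		req->DesiredAccess = FILE_READ_RIGHTS_LE | FILE_WRITE_RIGHTS_LE;
	}
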
* [0] : struct smb2_create_req @@ -807,26 +128,6 @@ struct smb2_tree_disconnect_rsp { */ #define SMB2_CREATE_IOV_SIZE 8 -struct smb2_create_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 57 */ - __u8 SecurityFlags; - __u8 RequestedOplockLevel; - __le32 ImpersonationLevel; - __le64 SmbCreateFlags; - __le64 Reserved; - __le32 DesiredAccess; - __le32 FileAttributes; - __le32 ShareAccess; - __le32 CreateDisposition; - __le32 CreateOptions; - __le16 NameOffset; - __le16 NameLength; - __le32 CreateContextsOffset; - __le32 CreateContextsLength; - __u8 Buffer[]; -} __packed; - /* * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + * 88 (fixed part of create response) + 520 (path) + 208 (contexts) + @@ -834,37 +135,6 @@ struct smb2_create_req { */ #define MAX_SMB2_CREATE_RESPONSE_SIZE 880 -struct smb2_create_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 89 */ - __u8 OplockLevel; - __u8 Flag; /* 0x01 if reparse point */ - __le32 CreateAction; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; - __le64 EndofFile; - __le32 FileAttributes; - __le32 Reserved2; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 CreateContextsOffset; - __le32 CreateContextsLength; - __u8 Buffer[1]; -} __packed; - -struct create_context { - __le32 Next; - __le16 NameOffset; - __le16 NameLength; - __le16 Reserved; - __le16 DataOffset; - __le32 DataLength; - __u8 Buffer[]; -} __packed; - #define SMB2_LEASE_READ_CACHING_HE 0x01 #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 @@ -1210,7 +480,7 @@ struct duplicate_extents_to_file { #define SMB2_IOCTL_IOV_SIZE 2 struct smb2_ioctl_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -1228,7 +498,7 @@ struct smb2_ioctl_req { } __packed; struct smb2_ioctl_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -1243,161 +513,6 @@ struct smb2_ioctl_rsp { /* char * buffer[] */ } __packed; -/* Currently defined values for close flags */ -#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) -struct smb2_close_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 24 */ - __le16 Flags; - __le32 Reserved; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ -} __packed; - -/* - * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) - */ -#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 - -struct smb2_close_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* 60 */ - __le16 Flags; - __le32 Reserved; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; - __le32 Attributes; -} __packed; - -struct smb2_flush_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 24 */ - __le16 Reserved1; - __le32 Reserved2; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ -} __packed; - -struct smb2_flush_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; - __le16 Reserved; -} __packed; - -/* For read request Flags field below, following flag is defined for SMB3.02 */ -#define SMB2_READFLAG_READ_UNBUFFERED 0x01 -#define 
SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */ - -/* Channel field for read and write: exactly one of following flags can be set*/ -#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) -#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ -#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ -#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */ - -/* SMB2 read request without RFC1001 length at the beginning */ -struct smb2_read_plain_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 49 */ - __u8 Padding; /* offset from start of SMB2 header to place read */ - __u8 Flags; /* MBZ unless SMB3.02 or later */ - __le32 Length; - __le64 Offset; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 MinimumCount; - __le32 Channel; /* MBZ except for SMB3 or later */ - __le32 RemainingBytes; - __le16 ReadChannelInfoOffset; - __le16 ReadChannelInfoLength; - __u8 Buffer[1]; -} __packed; - -/* Read flags */ -#define SMB2_READFLAG_RESPONSE_NONE 0x00000000 -#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 - -struct smb2_read_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 17 */ - __u8 DataOffset; - __u8 Reserved; - __le32 DataLength; - __le32 DataRemaining; - __u32 Flags; - __u8 Buffer[1]; -} __packed; - -/* For write request Flags field below the following flags are defined: */ -#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ -#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ - -struct smb2_write_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 49 */ - __le16 DataOffset; /* offset from start of SMB2 header to write data */ - __le32 Length; - __le64 Offset; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 Channel; /* MBZ unless SMB3.02 or later */ - __le32 RemainingBytes; - __le16 WriteChannelInfoOffset; - __le16 WriteChannelInfoLength; - __le32 Flags; - __u8 Buffer[1]; -} __packed; - -struct smb2_write_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 17 */ - __u8 DataOffset; - __u8 Reserved; - __le32 DataLength; - __le32 DataRemaining; - __u32 Reserved2; - __u8 Buffer[1]; -} __packed; - -/* notify flags */ -#define SMB2_WATCH_TREE 0x0001 - -/* notify completion filter flags. 
See MS-FSCC 2.6 and MS-SMB2 2.2.35 */ -#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 -#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 -#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 -#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 -#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 -#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 -#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 -#define FILE_NOTIFY_CHANGE_EA 0x00000080 -#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 -#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 -#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 -#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 - -struct smb2_change_notify_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; - __le16 Flags; - __le32 OutputBufferLength; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 CompletionFilter; - __u32 Reserved; -} __packed; - -struct smb2_change_notify_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; /* array of file notify structs */ -} __packed; - #define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 #define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 #define SMB2_LOCKFLAG_UNLOCK 0x0004 @@ -1411,7 +526,7 @@ struct smb2_lock_element { } __packed; struct smb2_lock_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; /* @@ -1426,19 +541,19 @@ struct smb2_lock_req { } __packed; struct smb2_lock_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; struct smb2_echo_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; struct smb2_echo_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -1468,7 +583,7 @@ struct smb2_echo_rsp { */ struct smb2_query_directory_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 33 */ __u8 FileInformationClass; __u8 Flags; @@ -1482,7 +597,7 @@ struct smb2_query_directory_req { } __packed; struct smb2_query_directory_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1515,7 +630,7 @@ struct smb2_query_directory_rsp { #define SL_INDEX_SPECIFIED 0x00000004 struct smb2_query_info_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 41 */ __u8 InfoType; __u8 FileInfoClass; @@ -1531,7 +646,7 @@ struct smb2_query_info_req { } __packed; struct smb2_query_info_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1548,7 +663,7 @@ struct smb2_query_info_rsp { #define SMB2_SET_INFO_IOV_SIZE 3 struct smb2_set_info_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 33 */ __u8 InfoType; __u8 FileInfoClass; @@ -1562,12 +677,12 @@ struct smb2_set_info_req { } __packed; struct smb2_set_info_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 2 */ } __packed; struct smb2_oplock_break { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; __u8 Reserved; @@ -1579,7 +694,7 @@ struct smb2_oplock_break 
{ #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) struct smb2_lease_break { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 44 */ __le16 Epoch; __le32 Flags; @@ -1592,7 +707,7 @@ struct smb2_lease_break { } __packed; struct smb2_lease_ack { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 547945443fa7..096fada16ebd 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -25,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server); extern char *smb2_get_data_area_len(int *off, int *len, - struct smb2_sync_hdr *shdr); + struct smb2_hdr *shdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index f59b956f9d25..2bf047b390a9 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -19,7 +19,6 @@ #include #include #include -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -213,14 +212,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; + struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; struct shash_desc *shash; struct crypto_shash *hash; struct sdesc *sdesc = NULL; struct smb_rqst drqst; - ses = smb2_find_smb_ses(server, shdr->SessionId); + ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); if (!ses) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); return 0; @@ -534,14 +533,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; + struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct shash_desc *shash; struct crypto_shash *hash; struct sdesc *sdesc = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; - rc = smb2_get_sign_key(shdr->SessionId, server, key); + rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); if (rc) return 0; @@ -611,12 +610,12 @@ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct smb2_sess_setup_req *ssr; bool is_binding; bool is_signed; - shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; ssr = (struct smb2_sess_setup_req *)shdr; is_binding = shdr->Command == SMB2_SESSION_SETUP && @@ -642,8 +641,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { unsigned int rc; char server_response_sig[SMB2_SIGNATURE_SIZE]; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; if ((shdr->Command == SMB2_NEGOTIATE) || (shdr->Command == SMB2_SESSION_SETUP) || @@ -689,7 +688,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) */ static inline void 
smb2_seq_num_into_buf(struct TCP_Server_Info *server, - struct smb2_sync_hdr *shdr) + struct smb2_hdr *shdr) { unsigned int i, num = le16_to_cpu(shdr->CreditCharge); @@ -700,7 +699,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server, } static struct mid_q_entry * -smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, +smb2_mid_entry_alloc(const struct smb2_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; @@ -732,14 +731,15 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; - trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), temp->mid); + trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), temp->mid); return temp; } static int smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, - struct smb2_sync_hdr *shdr, struct mid_q_entry **mid) + struct smb2_hdr *shdr, struct mid_q_entry **mid) { if (server->tcpStatus == CifsExiting) return -ENOENT; @@ -807,8 +807,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, shdr); @@ -833,8 +833,8 @@ struct mid_q_entry * smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; if (server->tcpStatus == CifsNeedNegotiate && diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index dafcb6ab050d..6cecf302dcfd 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -11,6 +11,8 @@ #define _CIFS_TRACE_H #include +#include +#include /* * Please use this 3-part article as a reference for writing new tracepoints: @@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); +DECLARE_EVENT_CLASS(smb3_connect_class, + TP_PROTO(char *hostname, + __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr), + TP_ARGS(hostname, conn_id, dst_addr), + TP_STRUCT__entry( + __string(hostname, hostname) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname, hostname); + ), + TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_CONNECT_EVENT(name) \ +DEFINE_EVENT(smb3_connect_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr), \ + TP_ARGS(hostname, conn_id, addr)) + +DEFINE_SMB3_CONNECT_EVENT(connect_done); + +DECLARE_EVENT_CLASS(smb3_connect_err_class, + TP_PROTO(char *hostname, __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr, int rc), + TP_ARGS(hostname, conn_id, dst_addr, rc), + TP_STRUCT__entry( + __string(hostname, hostname) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + __field(int, rc) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + __entry->rc = rc; + pss = 
(struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname, hostname); + ), + TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->rc, + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_CONNECT_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr, \ + int rc), \ + TP_ARGS(hostname, conn_id, addr, rc)) + +DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, __u64 conn_id, diff --git a/fs/d_path.c b/fs/d_path.c index cd60c7535181..e4e0ebad1f15 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen) /** * prepend_name - prepend a pathname in front of current buffer pointer - * @buffer: buffer pointer - * @buflen: allocated length of the buffer - * @name: name string and length qstr structure + * @p: prepend buffer which contains buffer pointer and allocated length + * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are @@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry - * @buffer: pointer to the end of the buffer - * @buflen: pointer to buffer length + * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index de9bb1405099..bd1e167436f4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -759,6 +760,8 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } + fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); + ext4_handle_error(sb, force_ro, error, 0, block, function, line); } @@ -789,6 +792,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } + fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); + ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } @@ -827,6 +832,8 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } + fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); + ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } @@ -894,6 +901,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } + fsnotify_sb_error(sb, NULL, errno ? 
errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 678e2c51b855..0c6eacfcbeef 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; + if (de_len < sizeof(struct iso_directory_record)) + goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index be3c1aad50ea..fdf89fcf1a0c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -602,6 +602,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { + if (WARN_ON_ONCE(!inode)) + return 0; + trace_nfsd_file_fsnotify_handle_event(inode, mask); /* Should be no marks on non-regular files */ diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 057abd2cf887..b6091775aa6e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -111,6 +111,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, return fanotify_info_equal(info1, info2); } +static bool fanotify_error_event_equal(struct fanotify_error_event *fee1, + struct fanotify_error_event *fee2) +{ + /* Error events against the same file system are always merged. */ + if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid)) + return false; + + return true; +} + static bool fanotify_should_merge(struct fanotify_event *old, struct fanotify_event *new) { @@ -141,6 +151,9 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FID_NAME: return fanotify_name_event_equal(FANOTIFY_NE(old), FANOTIFY_NE(new)); + case FANOTIFY_EVENT_TYPE_FS_ERROR: + return fanotify_error_event_equal(FANOTIFY_EE(old), + FANOTIFY_EE(new)); default: WARN_ON_ONCE(1); } @@ -176,6 +189,10 @@ static int fanotify_merge(struct fsnotify_group *group, break; if (fanotify_should_merge(old, new)) { old->mask |= new->mask; + + if (fanotify_is_error_event(old->mask)) + FANOTIFY_EE(old)->err_count++; + return 1; } } @@ -343,13 +360,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, static int fanotify_encode_fh_len(struct inode *inode) { int dwords = 0; + int fh_len; if (!inode) return 0; exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); + fh_len = dwords << 2; - return dwords << 2; + /* + * struct fanotify_error_event might be preallocated and is + * limited to MAX_HANDLE_SZ. This should never happen, but + * safeguard by forcing an invalid file handle. + */ + if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ)) + return 0; + + return fh_len; } /* @@ -370,8 +397,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = FILEID_ROOT; fh->len = 0; fh->flags = 0; + + /* + * Invalid FHs are used by FAN_FS_ERROR for errors not + * linked to any inode. The f_handle won't be reported + * back to userspace. + */ if (!inode) - return 0; + goto out; /* * !gpf means preallocated variable size fh, but fh_len could @@ -403,8 +436,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = type; fh->len = fh_len; - /* Mix fh into event merge key */ - *hash ^= fanotify_hash_fh(fh); +out: + /* + * Mix fh into event merge key. Hash might be NULL in case of + * unhashed FID events (i.e. FAN_FS_ERROR). 
+ */ + if (hash) + *hash ^= fanotify_hash_fh(fh); return FANOTIFY_FH_HDR_LEN + fh_len; @@ -452,7 +490,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) return dir; - if (S_ISDIR(inode->i_mode)) + if (inode && S_ISDIR(inode->i_mode)) return inode; return dir; @@ -563,6 +601,44 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, return &fne->fae; } +static struct fanotify_event *fanotify_alloc_error_event( + struct fsnotify_group *group, + __kernel_fsid_t *fsid, + const void *data, int data_type, + unsigned int *hash) +{ + struct fs_error_report *report = + fsnotify_data_error_report(data, data_type); + struct inode *inode; + struct fanotify_error_event *fee; + int fh_len; + + if (WARN_ON_ONCE(!report)) + return NULL; + + fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS); + if (!fee) + return NULL; + + fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR; + fee->error = report->error; + fee->err_count = 1; + fee->fsid = *fsid; + + inode = report->inode; + fh_len = fanotify_encode_fh_len(inode); + + /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */ + if (!fh_len && inode) + inode = NULL; + + fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0); + + *hash ^= fanotify_hash_fsid(fsid); + + return &fee->fae; +} + static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, @@ -630,6 +706,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); + } else if (fanotify_is_error_event(mask)) { + event = fanotify_alloc_error_event(group, fsid, data, + data_type, &hash); } else if (name_event && (file_name || child)) { event = fanotify_alloc_name_event(id, fsid, file_name, child, &hash, gfp); @@ -702,6 +781,9 @@ static void fanotify_insert_event(struct fsnotify_group *group, assert_spin_locked(&group->notification_lock); + if (!fanotify_is_hashed_event(event->mask)) + return; + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, group, event, bucket); @@ -738,8 +820,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); + BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type, dir); @@ -778,9 +861,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } fsn_event = &event->fse; - ret = fsnotify_add_event(group, fsn_event, fanotify_merge, - fanotify_is_hashed_event(mask) ? 
- fanotify_insert_event : NULL); + ret = fsnotify_insert_event(group, fsn_event, fanotify_merge, + fanotify_insert_event); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); @@ -805,6 +887,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) if (group->fanotify_data.ucounts) dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_GROUPS); + + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + mempool_exit(&group->fanotify_data.error_events_pool); } static void fanotify_free_path_event(struct fanotify_event *event) @@ -833,7 +918,16 @@ static void fanotify_free_name_event(struct fanotify_event *event) kfree(FANOTIFY_NE(event)); } -static void fanotify_free_event(struct fsnotify_event *fsn_event) +static void fanotify_free_error_event(struct fsnotify_group *group, + struct fanotify_event *event) +{ + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + mempool_free(fee, &group->fanotify_data.error_events_pool); +} + +static void fanotify_free_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) { struct fanotify_event *event; @@ -855,6 +949,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) case FANOTIFY_EVENT_TYPE_OVERFLOW: kfree(event); break; + case FANOTIFY_EVENT_TYPE_FS_ERROR: + fanotify_free_error_event(group, event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4a5e555dc3d2..d25f500bf7e7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -141,6 +141,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH, FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ + FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ __FANOTIFY_EVENT_TYPE_NUM }; @@ -170,12 +171,18 @@ static inline void fanotify_init_event(struct fanotify_event *event, event->pid = NULL; } +#define FANOTIFY_INLINE_FH(name, size) \ +struct { \ + struct fanotify_fh (name); \ + /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \ + unsigned char _inline_fh_buf[(size)]; \ +} + struct fanotify_fid_event { struct fanotify_event fae; __kernel_fsid_t fsid; - struct fanotify_fh object_fh; - /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */ - unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN]; + + FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN); }; static inline struct fanotify_fid_event * @@ -196,12 +203,30 @@ FANOTIFY_NE(struct fanotify_event *event) return container_of(event, struct fanotify_name_event, fae); } +struct fanotify_error_event { + struct fanotify_event fae; + s32 error; /* Error reported by the Filesystem. */ + u32 err_count; /* Suppressed errors count */ + + __kernel_fsid_t fsid; /* FSID this error refers to. 
*/ + + FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ); +}; + +static inline struct fanotify_error_event * +FANOTIFY_EE(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_error_event, fae); +} + static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_FID) return &FANOTIFY_FE(event)->fsid; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return &FANOTIFY_NE(event)->fsid; + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return &FANOTIFY_EE(event)->fsid; else return NULL; } @@ -213,6 +238,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh( return &FANOTIFY_FE(event)->object_fh; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return fanotify_info_file_fh(&FANOTIFY_NE(event)->info); + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return &FANOTIFY_EE(event)->object_fh; else return NULL; } @@ -244,6 +271,19 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) return info ? fanotify_info_dir_fh_len(info) : 0; } +static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) +{ + /* For error events, even zeroed fh are reported. */ + if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return true; + return fanotify_event_object_fh_len(event) > 0; +} + +static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event) +{ + return fanotify_event_dir_fh_len(event) > 0; +} + struct fanotify_path_event { struct fanotify_event fae; struct path path; @@ -287,6 +327,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event, fse); } +static inline bool fanotify_is_error_event(u32 mask) +{ + return mask & FAN_FS_ERROR; +} + static inline bool fanotify_event_has_path(struct fanotify_event *event) { return event->type == FANOTIFY_EVENT_TYPE_PATH || @@ -315,7 +360,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) */ static inline bool fanotify_is_hashed_event(u32 mask) { - return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW); + return !(fanotify_is_perm_event(mask) || + fsnotify_is_overflow_event(mask)); } static inline unsigned int fanotify_event_hash_bucket( diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6facdf476255..559bc1e9926d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -30,6 +30,7 @@ #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_GROUPS 128 +#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32 /* * Legacy fanotify marks limits (8192) is per group and we introduced a tunable @@ -114,6 +115,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) #define FANOTIFY_PIDFD_INFO_HDR_LEN \ sizeof(struct fanotify_event_info_pidfd) +#define FANOTIFY_ERROR_INFO_LEN \ + (sizeof(struct fanotify_event_info_error)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -126,17 +129,26 @@ static int fanotify_fid_info_len(int fh_len, int name_len) FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int info_mode, - struct fanotify_event *event) +static size_t fanotify_event_len(unsigned int info_mode, + struct fanotify_event *event) { - struct fanotify_info *info = fanotify_event_info(event); - int dir_fh_len = fanotify_event_dir_fh_len(event); - int fh_len = 
fanotify_event_object_fh_len(event); - int info_len = 0; + size_t event_len = FAN_EVENT_METADATA_LEN; + struct fanotify_info *info; + int dir_fh_len; + int fh_len; int dot_len = 0; - if (dir_fh_len) { - info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + if (!info_mode) + return event_len; + + if (fanotify_is_error_event(event->mask)) + event_len += FANOTIFY_ERROR_INFO_LEN; + + info = fanotify_event_info(event); + + if (fanotify_event_has_dir_fh(event)) { + dir_fh_len = fanotify_event_dir_fh_len(event); + event_len += fanotify_fid_info_len(dir_fh_len, info->name_len); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* @@ -147,12 +159,14 @@ static int fanotify_event_info_len(unsigned int info_mode, } if (info_mode & FAN_REPORT_PIDFD) - info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; - if (fh_len) - info_len += fanotify_fid_info_len(fh_len, dot_len); + if (fanotify_event_has_object_fh(event)) { + fh_len = fanotify_event_object_fh_len(event); + event_len += fanotify_fid_info_len(fh_len, dot_len); + } - return info_len; + return event_len; } /* @@ -181,7 +195,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group, static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - size_t event_size = FAN_EVENT_METADATA_LEN; + size_t event_size; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); @@ -194,8 +208,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (info_mode) - event_size += fanotify_event_info_len(info_mode, event); + event_size = fanotify_event_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -316,6 +329,28 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_error_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_error info; + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; + info.hdr.pad = 0; + info.hdr.len = FANOTIFY_ERROR_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.error = fee->error; + info.error_count = fee->err_count; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, int info_type, const char *name, size_t name_len, @@ -331,9 +366,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); - if (!fh_len) - return 0; - if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; @@ -368,6 +400,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, handle.handle_type = fh->type; handle.handle_bytes = fh_len; + + /* Mangle handle_type for bad file_handle */ + if (!fh_len) + handle.handle_type = FILEID_INVALID; + if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; @@ -444,7 +481,7 @@ static int copy_info_records_to_user(struct fanotify_event *event, /* * Event info records order is as follows: dir fid + name, child fid. */ - if (fanotify_event_dir_fh_len(event)) { + if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? 
FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; ret = copy_fid_info_to_user(fanotify_event_fsid(event), @@ -460,7 +497,7 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } - if (fanotify_event_object_fh_len(event)) { + if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; @@ -520,6 +557,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_error_event(event->mask)) { + ret = copy_error_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -537,8 +583,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(info_mode, event); + metadata.event_len = fanotify_event_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -1049,6 +1094,15 @@ out_dec_ucounts: return ERR_PTR(ret); } +static int fanotify_group_init_error_pool(struct fsnotify_group *group) +{ + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + return 0; + + return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, + FANOTIFY_DEFAULT_FEE_POOL_SIZE, + sizeof(struct fanotify_error_event)); +} static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int type, @@ -1057,6 +1111,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); @@ -1067,13 +1122,26 @@ static int fanotify_add_mark(struct fsnotify_group *group, return PTR_ERR(fsn_mark); } } + + /* + * Error events are pre-allocated per group, only if strictly + * needed (i.e. FAN_FS_ERROR was requested). + */ + if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + ret = fanotify_group_init_error_pool(group); + if (ret) + goto out; + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); if (added & ~fsnotify_conn_mask(fsn_mark->connector)) fsnotify_recalc_mask(fsn_mark->connector); + +out: mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); - return 0; + return ret; } static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, @@ -1295,16 +1363,15 @@ out_destroy_group: return fd; } -/* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) { __kernel_fsid_t root_fsid; int err; /* - * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). + * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(path->dentry, fsid); + err = vfs_get_fsid(dentry, fsid); if (err) return err; @@ -1312,10 +1379,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) return -ENODEV; /* - * Make sure path is not inside a filesystem subvolume (e.g. btrfs) + * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root. 
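Tying the pieces above together (the FAN_FS_ERROR mask bit, the per-group mempool, and the fid-mode plus filesystem-mark requirements enforced in do_fanotify_mark() further below), a minimal user-space consumer would look roughly like the sketch here; the mount point /mnt is a placeholder and all error handling is elided:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/fanotify.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t len;
		/* FAN_FS_ERROR events carry info records, so fid reporting is required */
		int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);

		/* FAN_FS_ERROR is only accepted on a filesystem-wide mark */
		fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
			      FAN_FS_ERROR, AT_FDCWD, "/mnt");

		while ((len = read(fd, buf, sizeof(buf))) > 0) {
			struct fanotify_event_metadata *md = (void *)buf;

			for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
				/*
				 * Info records (error record, fid record) follow
				 * the metadata; each one starts with a
				 * struct fanotify_event_info_header, and the next
				 * record sits hdr->len bytes further on.
				 */
				printf("event mask 0x%llx\n",
				       (unsigned long long)md->mask);
			}
		}
		return 0;
	}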
*/ - err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); + err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err; @@ -1323,6 +1390,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) root_fsid.val[1] != fsid->val[1]) return -EXDEV; + return 0; +} + +/* Check if filesystem can encode a unique fid */ +static int fanotify_test_fid(struct dentry *dentry) +{ /* * We need to make sure that the file system supports at least * encoding a file handle so user can use name_to_handle_at() to @@ -1330,8 +1403,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) * objects. However, name_to_handle_at() requires that the * filesystem also supports decoding file handles. */ - if (!path->dentry->d_sb->s_export_op || - !path->dentry->d_sb->s_export_op->fh_to_dentry) + if (!dentry->d_sb->s_export_op || + !dentry->d_sb->s_export_op->fh_to_dentry) return -EOPNOTSUPP; return 0; @@ -1447,15 +1520,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + if (mask & FAN_FS_ERROR && + mark_type != FAN_MARK_FILESYSTEM) + goto fput_and_out; + /* - * Events with data type inode do not carry enough information to report - * event->fd, so we do not allow setting a mask for inode events unless - * group supports reporting fid. - * inode events are not supported on a mount mark, because they do not - * carry enough information (i.e. path) to be filtered by mount point. + * Events that do not carry enough information to report + * event->fd require a group that supports reporting fid. Those + * events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount + * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & FANOTIFY_INODE_EVENTS && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) goto fput_and_out; @@ -1482,7 +1559,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fid(&path, &__fsid); + ret = fanotify_test_fsid(path.dentry, &__fsid); + if (ret) + goto path_put_and_out; + + ret = fanotify_test_fid(path.dentry); if (ret) goto path_put_and_out; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 963e6ce75b96..4034ca566f95 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -252,6 +252,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group, if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; + if (WARN_ON_ONCE(!inode && !dir)) + return 0; + if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; @@ -455,16 +458,16 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - - * either @dir or @inode must be non-NULL. - * if both are non-NULL event may be reported to both. + * If @dir and @inode are both non-NULL, event may be + * reported to both. 
* @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); + struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_iter_info iter_info = {}; - struct super_block *sb; struct mount *mnt = NULL; struct inode *parent = NULL; int ret = 0; @@ -483,7 +486,6 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, */ parent = dir; } - sb = inode->i_sb; /* * Optimization: srcu_read_lock() has a memory barrier which can diff --git a/fs/notify/group.c b/fs/notify/group.c index fb89c351295d..6a297efc4788 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) * that deliberately ignores overflow events. */ if (group->overflow_event) - group->ops->free_event(group->overflow_event); + group->ops->free_event(group, group->overflow_event); fsnotify_put_group(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d1a64daa0171..d92d7b0adc9a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, if (len) strcpy(event->name, name->name); - ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); @@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) dec_inotify_instances(group->inotify_data.ucounts); } -static void inotify_free_event(struct fsnotify_event *fsn_event) +static void inotify_free_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) { kfree(INOTIFY_E(fsn_event)); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 62051247f6d2..29fca3284bb5 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -94,10 +94,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) __u32 mask; /* - * Everything should accept their own ignored and should receive events - * when the inode is unmounted. All directories care about children. + * Everything should receive events when the inode is unmounted. + * All directories care about children. */ - mask = (FS_IN_IGNORED | FS_UNMOUNT); + mask = (FS_UNMOUNT); if (S_ISDIR(inode->i_mode)) mask |= FS_EVENT_ON_CHILD; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 32f45543b9c6..9022ae650cf8 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group, WARN_ON(!list_empty(&event->list)); spin_unlock(&group->notification_lock); } - group->ops->free_event(event); + group->ops->free_event(group, event); } /* @@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * 2 if the event was not queued - either the queue of events has overflown * or the group is shutting down. 
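Since the rename just below adds a fourth "insert" callback to the queueing primitive, while inotify above now calls fsnotify_add_event() with only three arguments, the old name presumably survives as a thin wrapper along these lines (a sketch of the implied header helper, which is not shown in this hunk):

	static inline int fsnotify_add_event(struct fsnotify_group *group,
					     struct fsnotify_event *event,
					     int (*merge)(struct fsnotify_group *,
							  struct fsnotify_event *))
	{
		return fsnotify_insert_event(group, event, merge, NULL);
	}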
*/ -int fsnotify_add_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct fsnotify_group *, - struct fsnotify_event *), - void (*insert)(struct fsnotify_group *, - struct fsnotify_event *)) +int fsnotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5d9ae17bd443..bb247bc349e4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, data_alloc_bh, start_blk, num_clusters); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, } /* - * Zero the area past i_size but still within an allocated - * cluster. This avoids exposing nonzero data on subsequent file - * extends. + * Zero partial cluster for a hole punch or truncate. This avoids exposing + * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because * otherwise block_write_full_page() will skip writeout of pages past - * i_size. The new_i_size parameter is passed for this reason. + * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) @@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) return 0; + /* + * Avoid zeroing pages fully beyond current i_size. It is pointless as + * underlying blocks of those pages should be already zeroed out and + * page writeback will skip them anyway. + */ + range_end = min_t(u64, range_end, i_size_read(inode)); + if (range_start >= range_end) + return 0; + pages = kcalloc(ocfs2_pages_per_cluster(sb), sizeof(struct page *), GFP_NOFS); if (pages == NULL) { @@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, goto out; } - if (range_start == range_end) - goto out; - ret = ocfs2_extent_map_get_blocks(inode, range_start >> sb->s_blocksize_bits, &phys, NULL, &ext_flags); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0e7aad1b11cc..5cd5f7511dac 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) continue; } retry: - ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 54d7843c0211..fc5f780fa235 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode, * greater than page size, so we have to truncate them * anyway. 
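For a concrete instance of the clamp added just below: with 1 MB clusters, truncating a 100 KB file to 64 KB now zeroes only [64 KB, 100 KB) rather than the whole cluster tail [64 KB, 1 MB), since i_size still holds the old size at this point and blocks beyond it are already zero on disk (the numbers are illustrative).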
*/ - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + unmap_mapping_range(inode->i_mapping, + new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); status = ocfs2_truncate_inline(inode, di_bh, new_i_size, i_size_read(inode), 1); if (status) @@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bc8f32fab964..6c2411c2afcf 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; - journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (journal) { + if (osb->journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4f15750aac5d..b9c339335a53 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; struct inode *inode = NULL; /* the journal inode */ journal_t *j_journal = NULL; + struct ocfs2_journal *journal = NULL; struct ocfs2_dinode *di = NULL; struct buffer_head *bh = NULL; - struct ocfs2_super *osb; int inode_lock = 0; - BUG_ON(!journal); + /* initialize our journal structure */ + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto done; + } + osb->journal = journal; + journal->j_osb = osb; - osb = journal->j_osb; + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = 1UL; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, @@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) journal->j_state = OCFS2_JOURNAL_FREE; -// up_write(&journal->j_trans_barrier); done: iput(inode); + kfree(journal); + osb->journal = NULL; } static void ocfs2_clear_journal_error(struct super_block *sb, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d158acb8b38a..8dcb2f2cadbc 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the 
commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); -int ocfs2_journal_init(struct ocfs2_journal *journal, - int *dirty); +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5c914ce9b3ac..1286b88b6fa1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - ocfs2_journal_shutdown(osb); - ocfs2_sync_blockdev(sb); ocfs2_purge_refcount_trees(osb); @@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_release_system_inodes(osb); + ocfs2_journal_shutdown(osb); + /* * If we're dismounting due to mount error, mount.ocfs2 will clean * up heartbeat. If we're a local mount, there is no heartbeat. @@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb, int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; - struct ocfs2_journal *journal; struct ocfs2_super *osb; u64 total_blocks; @@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); - /* FIXME - * This should be done in ocfs2_journal_init(), but unknown - * ordering issues will cause the filesystem to crash. - * If anyone wants to figure out what part of the code - * refers to osb->journal before ocfs2_journal_init() is run, - * be my guest. - */ - /* initialize our journal structure */ - - journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); - if (!journal) { - mlog(ML_ERROR, "unable to alloc journal\n"); - status = -ENOMEM; - goto bail; - } - osb->journal = journal; - journal->j_osb = osb; - - atomic_set(&journal->j_num_trans, 0); - init_rwsem(&journal->j_trans_barrier); - init_waitqueue_head(&journal->j_checkpointed); - spin_lock_init(&journal->j_lock); - journal->j_trans_id = (unsigned long) 1; - INIT_LIST_HEAD(&journal->j_la_cleanups); - INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); - journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * ourselves. */ /* Init our journal object. */ - status = ocfs2_journal_init(osb->journal, &dirty); + status = ocfs2_journal_init(osb, &dirty); if (status < 0) { mlog(ML_ERROR, "Could not initialize journal!\n"); goto finally; @@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); - /* FIXME - * This belongs in journal shutdown, but because we have to - * allocate osb->journal at the start of ocfs2_initialize_osb(), - * we free it here. - */ - kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); diff --git a/fs/open.c b/fs/open.c index a7f6cab81267..f732fb94600c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f, * of THPs into the page cache will fail. 
 */
	smp_mb();
-	if (filemap_nr_thps(inode->i_mapping))
-		truncate_pagecache(inode, 0);
+	if (filemap_nr_thps(inode->i_mapping)) {
+		struct address_space *mapping = inode->i_mapping;
+
+		filemap_invalidate_lock(inode->i_mapping);
+		/*
+		 * unmap_mapping_range() need only be called once here,
+		 * because private pages (e.g. the data segments of
+		 * dynamic shared libraries) do not need to be unmapped.
+		 */
+		unmap_mapping_range(mapping, 0, 0, 0);
+		truncate_inode_pages(mapping, 0);
+		filemap_invalidate_unlock(inode->i_mapping);
+	}
 	}
 
 	return 0;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f5c25f580dd9..9323a854a60a 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
 	 * to just call ->get_acl to fetch the ACL ourself.  (This is going to
 	 * be an unlikely race.)
 	 */
-	if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
-		/* fall through */ ;
+	cmpxchg(p, ACL_NOT_CACHED, sentinel);
 
 	/*
 	 * Normally, the ACL returned by ->get_acl will be cached.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cf25be3e0321..ad667dbc96f5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,7 +397,6 @@ struct mem_size_stats {
 	u64 pss_shmem;
 	u64 pss_locked;
 	u64 swap_pss;
-	bool check_shmem_swap;
 };
 
 static void smaps_page_accumulate(struct mem_size_stats *mss,
@@ -478,9 +477,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
 			  __always_unused int depth, struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
+	struct vm_area_struct *vma = walk->vma;
 
-	mss->swap += shmem_partial_swap_usage(
-			walk->vma->vm_file->f_mapping, addr, end);
+	mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+					      linear_page_index(vma, addr),
+					      linear_page_index(vma, end));
 
 	return 0;
 }
@@ -488,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
 #define smaps_pte_hole		NULL
 #endif /* CONFIG_SHMEM */
 
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+	if (walk->ops->pte_hole) {
+		/* depth is not used */
+		smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+	}
+#endif
+}
+
 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		struct mm_walk *walk)
 {
@@ -516,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			}
 		} else if (is_pfn_swap_entry(swpent))
 			page = pfn_swap_entry_to_page(swpent);
-	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
-							&& pte_none(*pte))) {
-		page = xa_load(&vma->vm_file->f_mapping->i_pages,
-					  linear_page_index(vma, addr));
-		if (xa_is_value(page))
-			mss->swap += PAGE_SIZE;
+	} else {
+		smaps_pte_hole_lookup(addr, walk);
 		return;
 	}
 
@@ -735,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 		return;
 
 #ifdef CONFIG_SHMEM
-	/* In case of smaps_rollup, reset the value from previous vma */
-	mss->check_shmem_swap = false;
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
 		/*
 		 * For shared or readonly shmem mappings we know that all
@@ -754,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 			    !(vma->vm_flags & VM_WRITE))) {
 			mss->swap += shmem_swapped;
 		} else {
-			mss->check_shmem_swap = true;
 			ops = &smaps_shmem_walk_ops;
 		}
 	}
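shmem_partial_swap_usage() now takes page-cache indices rather than byte addresses, hence the linear_page_index() conversions in smaps_pte_hole() above. For an ordinary (non-hugetlb) VMA that helper boils down to the following (a simplified sketch for illustration, not the full kernel definition):

	static pgoff_t linear_page_index_sketch(struct vm_area_struct *vma,
						unsigned long addr)
	{
		/* page offset of addr within the VMA... */
		pgoff_t pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;

		/* ...plus the file offset at which the VMA is mapped */
		return pgoff + vma->vm_pgoff;
	}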
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index d3e995e1046f..5f2405994280 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		quota_error(dquot->dq_sb, "Quota structure has offset to "
 			"other block (%u) than it should (%u)", blk,
 			(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		ret = -EIO;
 		goto out_buf;
 	}
 	ret = read_blk(info, blk, buf);
@@ -479,6 +480,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		goto out_buf;
 	}
 	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
+		quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+			    newblk, info->dqi_blocks);
+		ret = -EUCLEAN;
+		goto out_buf;
+	}
+
 	if (depth == info->dqi_qtree_depth - 1) {
 		ret = free_dqentry(info, dquot, newblk);
 		newblk = 0;
@@ -578,6 +586,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
 	if (!blk)	/* No reference? */
 		goto out_buf;
+	if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
+		quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+			    blk, info->dqi_blocks);
+		ret = -EUCLEAN;
+		goto out_buf;
+	}
+
 	if (depth < info->dqi_qtree_depth - 1)
 		ret = find_tree_dqentry(info, dquot, blk, depth+1);
 	else
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 076f9ab94306..82e09901462e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1435,7 +1435,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	unsigned long safe_mask = 0;
 	unsigned int commit_max_age = (unsigned int)-1;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	char *new_opts;
 	int err;
 	char *qf_names[REISERFS_MAXQUOTAS];
 	unsigned int qfmt = 0;
@@ -1443,10 +1442,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	int i;
 #endif
 
-	new_opts = kstrdup(arg, GFP_KERNEL);
-	if (arg && !new_opts)
-		return -ENOMEM;
-
 	sync_filesystem(s);
 	reiserfs_write_lock(s);
 
@@ -1597,7 +1592,6 @@ out_ok_unlocked:
 out_err_unlock:
 	reiserfs_write_unlock(s);
 out_err:
-	kfree(new_opts);
 	return err;
 }
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
new file mode 100644
index 000000000000..7ccadcbe684b
--- /dev/null
+++ b/fs/smbfs_common/smb2pdu.h
@@ -0,0 +1,989 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+#ifndef _COMMON_SMB2PDU_H
+#define _COMMON_SMB2PDU_H
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below. Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specification.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (i.e. no useful data other than the SMB error code itself) and are marked
+ * as such. Knowing this helps avoid response buffer allocations and copy in
+ * some cases.
+ */ + +/* List of commands in host endian */ +#define SMB2_NEGOTIATE_HE 0x0000 +#define SMB2_SESSION_SETUP_HE 0x0001 +#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ +#define SMB2_TREE_CONNECT_HE 0x0003 +#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ +#define SMB2_CREATE_HE 0x0005 +#define SMB2_CLOSE_HE 0x0006 +#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ +#define SMB2_READ_HE 0x0008 +#define SMB2_WRITE_HE 0x0009 +#define SMB2_LOCK_HE 0x000A +#define SMB2_IOCTL_HE 0x000B +#define SMB2_CANCEL_HE 0x000C +#define SMB2_ECHO_HE 0x000D +#define SMB2_QUERY_DIRECTORY_HE 0x000E +#define SMB2_CHANGE_NOTIFY_HE 0x000F +#define SMB2_QUERY_INFO_HE 0x0010 +#define SMB2_SET_INFO_HE 0x0011 +#define SMB2_OPLOCK_BREAK_HE 0x0012 + +/* The same list in little endian */ +#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) +#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) +#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) +#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) +#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) +#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) +#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) +#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) +#define SMB2_READ cpu_to_le16(SMB2_READ_HE) +#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) +#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) +#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) +#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) +#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) +#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) +#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) +#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) +#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) +#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) + +#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) + +#define NUMBER_OF_SMB2_COMMANDS 0x0013 + +/* + * SMB2 Header Definition + * + * "MBZ" : Must be Zero + * "BB" : BugBug, Something to check/review/analyze later + * "PDU" : "Protocol Data Unit" (ie a network "frame") + * + */ + +#define __SMB2_HEADER_STRUCTURE_SIZE 64 +#define SMB2_HEADER_STRUCTURE_SIZE \ + cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE) + +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) +#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) + +/* + * SMB2 flag definitions + */ +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) +#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ + +/* See MS-SMB2 section 2.2.1 */ +struct smb2_hdr { + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le32 Status; /* Error from server */ + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + union { + struct { + __le32 ProcessId; + __le32 TreeId; + } __packed SyncId; + __le64 AsyncId; + } __packed Id; + __le64 SessionId; + __u8 Signature[16]; +} __packed; + +struct smb2_pdu { + struct smb2_hdr hdr; + __le16 StructureSize2; /* size of wct area (varies, request specific) */ +} __packed; + +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 + +/* 
Transform flags (for 3.0 dialect this flag indicates CCM) */
+#define TRANSFORM_FLAG_ENCRYPTED	0x0001
+struct smb2_transform_hdr {
+	__le32 ProtocolId;	/* 0xFD 'S' 'M' 'B' */
+	__u8   Signature[16];
+	__u8   Nonce[16];
+	__le32 OriginalMessageSize;
+	__u16  Reserved1;
+	__le16 Flags;		/* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
+	__le64 SessionId;
+} __packed;
+
+
+/* See MS-SMB2 2.2.42 */
+struct smb2_compression_transform_hdr_unchained {
+	__le32 ProtocolId;	/* 0xFC 'S' 'M' 'B' */
+	__le32 OriginalCompressedSegmentSize;
+	__le16 CompressionAlgorithm;
+	__le16 Flags;
+	__le16 Length;	/* if chained it is length, else offset */
+} __packed;
+
+/* See MS-SMB2 2.2.42.1 */
+#define SMB2_COMPRESSION_FLAG_NONE	0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED	0x0001
+
+struct compression_payload_header {
+	__le16	CompressionAlgorithm;
+	__le16	Flags;
+	__le32	Length;	/* length of compressed payload including field below if present */
+	/* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct smb2_compression_transform_hdr_chained {
+	__le32 ProtocolId;	/* 0xFC 'S' 'M' 'B' */
+	__le32 OriginalCompressedSegmentSize;
+	/* struct compression_payload_header[] */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2.2 */
+struct compression_pattern_payload_v1 {
+	__le16	Pattern;
+	__le16	Reserved1;
+	__le16	Reserved2;
+	__le32	Repetitions;
+} __packed;
+
+/* See MS-SMB2 section 2.2.9.2 */
+/* Context Types */
+#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID		0x0000
+#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID	cpu_to_le16(0x0001)
+
+struct tree_connect_contexts {
+	__le16 ContextType;
+	__le16 DataLength;
+	__le32 Reserved;
+	__u8   Data[];
+} __packed;
+
+/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
+struct smb3_blob_data {
+	__le16 BlobSize;
+	__u8   BlobData[];
+} __packed;
+
+/* Valid values for Attr */
+#define SE_GROUP_MANDATORY		0x00000001
+#define SE_GROUP_ENABLED_BY_DEFAULT	0x00000002
+#define SE_GROUP_ENABLED		0x00000004
+#define SE_GROUP_OWNER			0x00000008
+#define SE_GROUP_USE_FOR_DENY_ONLY	0x00000010
+#define SE_GROUP_INTEGRITY		0x00000020
+#define SE_GROUP_INTEGRITY_ENABLED	0x00000040
+#define SE_GROUP_RESOURCE		0x20000000
+#define SE_GROUP_LOGON_ID		0xC0000000
+
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+
+struct sid_array_data {
+	__le16 SidAttrCount;
+	/* SidAttrList - array of sid_attr_data structs */
+} __packed;
+
+struct luid_attr_data {
+
+} __packed;
+
+/*
+ * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
+ * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR_DATA
+ */
+
+struct privilege_array_data {
+	__le16 PrivilegeCount;
+	/* array of privilege_data structs */
+} __packed;
+
+struct remoted_identity_tcon_context {
+	__le16 TicketType;		/* must be 0x0001 */
+	__le16 TicketSize;		/* total size of this struct */
+	__le16 User;			/* offset to SID_ATTR_DATA struct with user info */
+	__le16 UserName;		/* offset to null terminated Unicode username string */
+	__le16 Domain;			/* offset to null terminated Unicode domain name */
+	__le16 Groups;			/* offset to SID_ARRAY_DATA struct with group info */
+	__le16 RestrictedGroups;	/* similar to above */
+	__le16 Privileges;		/* offset to PRIVILEGE_ARRAY_DATA struct */
+	__le16 PrimaryGroup;		/* offset to SID_ARRAY_DATA struct */
+	__le16 Owner;			/* offset to BLOB_DATA struct */
+	__le16 DefaultDacl;		/* offset to BLOB_DATA struct */
+	__le16 DeviceGroups;		/* offset to
SID_ARRAY_DATA struct */ + __le16 UserClaims; /* offset to BLOB_DATA struct */ + __le16 DeviceClaims; /* offset to BLOB_DATA struct */ + __u8 TicketInfo[]; /* variable length buf - remoted identity data */ +} __packed; + +struct smb2_tree_connect_req_extension { + __le32 TreeConnectContextOffset; + __le16 TreeConnectContextCount; + __u8 Reserved[10]; + __u8 PathName[]; /* variable sized array */ + /* followed by array of TreeConnectContexts */ +} __packed; + +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001) +#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002) +#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004) + +struct smb2_tree_connect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 Flags; /* Flags in SMB3.1.1 */ + __le16 PathOffset; + __le16 PathLength; + __u8 Buffer[1]; /* variable length */ +} __packed; + +/* Possible ShareType values */ +#define SMB2_SHARE_TYPE_DISK 0x01 +#define SMB2_SHARE_TYPE_PIPE 0x02 +#define SMB2_SHARE_TYPE_PRINT 0x03 + +/* + * Possible ShareFlags - exactly one and only one of the first 4 caching flags + * must be set (any of the remaining, SHI1005, flags may be set individually + * or in combination. + */ +#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 +#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 +#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 +#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 +#define SHI1005_FLAGS_DFS 0x00000001 +#define SHI1005_FLAGS_DFS_ROOT 0x00000002 +#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 +#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 +#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 +#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ +#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ +#define SHI1005_FLAGS_ALL 0x0014FF33 + +/* Possible share capabilities */ +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ +#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ +#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ +#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ +#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ +#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */ + +struct smb2_tree_connect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 16 */ + __u8 ShareType; /* see below */ + __u8 Reserved; + __le32 ShareFlags; /* see below */ + __le32 Capabilities; /* see below */ + __le32 MaximalAccess; +} __packed; + +struct smb2_tree_disconnect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_disconnect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + + +/* + * SMB2_NEGOTIATE_PROTOCOL See MS-SMB2 section 2.2.3 + */ +/* SecurityMode flags */ +#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 +#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001) +#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 +#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002) +#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 + +/* Capabilities flags */ +#define SMB2_GLOBAL_CAP_DFS 
0x00000001 +#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ +/* Internal types */ +#define SMB2_NT_FIND 0x00100000 +#define SMB2_LARGE_FILES 0x00200000 + +#define SMB2_CLIENT_GUID_SIZE 16 +#define SMB2_CREATE_GUID_SIZE 16 + +/* Dialects */ +#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ +#define SMB20_PROT_ID 0x0202 +#define SMB21_PROT_ID 0x0210 +#define SMB2X_PROT_ID 0x02FF +#define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 +#define BAD_PROT_ID 0xFFFF + +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) +#define SMB2_PREAUTH_HASH_SIZE 64 + +/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */ +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) +#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) +#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) +#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) +#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) + +struct smb2_neg_context { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */ +} __packed; + +/* + * SaltLength that the server send can be zero, so the only three required + * fields (all __le16) end up six bytes total, so the minimum context data len + * in the response is six bytes which accounts for + * + * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm. 
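That six-byte minimum can be pinned down with a compile-time assertion (an editor's sketch, not part of the header):

	/* HashAlgorithmCount + SaltLength + HashAlgorithms[0], all __le16 */
	static_assert(MIN_PREAUTH_CTXT_DATA_LEN == 3 * sizeof(__le16));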
+ */
+#define MIN_PREAUTH_CTXT_DATA_LEN	6
+
+struct smb2_preauth_neg_context {
+	__le16	ContextType; /* 1 */
+	__le16	DataLength;
+	__le32	Reserved;
+	__le16	HashAlgorithmCount; /* 1 */
+	__le16	SaltLength;
+	__le16	HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+	__u8	Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM	cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM	cpu_to_le16(0x0002)
+#define SMB2_ENCRYPTION_AES256_CCM	cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM	cpu_to_le16(0x0004)
+
+/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
+#define MIN_ENCRYPT_CTXT_DATA_LEN	4
+struct smb2_encryption_neg_context {
+	__le16	ContextType; /* 2 */
+	__le16	DataLength;
+	__le32	Reserved;
+	/* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
+	__le16	CipherCount; /* AES128-GCM and AES128-CCM by default */
+	__le16	Ciphers[];
+} __packed;
+
+/* See MS-SMB2 2.2.3.1.3 */
+#define SMB3_COMPRESS_NONE	cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1	cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77	cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF	cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN	cpu_to_le16(0x0004) /* Pattern_V1 */
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE		cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED	cpu_to_le32(0x00000001)
+
+struct smb2_compression_capabilities_context {
+	__le16	ContextType; /* 3 */
+	__le16	DataLength;
+	__le32	Reserved;
+	__le16	CompressionAlgorithmCount;
+	__le16	Padding;
+	__le32	Flags;
+	__le16	CompressionAlgorithms[3];
+	__u16	Pad;  /* Some servers require pad to DataLen multiple of 8 */
+	/* Check if pad needed */
+} __packed;
+
+/*
+ * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
+ * Its struct simply contains NetName, an array of Unicode characters + */ +struct smb2_netname_neg_context { + __le16 ContextType; /* 5 */ + __le16 DataLength; + __le32 Reserved; + __le16 NetName[]; /* hostname of target converted to UCS-2 */ +} __packed; + +/* + * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 + * and 2.2.4.1.5 + */ + +/* Flags */ +#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 + +struct smb2_transport_capabilities_context { + __le16 ContextType; /* 6 */ + __le16 DataLength; + __u32 Reserved; + __le32 Flags; + __u32 Pad; +} __packed; + +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.6 + */ + +/* RDMA Transform IDs */ +#define SMB2_RDMA_TRANSFORM_NONE 0x0000 +#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 + +struct smb2_rdma_transform_capabilities_context { + __le16 ContextType; /* 7 */ + __le16 DataLength; + __u32 Reserved; + __le16 TransformCount; + __u16 Reserved1; + __u32 Reserved2; + __le16 RDMATransformIds[]; +} __packed; + +/* + * For signing capabilities context see MS-SMB2 2.2.3.1.7 + * and 2.2.4.1.7 + */ + +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 0 +#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0) +#define SIGNING_ALG_AES_CMAC 1 +#define SIGNING_ALG_AES_CMAC_LE cpu_to_le16(1) +#define SIGNING_ALG_AES_GMAC 2 +#define SIGNING_ALG_AES_GMAC_LE cpu_to_le16(2) + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __le32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; + /* Followed by padding to 8 byte boundary (required by some servers) */ +} __packed; + +#define POSIX_CTXT_DATA_LEN 16 +struct smb2_posix_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ +} __packed; + +struct smb2_negotiate_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 DialectCount; + __le16 SecurityMode; + __le16 Reserved; /* MBZ */ + __le32 Capabilities; + __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. 
MBZ earlier */ + __le16 Reserved2; + __le16 Dialects[]; +} __packed; + +struct smb2_negotiate_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 65 */ + __le16 SecurityMode; + __le16 DialectRevision; + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ + __u8 ServerGUID[16]; + __le32 Capabilities; + __le32 MaxTransactSize; + __le32 MaxReadSize; + __le32 MaxWriteSize; + __le64 SystemTime; /* MBZ */ + __le64 ServerStartTime; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + + +/* + * SMB2_SESSION_SETUP See MS-SMB2 section 2.2.5 + */ +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + +struct smb2_sess_setup_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 25 */ + __u8 Flags; + __u8 SecurityMode; + __le32 Capabilities; + __le32 Channel; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le64 PreviousSessionId; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +/* Currently defined SessionFlags */ +#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 +#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001) +#define SMB2_SESSION_FLAG_IS_NULL 0x0002 +#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002) +#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 +#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004) + +struct smb2_sess_setup_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 SessionFlags; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + + +/* + * SMB2_LOGOFF See MS-SMB2 section 2.2.7 + */ +struct smb2_logoff_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_logoff_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + + +/* + * SMB2_CLOSE See MS-SMB2 section 2.2.15 + */ +/* Currently defined values for close flags */ +#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) +struct smb2_close_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Flags; + __le32 Reserved; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ +} __packed; + +/* + * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) + */ +#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 + +struct smb2_close_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* 60 */ + __le16 Flags; + __le32 Reserved; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; + __le32 Attributes; +} __packed; + + +/* + * SMB2_READ See MS-SMB2 section 2.2.19 + */ +/* For read request Flags field below, following flag is defined for SMB3.02 */ +#define SMB2_READFLAG_READ_UNBUFFERED 0x01 +#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */ + +/* Channel field for read and write: exactly one of following flags can be set*/ +#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) +#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) +#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) + +/* SMB2 read request without RFC1001 
length at the beginning */ +struct smb2_read_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __u8 Padding; /* offset from start of SMB2 header to place read */ + __u8 Flags; /* MBZ unless SMB3.02 or later */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 MinimumCount; + __le32 Channel; /* MBZ except for SMB3 or later */ + __le32 RemainingBytes; + __le16 ReadChannelInfoOffset; + __le16 ReadChannelInfoLength; + __u8 Buffer[1]; +} __packed; + +/* Read flags */ +#define SMB2_READFLAG_RESPONSE_NONE cpu_to_le32(0x00000000) +#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM cpu_to_le32(0x00000001) + +struct smb2_read_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __le32 Flags; + __u8 Buffer[1]; +} __packed; + + +/* + * SMB2_WRITE See MS-SMB2 section 2.2.21 + */ +/* For write request Flags field below the following flags are defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ +#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ + +struct smb2_write_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 DataOffset; /* offset from start of SMB2 header to write data */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ + __le32 Channel; /* MBZ unless SMB3.02 or later */ + __le32 RemainingBytes; + __le16 WriteChannelInfoOffset; + __le16 WriteChannelInfoLength; + __le32 Flags; + __u8 Buffer[1]; +} __packed; + +struct smb2_write_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + + +/* + * SMB2_FLUSH See MS-SMB2 section 2.2.17 + */ +struct smb2_flush_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Reserved1; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; +} __packed; + +struct smb2_flush_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Reserved; +} __packed; + + +/* + * SMB2_NOTIFY See MS-SMB2 section 2.2.35 + */ +/* notify flags */ +#define SMB2_WATCH_TREE 0x0001 + +/* notify completion filter flags. 
See MS-FSCC 2.6 and MS-SMB2 2.2.35 */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +/* SMB2 Notify Action Flags */ +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 +#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009 + +struct smb2_change_notify_req { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Flags; + __le32 OutputBufferLength; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ + __le32 CompletionFilter; + __u32 Reserved; +} __packed; + +struct smb2_change_notify_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; /* array of file notify structs */ +} __packed; + + +/* + * SMB2_CREATE See MS-SMB2 section 2.2.13 + */ +/* Oplock levels */ +#define SMB2_OPLOCK_LEVEL_NONE 0x00 +#define SMB2_OPLOCK_LEVEL_II 0x01 +#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 +#define SMB2_OPLOCK_LEVEL_BATCH 0x09 +#define SMB2_OPLOCK_LEVEL_LEASE 0xFF +/* Non-spec internal type */ +#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 + +/* Impersonation Levels. 
See MS-WPO section 9.7 and MSDN-IMPERS */
+#define IL_ANONYMOUS		cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION	cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION	cpu_to_le32(0x00000002)
+#define IL_DELEGATE		cpu_to_le32(0x00000003)
+
+/* File Attributes */
+#define FILE_ATTRIBUTE_READONLY			0x00000001
+#define FILE_ATTRIBUTE_HIDDEN			0x00000002
+#define FILE_ATTRIBUTE_SYSTEM			0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY		0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE			0x00000020
+#define FILE_ATTRIBUTE_NORMAL			0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY		0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE		0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT		0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED		0x00000800
+#define FILE_ATTRIBUTE_OFFLINE			0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED	0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED		0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM		0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA		0x00020000
+#define FILE_ATTRIBUTE__MASK			0x00007FB7
+
+#define FILE_ATTRIBUTE_READONLY_LE		cpu_to_le32(0x00000001)
+#define FILE_ATTRIBUTE_HIDDEN_LE		cpu_to_le32(0x00000002)
+#define FILE_ATTRIBUTE_SYSTEM_LE		cpu_to_le32(0x00000004)
+#define FILE_ATTRIBUTE_DIRECTORY_LE		cpu_to_le32(0x00000010)
+#define FILE_ATTRIBUTE_ARCHIVE_LE		cpu_to_le32(0x00000020)
+#define FILE_ATTRIBUTE_NORMAL_LE		cpu_to_le32(0x00000080)
+#define FILE_ATTRIBUTE_TEMPORARY_LE		cpu_to_le32(0x00000100)
+#define FILE_ATTRIBUTE_SPARSE_FILE_LE		cpu_to_le32(0x00000200)
+#define FILE_ATTRIBUTE_REPARSE_POINT_LE		cpu_to_le32(0x00000400)
+#define FILE_ATTRIBUTE_COMPRESSED_LE		cpu_to_le32(0x00000800)
+#define FILE_ATTRIBUTE_OFFLINE_LE		cpu_to_le32(0x00001000)
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE	cpu_to_le32(0x00002000)
+#define FILE_ATTRIBUTE_ENCRYPTED_LE		cpu_to_le32(0x00004000)
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE	cpu_to_le32(0x00008000)
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE		cpu_to_le32(0x00020000)
+#define FILE_ATTRIBUTE_MASK_LE			cpu_to_le32(0x00007FB7)
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE		cpu_to_le32(0x00000001)
+#define FILE_LIST_DIRECTORY_LE		cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE		cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE		cpu_to_le32(0x00000004)
+#define FILE_ADD_SUBDIRECTORY_LE	cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE			cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE		cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE			cpu_to_le32(0x00000020)
+#define FILE_DELETE_CHILD_LE		cpu_to_le32(0x00000040)
+#define FILE_READ_ATTRIBUTES_LE		cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE	cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE			cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE		cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE		cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE		cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE		cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE	cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE		cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE		cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE		cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE		cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE		cpu_to_le32(0x80000000)
+#define DESIRED_ACCESS_MASK		cpu_to_le32(0xF21F01FF)
+
+
+#define FILE_READ_DESIRED_ACCESS_LE	(FILE_READ_DATA_LE |	\
+					 FILE_READ_EA_LE |	\
+					 FILE_GENERIC_READ_LE)
+#define FILE_WRITE_DESIRE_ACCESS_LE	(FILE_WRITE_DATA_LE |		\
+					 FILE_APPEND_DATA_LE |		\
+					 FILE_WRITE_EA_LE |		\
+					 FILE_WRITE_ATTRIBUTES_LE |	\
+					 FILE_GENERIC_WRITE_LE)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) +#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) +#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) +#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) + +/* CreateDisposition Flags */ +#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) +#define FILE_OPEN_LE cpu_to_le32(0x00000001) +#define FILE_CREATE_LE cpu_to_le32(0x00000002) +#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) +#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) +#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) +#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007) + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \ + | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +/* CreateOptions Flags */ +#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) +/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ +#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) +#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) +#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008) +#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) +#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) +#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) +#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) +#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) +#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) +#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) +#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF) + +#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ + | FILE_READ_ATTRIBUTES_LE) +#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ + | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) +#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) + +/* Create Context Values */ +#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ +#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" +#define SMB2_CREATE_ALLOCATION_SIZE "AISi" +#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" +#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" +#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" +#define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" + +/* Flag (SMB3 open response) values */ +#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 + +struct create_context { + __le32 Next; + __le16 NameOffset; + __le16 NameLength; + __le16 Reserved; + __le16 DataOffset; + __le32 DataLength; + __u8 Buffer[]; +} __packed; + +struct smb2_create_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u8 SecurityFlags; + __u8 RequestedOplockLevel; + __le32 ImpersonationLevel; + __le64 SmbCreateFlags; + __le64 Reserved; + __le32 DesiredAccess; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le16 NameOffset; + __le16 NameLength; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[]; +} __packed; + +struct smb2_create_rsp { + struct smb2_hdr 
hdr; + __le16 StructureSize; /* Must be 89 */ + __u8 OplockLevel; + __u8 Flags; /* 0x01 if reparse point */ + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndofFile; + __le32 FileAttributes; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[1]; +} __packed; + + +#endif /* _COMMON_SMB2PDU_H */ diff --git a/fs/super.c b/fs/super.c index bcef3a6f4c4b..3bfc0f8fbd5b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb_lock); up_write(&sb->s_umount); if (sb->s_bdi != &noop_backing_dev_info) { + if (sb->s_iflags & SB_I_PERSB_BDI) + bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } @@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; + sb->s_iflags |= SB_I_PERSB_BDI; return 0; } diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d16302d3eb59..596ab2092289 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -80,20 +80,6 @@ static inline int arch_is_kernel_data(unsigned long addr) } #endif -/* - * Check if an address is part of freed initmem. This is needed on architectures - * with virt == phys kernel mapping, for code that wants to check if an address - * is part of a static object within [_stext, _end]. After initmem is freed, - * memory can be allocated from it, and such allocations would then have - * addresses within the range [_stext, _end]. - */ -#ifndef arch_is_kernel_initmem_freed -static inline int arch_is_kernel_initmem_freed(unsigned long addr) -{ - return 0; -} -#endif - /** * memory_contains - checks if an object is contained within a memory region * @begin: virtual address of the beginning of the memory region diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fbc2146050a4..375715b0535f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -577,7 +577,6 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 -#define OSC_PCI_SUPPORT_MASKS 0x0000019f /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 @@ -587,7 +586,6 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 -#define OSC_PCI_CONTROL_MASKS 0x000000bf #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 33207004cfde..993c5628a726 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -103,6 +103,9 @@ struct wb_completion { * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. + * + * Each bdi_writeback that is not embedded into the backing_dev_info must hold + * a reference to the parent backing_dev_info. See cgwb_create() for details. 
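The fs/super.c and SB_I_PERSB_BDI hunks above combine so that a filesystem which sets up its own writeback context no longer has to unregister it by hand. A minimal sketch of the caller side, assuming kernel context; "examplefs" and its fill_super are hypothetical, while super_setup_bdi_name() is the helper patched above:

static atomic_t examplefs_bdi_seq = ATOMIC_INIT(0);

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	int err;

	/*
	 * Allocates and registers a private bdi and sets SB_I_PERSB_BDI in
	 * sb->s_iflags; with the hunk above, generic_shutdown_super() now
	 * calls bdi_unregister() on it automatically at unmount.
	 */
	err = super_setup_bdi_name(sb, "examplefs-%d",
				   atomic_inc_return(&examplefs_bdi_seq));
	if (err)
		return err;

	/* ... the rest of superblock setup would continue here ... */
	return 0;
}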
*/ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9c14f0a8dbe5..483979c1b9f4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -141,7 +141,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(int sync, long timeout); static inline bool mapping_can_writeback(struct address_space *mapping) { diff --git a/include/linux/cma.h b/include/linux/cma.h index 53fd8c3cdbd0..bd801023504b 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -46,6 +46,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); +extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7bbd8df02532..ccbbd31b3aae 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -150,3 +150,11 @@ #else #define __diag_GCC_8(s) #endif + +/* + * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" + * attribute) do not work, and must be disabled. + */ +#if GCC_VERSION < 90100 +#undef __alloc_size__ +#endif diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index e6ec63403965..3de06a8fae73 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -33,6 +33,15 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __aligned_largest __attribute__((__aligned__)) +/* + * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally + * available and includes other attributes. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size + */ +#define __alloc_size__(x, ...) __attribute__((__alloc_size__(x, ## __VA_ARGS__))) + /* * Note: users of __always_inline currently do not write "inline" themselves, * which seems to be required by gcc to apply the attribute according @@ -153,6 +162,7 @@ /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#malloc */ #define __malloc __attribute__((__malloc__)) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 05ceb2e92b0e..3f198879c292 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -250,6 +250,18 @@ struct ftrace_likely_data { # define __cficanonical #endif +/* + * Any place that could be marked with the "alloc_size" attribute is also + * a place to be marked with the "malloc" attribute. Do this as part of the + * __alloc_size macro to avoid redundant attributes and to avoid missing a + * __malloc marking. + */ +#ifdef __alloc_size__ +# define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +#else +# define __alloc_size(x, ...) __malloc +#endif + #ifndef asm_volatile_goto #define asm_volatile_goto(x...) 
asm goto(x) #endif diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d2b9c41c8edf..d58e0476ee8e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -34,6 +34,8 @@ */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; +extern struct static_key_false cpusets_insane_config_key; + static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); @@ -51,6 +53,19 @@ static inline void cpuset_dec(void) static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } +/* + * This will get enabled whenever a cpuset configuration is considered + * unsupportable in general. E.g. movable only node which cannot satisfy + * any non movable allocations (see update_nodemask). Page allocator + * needs to make additional checks for those configurations and this + * check is meant to guard those checks without any overhead for sane + * configurations. + */ +static inline bool cpusets_insane_config(void) +{ + return static_branch_unlikely(&cpusets_insane_config_key); +} + extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); @@ -167,6 +182,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline bool cpusets_enabled(void) { return false; } +static inline bool cpusets_insane_config(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/include/linux/damon.h b/include/linux/damon.h index d68b67b8d458..b4d4be3cc987 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -14,6 +14,8 @@ /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE +/* Max priority score for DAMON-based operation schemes */ +#define DAMOS_MAX_SCORE (99) /** * struct damon_addr_range - Represents an address region of [@start, @end). @@ -31,12 +33,22 @@ struct damon_addr_range { * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. * @list: List head for siblings. + * @age: Age of this region. + * + * @age is initially zero, increased for each aggregation interval, and reset + * to zero again if the access frequency is significantly changed. If two + * regions are merged into a new region, both @nr_accesses and @age of the new + * region are set as region size-weighted average of those of the two regions. */ struct damon_region { struct damon_addr_range ar; unsigned long sampling_addr; unsigned int nr_accesses; struct list_head list; + + unsigned int age; +/* private: Internal value for age calculation. */ + unsigned int last_nr_accesses; }; /** @@ -59,16 +71,180 @@ struct damon_target { struct list_head list; }; +/** + * enum damos_action - Represents an action of a Data Access Monitoring-based + * Operation Scheme. + * + * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. + * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. + * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. + * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_STAT: Do nothing but count the stat. 
+ */ +enum damos_action { + DAMOS_WILLNEED, + DAMOS_COLD, + DAMOS_PAGEOUT, + DAMOS_HUGEPAGE, + DAMOS_NOHUGEPAGE, + DAMOS_STAT, /* Do nothing but only record the stat */ +}; + +/** + * struct damos_quota - Controls the aggressiveness of the given scheme. + * @ms: Maximum milliseconds that the scheme can use. + * @sz: Maximum bytes of memory that the action can be applied to. + * @reset_interval: Charge reset interval in milliseconds. + * + * @weight_sz: Weight of the region's size for prioritization. + * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. + * @weight_age: Weight of the region's age for prioritization. + * + * To avoid consuming too much CPU time or IO resources for applying the + * &struct damos->action to large memory, DAMON allows users to set time and/or + * size quotas. The quotas can be set by writing non-zero values to &ms and + * &sz, respectively. If the time quota is set, DAMON tries to use only up to + * &ms milliseconds within &reset_interval for applying the action. If the + * size quota is set, DAMON tries to apply the action only up to &sz bytes + * within &reset_interval. + * + * Internally, the time quota is transformed to a size quota using the + * estimated throughput of the scheme's action. DAMON then compares it against + * &sz and uses the smaller one as the effective quota. + * + * For selecting regions within the quota, DAMON prioritizes the current + * scheme's target memory regions using the &struct + * damon_primitive->get_scheme_score. You can customize the prioritization + * logic by setting &weight_sz, &weight_nr_accesses, and &weight_age, because + * monitoring primitives are encouraged to respect those. + */ +struct damos_quota { + unsigned long ms; + unsigned long sz; + unsigned long reset_interval; + + unsigned int weight_sz; + unsigned int weight_nr_accesses; + unsigned int weight_age; + +/* private: */ + /* For throughput estimation */ + unsigned long total_charged_sz; + unsigned long total_charged_ns; + + unsigned long esz; /* Effective size quota in bytes */ + + /* For charging the quota */ + unsigned long charged_sz; + unsigned long charged_from; + struct damon_target *charge_target_from; + unsigned long charge_addr_from; + + /* For prioritization */ + unsigned long histogram[DAMOS_MAX_SCORE + 1]; + unsigned int min_score; +}; + +/** + * enum damos_wmark_metric - Represents the watermark metric. + * + * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. + * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + */ +enum damos_wmark_metric { + DAMOS_WMARK_NONE, + DAMOS_WMARK_FREE_MEM_RATE, +}; + +/** + * struct damos_watermarks - Controls when a given scheme should be activated. + * @metric: Metric for the watermarks. + * @interval: Watermarks check time interval in microseconds. + * @high: High watermark. + * @mid: Middle watermark. + * @low: Low watermark. + * + * If &metric is &DAMOS_WMARK_NONE, the scheme is always active. Being active + * means DAMON does the monitoring and applies the action of the scheme to + * appropriate memory regions. Else, DAMON checks &metric of the system for at + * least every &interval microseconds and works as below. + * + * If &metric is higher than &high, the scheme is inactivated. If &metric is + * between &mid and &low, the scheme is activated. If &metric is lower than + * &low, the scheme is inactivated.
+ */ +struct damos_watermarks { + enum damos_wmark_metric metric; + unsigned long interval; + unsigned long high; + unsigned long mid; + unsigned long low; + +/* private: */ + bool activated; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @min_sz_region: Minimum size of target regions. + * @max_sz_region: Maximum size of target regions. + * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. + * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. + * @min_age_region: Minimum age of target regions. + * @max_age_region: Maximum age of target regions. + * @action: &damos_action to be applied to the target regions. + * @quota: Control the aggressiveness of this scheme. + * @wmarks: Watermarks for automated (in)activation of this scheme. + * @stat_count: Total number of regions that this scheme is applied to. + * @stat_sz: Total size of regions that this scheme is applied to. + * @list: List head for siblings. + * + * For each aggregation interval, DAMON finds regions which fit in the + * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, + * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to + * those. To avoid consuming too much CPU time or IO resources for the + * &action, &quota is used. + * + * To do the work only when needed, schemes can be activated for specific + * system situations using &wmarks. If all schemes that registered to the + * monitoring context are inactive, DAMON stops monitoring as well, and just + * repeatedly checks the watermarks. + * + * After applying the &action to each region, &stat_count and &stat_sz are + * updated to reflect the number of regions and total size of regions that the + * &action is applied to. + */ +struct damos { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; + enum damos_action action; + struct damos_quota quota; + struct damos_watermarks wmarks; + unsigned long stat_count; + unsigned long stat_sz; + struct list_head list; +}; +
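Putting the scheme pieces together, a minimal sketch of building and registering a scheme through the constructors declared later in this header (damon_new_scheme() and damon_add_scheme()); example_add_pageout_scheme() and every numeric value below are illustrative assumptions, not defaults:

static int example_add_pageout_scheme(struct damon_ctx *ctx)
{
	struct damos_quota quota = {
		.ms = 10,			/* at most 10 ms per window */
		.sz = 128 * 1024 * 1024,	/* or at most 128 MiB */
		.reset_interval = 1000,		/* charge window: 1 second */
		.weight_sz = 0,
		.weight_nr_accesses = 0,
		.weight_age = 1,		/* prefer older (colder) regions */
	};
	struct damos_watermarks wmarks = {
		.metric = DAMOS_WMARK_FREE_MEM_RATE,
		.interval = 5000000,		/* check every 5 seconds */
		.high = 500, .mid = 400, .low = 200,	/* permil free memory */
	};
	struct damos *scheme;

	/* Page out regions of any size that saw no access for at least
	 * 30 aggregation intervals. */
	scheme = damon_new_scheme(PAGE_SIZE, ULONG_MAX, 0, 0, 30, UINT_MAX,
				  DAMOS_PAGEOUT, &quota, &wmarks);
	if (!scheme)
		return -ENOMEM;
	damon_add_scheme(ctx, scheme);
	return 0;
}

struct damon_ctx; /** - * struct damon_primitive Monitoring primitives for given use cases. + * struct damon_primitive - Monitoring primitives for given use cases. * * @init: Initialize primitive-internal data structures. * @update: Update primitive-internal data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @get_scheme_score: Get the score of a region for a scheme. + * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. * @@ -94,6 +270,11 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @get_scheme_score should return the priority score of a region for a scheme + * as an integer in [0, &DAMOS_MAX_SCORE]. + * @apply_scheme is called from @kdamond when a region for user provided + * DAMON-based operation scheme is found. It should apply the scheme's action + * to the region. This is not used for &DAMON_ARBITRARY_TARGET case.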
* @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -104,12 +285,17 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*get_scheme_score)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); + int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; -/* - * struct damon_callback Monitoring events notification callbacks. +/** + * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. * @after_sampling: Called after each sampling. @@ -136,7 +322,7 @@ struct damon_callback { int (*before_start)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_terminate)(struct damon_ctx *context); + void (*before_terminate)(struct damon_ctx *context); }; /** @@ -182,6 +368,7 @@ struct damon_callback { * @min_nr_regions: The minimum number of adaptive monitoring regions. * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. + * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { unsigned long sample_interval; @@ -194,7 +381,6 @@ struct damon_ctx { /* public: */ struct task_struct *kdamond; - bool kdamond_stop; struct mutex kdamond_lock; struct damon_primitive primitive; @@ -203,6 +389,7 @@ struct damon_ctx { unsigned long min_nr_regions; unsigned long max_nr_regions; struct list_head adaptive_targets; + struct list_head schemes; }; #define damon_next_region(r) \ @@ -211,6 +398,9 @@ struct damon_ctx { #define damon_prev_region(r) \ (container_of(r->list.prev, struct damon_region, list)) +#define damon_last_region(t) \ + (list_last_entry(&t->regions_list, struct damon_region, list)) + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) @@ -223,6 +413,12 @@ struct damon_ctx { #define damon_for_each_target_safe(t, next, ctx) \ list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) +#define damon_for_each_scheme(s, ctx) \ + list_for_each_entry(s, &(ctx)->schemes, list) + +#define damon_for_each_scheme_safe(s, next, ctx) \ + list_for_each_entry_safe(s, next, &(ctx)->schemes, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -232,8 +428,18 @@ inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); +void damon_destroy_scheme(struct damos *s); + struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +bool damon_targets_empty(struct damon_ctx 
*ctx); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); unsigned int damon_nr_regions(struct damon_target *t); @@ -245,6 +451,8 @@ int damon_set_targets(struct damon_ctx *ctx, int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_schemes(struct damon_ctx *ctx, + struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); @@ -261,8 +469,26 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); +int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ +#ifdef CONFIG_DAMON_PADDR + +/* Monitoring primitives for the physical memory address space */ +void damon_pa_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); +bool damon_pa_target_valid(void *t); +int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); +int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); +void damon_pa_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_PADDR */ + #endif /* _DAMON_H */ diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index eec3b7c40811..616af2ea20f3 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -84,13 +84,20 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ */ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) +/* Events that can be reported with event->fd */ +#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) + /* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */ #define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) +/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ +#define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ - FANOTIFY_INODE_EVENTS) + FANOTIFY_INODE_EVENTS | \ + FANOTIFY_ERROR_EVENTS) /* Events that require a permission response from user */ #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ diff --git a/include/linux/fs.h b/include/linux/fs.h index f3cfca5edc9a..4137a9bfae7a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1440,6 +1440,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 12d3a7d308ab..787545e87eeb 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -26,20 +26,20 @@ * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and 
not the parent. */ -static inline void fsnotify_name(struct inode *dir, __u32 mask, - struct inode *child, - const struct qstr *name, u32 cookie) +static inline int fsnotify_name(__u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) { if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) - return; + return 0; - fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie); + return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { - fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0); + fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) @@ -86,7 +86,7 @@ notify_child: */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { - fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE); + fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_file(struct file *file, __u32 mask) @@ -154,8 +154,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_ISDIR; } - fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie); - fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie); + fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, + old_dir, old_name, fs_cookie); + fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, + new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); @@ -190,16 +192,22 @@ static inline void fsnotify_inoderemove(struct inode *inode) /* * fsnotify_create - 'name' was linked in + * + * Caller must make sure that dentry->d_name is stable. + * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate + * ->d_inode later */ -static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) +static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { - audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); + audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_dirent(inode, dentry, FS_CREATE); + fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory + * + * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ @@ -209,7 +217,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0); + fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, + dir, &new_dentry->d_name, 0); } /* @@ -227,12 +236,16 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) /* * fsnotify_mkdir - directory 'name' was created + * + * Caller must make sure that dentry->d_name is stable. + * Note: some filesystems (e.g. 
kernfs) leave @dentry negative and instantiate + * ->d_inode later */ -static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) +static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { - audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); + audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR); + fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* @@ -326,4 +339,17 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) fsnotify_dentry(dentry, mask); } +static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, + int error) +{ + struct fs_error_report report = { + .error = error, + .inode = inode, + .sb = sb, + }; + + return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, + NULL, NULL, NULL, 0); +} + #endif /* _LINUX_FS_NOTIFY_H */
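Since fsnotify_sb_error() just added above is the entire filesystem-facing surface of FAN_FS_ERROR, a call site is a single line. A hedged sketch; example_handle_corruption() is hypothetical, while EFSCORRUPTED is a real errno value:

static void example_handle_corruption(struct super_block *sb,
				      struct inode *inode)
{
	/*
	 * Queues an FS_ERROR event that fanotify delivers as FAN_FS_ERROR.
	 * A NULL inode reports a filesystem-wide error instead of an
	 * inode-specific one.
	 */
	fsnotify_sb_error(sb, inode, EFSCORRUPTED);
}

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 1ce66748a2d2..51ef2b079bfa 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -19,6 +19,7 @@ #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> +#include <linux/mempool.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -42,6 +43,12 @@ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ +#define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ + +/* + * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify + * which does not support FS_ERROR. + */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ @@ -95,7 +102,8 @@ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \ - FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED) + FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ + FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ @@ -136,6 +144,7 @@ struct mem_cgroup; * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. + * Either @inode or @dir must be non-NULL.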
* @file_name: optional file name associated with event * @cookie: inotify rename cookie * @@ -155,7 +164,7 @@ struct fsnotify_ops { const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); - void (*free_event)(struct fsnotify_event *event); + void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; @@ -238,6 +247,7 @@ struct fsnotify_group { int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; + mempool_t error_events_pool; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; @@ -248,6 +258,14 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, + FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_ERROR, +}; + +struct fs_error_report { + int error; + struct inode *inode; + struct super_block *sb; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) @@ -255,8 +273,25 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; + case FSNOTIFY_EVENT_DENTRY: + return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_ERROR: + return ((struct fs_error_report *)data)->inode; + default: + return NULL; + } +} + +static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_DENTRY: + /* Non const is needed for dget() */ + return (struct dentry *)data; + case FSNOTIFY_EVENT_PATH: + return ((const struct path *)data)->dentry; default: return NULL; } @@ -273,6 +308,35 @@ static inline const struct path *fsnotify_data_path(const void *data, } } +static inline struct super_block *fsnotify_data_sb(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_INODE: + return ((struct inode *)data)->i_sb; + case FSNOTIFY_EVENT_DENTRY: + return ((struct dentry *)data)->d_sb; + case FSNOTIFY_EVENT_PATH: + return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_ERROR: + return ((struct fs_error_report *) data)->sb; + default: + return NULL; + } +} + +static inline struct fs_error_report *fsnotify_data_error_report( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_ERROR: + return (struct fs_error_report *) data; + default: + return NULL; + } +} + enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_PARENT, @@ -482,16 +546,30 @@ extern int fsnotify_fasync(int fd, struct file *file, int on); extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern int fsnotify_add_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct fsnotify_group *, - struct fsnotify_event *), - void (*insert)(struct fsnotify_group *, - struct fsnotify_event *)); +extern int fsnotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)); + +static inline int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, 
+ struct fsnotify_event *)) +{ + return fsnotify_insert_event(group, event, merge, NULL); +} + /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { - fsnotify_add_event(group, group->overflow_event, NULL, NULL); + fsnotify_add_event(group, group->overflow_event, NULL); +} + +static inline bool fsnotify_is_overflow_event(u32 mask) +{ + return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3745efd21cf6..b976c4177299 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -531,6 +531,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct list_head *page_list, struct page **page_array); +unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + unsigned long nr_pages, + struct page **page_array); + /* Bulk allocate order-0 pages */ static inline unsigned long alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) @@ -618,9 +622,9 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -void *alloc_pages_exact(size_t size, gfp_t gfp_mask); +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 27cdd715c5f9..25aff0f2ed0b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -180,9 +180,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif @@ -214,9 +214,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, static inline void clear_highpage(struct page *page) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); clear_page(kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE @@ -239,7 +239,7 @@ static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); @@ -250,7 +250,7 @@ static inline void zero_user_segments(struct page *page, if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); - kunmap_atomic(kaddr); + kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } @@ -275,11 +275,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from, { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif @@ -290,11 +290,11 @@ static inline void copy_highpage(struct page *to, struct page *from) 
{ char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_page(vto, vfrom); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1faebe1cd0ed..44c2ab0dfa59 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, loff_t *); @@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, unsigned long new_addr, + unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, @@ -143,9 +148,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page); -void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct page *ref_page); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo(void); @@ -218,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } +static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) +{ +} + static inline unsigned long hugetlb_total_pages(void) { return 0; @@ -265,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst, return 0; } +static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, + unsigned long new_addr, + unsigned long len) +{ + BUG(); + return 0; +} + static inline void hugetlb_report_meminfo(struct seq_file *m) { } @@ -385,13 +401,6 @@ static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, BUG(); } -static inline void __unmap_hugepage_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) -{ - BUG(); -} - static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -596,6 +605,7 @@ struct hstate { int next_nid_to_alloc; int next_nid_to_free; unsigned int order; + unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; @@ -605,6 +615,7 @@ struct hstate { unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; + unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; @@ -637,8 +648,9 @@ void restore_reserve_on_error(struct 
hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); /* arch callback */ -int __init __alloc_bootmem_huge_page(struct hstate *h); -int __init alloc_bootmem_huge_page(struct hstate *h); +int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); +int __init alloc_bootmem_huge_page(struct hstate *h, int nid); +bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool __init arch_hugetlb_valid_size(unsigned long size); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index e9743cfd8585..66a774d2710e 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -132,13 +132,7 @@ io_mapping_init_wc(struct io_mapping *iomap, iomap->base = base; iomap->size = size; -#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ - iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); -#elif defined(pgprot_writecombine) iomap->prot = pgprot_writecombine(PAGE_KERNEL); -#else - iomap->prot = pgprot_noncached(PAGE_KERNEL); -#endif return iomap; } diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9ee238ad29ce..553da4899f55 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -64,6 +64,10 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/* Conversion function from of_phandle_args fields to fwspec */ +void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec); + /* * Should several domains have the same device node, but serve * different purposes (for example one domain is for PCI/MSI, and the diff --git a/include/linux/kasan.h b/include/linux/kasan.h index de5f5913374d..d8783b682669 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -375,12 +375,14 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); +void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} +static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ @@ -439,6 +441,8 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); + #else /* CONFIG_KASAN_VMALLOC */ static inline int kasan_populate_vmalloc(unsigned long start, @@ -456,6 +460,10 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_start, unsigned long free_region_end) {} +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } + #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4009e24f4dff..60b097643f45 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -247,6 +247,7 @@ extern bool early_boot_irqs_disabled; extern enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, + SYSTEM_FREEING_INITMEM, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 3fe6dd8a18c1..4b5e3679a72c 100644 --- 
a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -14,6 +14,9 @@ #ifdef CONFIG_KFENCE +#include <linux/atomic.h> +#include <linux/static_key.h> /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an @@ -22,13 +25,8 @@ #define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE) extern char *__kfence_pool; -#ifdef CONFIG_KFENCE_STATIC_KEYS -#include <linux/static_key.h> DECLARE_STATIC_KEY_FALSE(kfence_allocation_key); -#else -#include <linux/atomic.h> extern atomic_t kfence_allocation_gate; -#endif /** * is_kfence_address() - check if an address belongs to KFENCE pool @@ -116,13 +114,16 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags); */ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { -#ifdef CONFIG_KFENCE_STATIC_KEYS - if (static_branch_unlikely(&kfence_allocation_key)) +#if defined(CONFIG_KFENCE_STATIC_KEYS) || CONFIG_KFENCE_SAMPLE_INTERVAL == 0 + if (!static_branch_unlikely(&kfence_allocation_key)) + return NULL; #else - if (unlikely(!atomic_read(&kfence_allocation_gate))) + if (!static_branch_likely(&kfence_allocation_key)) + return NULL; #endif - return __kfence_alloc(s, size, flags); - return NULL; + if (likely(atomic_read(&kfence_allocation_gate))) + return NULL; + return __kfence_alloc(s, size, flags); } /**
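For context, this is roughly how the gate above is meant to be consumed; example_slab_alloc() is a hypothetical stand-in for the slab fast path, which is the real caller of kfence_alloc():

static void *example_slab_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
	/*
	 * Nearly always NULL: the static branch keeps this check almost
	 * free while KFENCE is disabled or waiting out a sample interval.
	 */
	void *obj = kfence_alloc(s, size, flags);

	if (unlikely(obj))
		return obj;	/* KFENCE chose to sample this allocation */

	/* ... otherwise fall through to the regular slab allocation ... */
	return NULL;
}

diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 34de69b3b8ba..7df557b16c1e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -28,17 +28,26 @@ extern unsigned long long max_possible_pfn; /** * enum memblock_flags - definition of memory region attributes * @MEMBLOCK_NONE: no special request - * @MEMBLOCK_HOTPLUG: hotpluggable region + * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory + * map during early boot as hot(un)pluggable system RAM (e.g., memory range + * that might get hotunplugged later). With "movable_node" set on the kernel + * commandline, try keeping this memory region hotunpluggable. Does not apply + * to memblocks added ("hotplugged") after early boot. * @MEMBLOCK_MIRROR: mirrored region * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as * reserved in the memory map; refer to memblock_mark_nomap() description * for further details + * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added + * via a driver, and never indicated in the firmware-provided memory map as + * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the + * kernel resource tree.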
*/ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ + MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ }; /** @@ -100,10 +109,11 @@ static inline void memblock_discard(void) {} #endif void memblock_allow_resize(void); -int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); +int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, + enum memblock_flags flags); int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); -int memblock_free(phys_addr_t base, phys_addr_t size); +int memblock_phys_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int memblock_physmem_add(phys_addr_t base, phys_addr_t size); @@ -118,7 +128,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); -void memblock_free_ptr(void *ptr, size_t size); +void memblock_free(void *ptr, size_t size); void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); @@ -133,7 +143,7 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __memblock_free_late(phys_addr_t base, phys_addr_t size); +void memblock_free_late(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, @@ -208,7 +218,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_HOTPLUG, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \ + p_start, p_end, NULL) /** * for_each_mem_range_rev - reverse iterate through memblock areas from @@ -219,7 +230,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range_rev(i, p_start, p_end) \ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_HOTPLUG, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\ + p_start, p_end, NULL) /** * for_each_reserved_mem_range - iterate over all reserved memblock areas @@ -249,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } +static inline bool memblock_is_driver_managed(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_DRIVER_MANAGED; +} + int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -441,23 +458,6 @@ static inline void *memblock_alloc_node(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } -static inline void memblock_free_early(phys_addr_t base, - phys_addr_t size) -{ - memblock_free(base, size); -} - -static inline void memblock_free_early_nid(phys_addr_t base, - phys_addr_t size, int nid) -{ - memblock_free(base, size); -} - -static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) -{ - __memblock_free_late(base, size); -} - /* * Set the allocation direction to bottom-up or top-down. 
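Tying the new flag to the API change above: memory that a driver detects on its own is added with MEMBLOCK_DRIVER_MANAGED so that the iterators above skip it. A minimal sketch; example_add_device_ram() and its probe path are assumptions, memblock_add_node() is the prototype patched above:

static int example_add_device_ram(phys_addr_t base, phys_addr_t size, int nid)
{
	/*
	 * Tagging the range keeps for_each_mem_range() and friends from
	 * treating driver-detected memory like firmware-reported RAM.
	 */
	return memblock_add_node(base, size, nid, MEMBLOCK_DRIVER_MANAGED);
}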
*/ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e34bf0cbdf55..0c5c403f4be6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,12 +180,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -enum memcg_kmem_state { - KMEM_NONE, - KMEM_ALLOCATED, - KMEM_ONLINE, -}; - #if defined(CONFIG_SMP) struct memcg_padding { char x[0]; @@ -318,7 +312,6 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; - enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; struct list_head objcg_list; /* list of inherited objcgs */ #endif @@ -1667,7 +1660,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { - if (time_before(jiffies, memcg->socket_pressure)) + if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/include/linux/memory.h b/include/linux/memory.h index 182c606adb06..88eb587b5143 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,7 +96,6 @@ struct memory_notify { unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; - int status_change_nid_high; int status_change_nid; }; @@ -110,7 +109,7 @@ struct mem_section; #define SLAB_CALLBACK_PRI 1 #define IPC_CALLBACK_PRI 10 -#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) { return; @@ -126,7 +125,14 @@ static inline int memory_notify(unsigned long val, void *v) { return 0; } -#else +static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) +{ + return 0; +} +/* These aren't inline functions due to a GCC bug. 
*/ +#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) +#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) +#else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, @@ -140,7 +146,6 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); -#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ ... @@ static inline void zone_seqlock_init(struct zone *zone) { seqlock_init(&zone->span_seqlock); } -extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); -extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); -extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); extern void adjust_present_page_count(struct page *page, struct memory_group *group, long nr_pages);
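The notifier stubs above compile away when CONFIG_MEMORY_HOTPLUG is off; with it on, a subsystem hooks in as sketched below. example_mem_callback() is hypothetical; hotplug_memory_notifier(), struct memory_notify, and IPC_CALLBACK_PRI appear in this hunk, and MEM_ONLINE is the existing online notification action:

static int example_mem_callback(struct notifier_block *nb,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	/* status_change_nid_high is gone; status_change_nid covers it. */
	if (action == MEM_ONLINE && mn->status_change_nid >= 0)
		pr_info("node %d gained its first memory\n",
			mn->status_change_nid);
	return NOTIFY_OK;
}

static int __init example_subsys_init(void)
{
	return hotplug_memory_notifier(example_mem_callback, IPC_CALLBACK_PRI);
}

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4091692bed8c..3c7595e81150 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -184,8 +183,6 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); -extern bool numa_demotion_enabled; - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } @@ -301,8 +298,6 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } -#define numa_demotion_enabled false - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0d2aeb9b0f66..eeb818c4fc78 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,24 +19,7 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 -/* - * Keep sync with: - * - macro MIGRATE_REASON in include/trace/events/migrate.h - * - migrate_reason_names[MR_TYPES] in mm/debug.c - */ -enum migrate_reason { - MR_COMPACTION, - MR_MEMORY_FAILURE, - MR_MEMORY_HOTPLUG, - MR_SYSCALL, /* also applies to cpusets */ - MR_MEMPOLICY_MBIND, - MR_NUMA_MISPLACED, - MR_CONTIG_RANGE, - MR_LONGTERM_PIN, - MR_DEMOTION, - MR_TYPES -}; - +/* Defined in mm/debug.c: */ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION @@ -61,6 +44,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio); void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); + +extern bool numa_demotion_enabled; #else static inline void putback_movable_pages(struct list_head *l) {} @@ -86,6 +71,8 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } + +#define numa_demotion_enabled false #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_COMPACTION diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index 883c99249033..f37cc03f9369 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -19,4 +19,17 @@ enum migrate_mode { MIGRATE_SYNC_NO_COPY, }; +enum migrate_reason { + MR_COMPACTION, + MR_MEMORY_FAILURE, + MR_MEMORY_HOTPLUG, + MR_SYSCALL, /* also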
diff --git a/include/linux/mm.h b/include/linux/mm.h index a62b91e769c8..a7e4a9e7d807 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -794,40 +794,6 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} - -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return kvmalloc(bytes, flags); -} - -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} - -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); -extern void kvfree(const void *addr); -extern void kvfree_sensitive(const void *addr, size_t len); - static inline int head_compound_mapcount(struct page *head) { return atomic_read(compound_mapcount_ptr(head)) + 1; @@ -904,6 +870,8 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); +unsigned long nr_free_buffer_pages(void); + /* * Compound pages have a destructor function. Provide a * prototype for that function and accessor functions. @@ -1861,12 +1829,24 @@ extern void user_shm_unlock(size_t, struct ucounts *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ + struct address_space *zap_mapping; /* Check page->mapping if set */ struct page *single_page; /* Locked page to be unmapped */ }; +/* + * We set details->zap_mapping when we want to unmap shared but keep private + * pages. Return true if we should skip zapping this page, false otherwise. + */
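A minimal userspace mock (the mock_* types are hypothetical; this is not kernel code) of the skip rule that the zap_skip_check_mapping() helper just below implements: a page is zapped unless a mapping filter is set and the page belongs to some other mapping::

    #include <stdbool.h>
    #include <stdio.h>

    struct mock_mapping { int id; };
    struct mock_page { struct mock_mapping *mapping; };
    struct mock_zap_details { struct mock_mapping *zap_mapping; };

    /* mirrors the helper: no details (or no filter) means "zap everything" */
    static bool skip_page(const struct mock_zap_details *details,
                          const struct mock_page *page)
    {
        if (!details || !page)
            return false;
        return details->zap_mapping && details->zap_mapping != page->mapping;
    }

    int main(void)
    {
        struct mock_mapping shared = { 1 }, private = { 2 };
        struct mock_page page = { &private };
        struct mock_zap_details details = { &shared };

        printf("%d\n", skip_page(&details, &page)); /* 1: other mapping, skip */
        printf("%d\n", skip_page(NULL, &page));     /* 0: no filter, zap */
        return 0;
    }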
+static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -2576,7 +2556,7 @@ static inline unsigned long get_num_physpages(void) * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) + * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); @@ -2604,6 +2584,7 @@ extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); +extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f7326c8704bb..bb8c6f5f19bc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -114,10 +114,8 @@ struct page { struct page *next; #ifdef CONFIG_64BIT int pages; /* Nr of pages left */ - int pobjects; /* Approximate count */ #else short int pages; - short int pobjects; #endif }; };
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6a1d79d84675..58e744b78c2c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -199,6 +199,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ @@ -272,6 +273,13 @@ enum lru_list { NR_LRU_LISTS }; +enum vmscan_throttle_state { + VMSCAN_THROTTLE_WRITEBACK, + VMSCAN_THROTTLE_ISOLATED, + VMSCAN_THROTTLE_NOPROGRESS, + NR_VMSCAN_THROTTLE, +}; + #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) @@ -841,6 +849,13 @@ typedef struct pglist_data { int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; + + /* wait queues for throttling reclaim for different reasons. */ + wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; + + atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ + unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started.
*/ struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; @@ -1220,6 +1235,28 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) +/* Whether the 'nodes' are all movable nodes */ +static inline bool movable_only_nodes(nodemask_t *nodes) +{ + struct zonelist *zonelist; + struct zoneref *z; + int nid; + + if (nodes_empty(*nodes)) + return false; + + /* + * We can choose an arbitrary node from the nodemask to get a + * zonelist as they are interlinked. We just need to find + * at least one zone that can satisfy kernel allocations. + */ + nid = first_node(*nodes); + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; + z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); + return (!z->zone) ? true : false; +} + + #ifdef CONFIG_SPARSEMEM #include #endif @@ -1481,7 +1518,7 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - ms = __nr_to_section(pfn_to_section_nr(pfn)); + ms = __pfn_to_section(pfn); if (!valid_section(ms)) return 0; /* @@ -1496,7 +1533,7 @@ static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return present_section(__nr_to_section(pfn_to_section_nr(pfn))); + return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr)
diff --git a/include/linux/node.h b/include/linux/node.h index 8e5a29897936..bb21fd631b16 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -85,7 +85,7 @@ struct node { struct device dev; struct list_head access_list; -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) struct work_struct node_work; #endif #ifdef CONFIG_HMEM_REPORTING @@ -98,7 +98,7 @@ struct memory_block; extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, enum meminit_context context);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 981341a3c3c4..52ec4b5e5615 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -245,7 +245,7 @@ static __always_inline int PageCompound(struct page *page) #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return page->flags == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM
diff --git a/include/linux/pci.h b/include/linux/pci.h index 35df61824d02..5950ed6035fc 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -342,7 +342,6 @@ struct pci_dev { u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */ unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */ - struct pci_driver *driver; /* Driver bound to this device */ u64 dma_mask; /* Mask of the bits of bus address this device implements. Normally this is 0xffffffff. You only need to change @@ -900,7 +899,10 @@ struct pci_driver { struct pci_dynids dynids; }; -#define to_pci_driver(drv) container_of(drv, struct pci_driver, driver) +static inline struct pci_driver *to_pci_driver(struct device_driver *drv) +{ + return drv ? container_of(drv, struct pci_driver, driver) : NULL; +}
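The macro-to-inline conversion matters because container_of() applied to a NULL pointer does not yield NULL; it yields a small bogus address just below zero. A standalone mock (the mock_* types are hypothetical) of the hazard and the guard::

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct mock_driver { int flags; };
    struct mock_pci_driver { long pad; struct mock_driver driver; };

    static struct mock_pci_driver *to_mock_pci_driver(struct mock_driver *drv)
    {
        /* without the NULL check, drv == NULL would "succeed" and return
         * a non-NULL pointer offset below address zero instead of NULL */
        return drv ? container_of(drv, struct mock_pci_driver, driver) : NULL;
    }

    int main(void)
    {
        printf("%p\n", (void *)to_mock_pci_driver(NULL)); /* (nil) on glibc */
        return 0;
    }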
/** * PCI_DEVICE - macro used to describe a specific PCI device @@ -1350,6 +1352,8 @@ void pci_unlock_rescan_remove(void); /* Vital Product Data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); +ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf); +ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); @@ -1502,19 +1506,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_ALL_TYPES \ (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX) -/* kmem_cache style wrapper around pci_alloc_consistent() */ - #include -#define pci_pool dma_pool -#define pci_pool_create(name, pdev, size, align, allocation) \ - dma_pool_create(name, &pdev->dev, size, align, allocation) -#define pci_pool_destroy(pool) dma_pool_destroy(pool) -#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) -#define pci_pool_zalloc(pool, flags, handle) \ - dma_pool_zalloc(pool, flags, handle) -#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) - struct msix_entry { u32 vector; /* Kernel uses to write allocated vector */ u16 entry; /* Driver uses to specify entry, OS writes */ @@ -2130,7 +2123,7 @@ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state); -int pcibios_add_device(struct pci_dev *dev); +int pcibios_device_add(struct pci_dev *dev); void pcibios_release_device(struct pci_dev *dev); #ifdef CONFIG_PCI void pcibios_penalize_isa_irq(int irq, int active);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..98a9371133f8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); @@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); extern void __init setup_per_cpu_areas(void); #endif -extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); -extern void __percpu *__alloc_percpu(size_t size, size_t align); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); +extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
diff --git a/include/linux/slab.h b/include/linux/slab.h index 083f3ce550bc..181045148b06 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -142,8 +142,6 @@ struct mem_cgroup; void __init kmem_cache_init(void); bool slab_is_available(void); -extern bool usercopy_fallback; - struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); @@ -152,8 +150,8 @@ struct kmem_cache
*kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); -void kmem_cache_destroy(struct kmem_cache *); -int kmem_cache_shrink(struct kmem_cache *); +void kmem_cache_destroy(struct kmem_cache *s); +int kmem_cache_shrink(struct kmem_cache *s); /* * Please use this macro to create slab caches. Simply specify the @@ -181,11 +179,11 @@ int kmem_cache_shrink(struct kmem_cache *); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *, size_t, gfp_t); -void kfree(const void *); -void kfree_sensitive(const void *); -size_t __ksize(const void *); -size_t ksize(const void *); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); +void kfree(const void *objp); +void kfree_sensitive(const void *objp); +size_t __ksize(const void *objp); +size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -425,9 +423,9 @@ static __always_inline unsigned int __kmalloc_index(size_t size, #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; -void kmem_cache_free(struct kmem_cache *, void *); +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; +void kmem_cache_free(struct kmem_cache *s, void *objp); /* * Bulk allocation and freeing operations. These are accelerated in an @@ -436,8 +434,8 @@ void kmem_cache_free(struct kmem_cache *, void *); * * Note that interrupts must be enabled when calling these functions. 
*/ -void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); /* * Caller must not use kfree_bulk() on memory not originally allocated @@ -449,10 +447,12 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment + __alloc_size(1); +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment + __malloc; #else -static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc(size, flags); } @@ -464,25 +464,24 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; +extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_slab_alignment __alloc_size(3); #ifdef CONFIG_NUMA -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment __malloc; +extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_slab_alignment + __alloc_size(4); #else -static __always_inline void * -kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, int node, size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ -static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, + gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -490,10 +489,8 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, return ret; } -static __always_inline void * -kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -502,19 +499,21 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment + __alloc_size(1); #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) + __assume_page_alignment __alloc_size(1); #else -static __always_inline void * -kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +static __always_inline 
__alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, + unsigned int order) { return kmalloc_order(size, flags, order); } #endif -static __always_inline void *kmalloc_large(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) { unsigned int order = get_order(size); return kmalloc_order_trace(size, flags, order); @@ -574,7 +573,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Try really hard to succeed the allocation but fail * eventually. */ -static __always_inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { #ifndef CONFIG_SLOB @@ -596,7 +595,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && @@ -620,7 +619,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -638,8 +637,10 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static __must_check inline void * -krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) +static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -655,7 +656,7 @@ krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) { return kmalloc_array(n, size, flags | __GFP_ZERO); } @@ -668,12 +669,13 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. 
*/ -extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) + __alloc_size(1); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) -static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, - int node) +static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) { size_t bytes; @@ -684,14 +686,15 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, return __kmalloc_node(bytes, flags, node); } -static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) { return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } #ifdef CONFIG_NUMA -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); +extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) @@ -716,7 +719,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) { return kmalloc(size, flags | __GFP_ZERO); } @@ -727,11 +730,45 @@ static inline void *kzalloc(size_t size, gfp_t flags) * @flags: the type of memory to allocate (see kmalloc). * @node: memory node from which to allocate */ -static inline void *kzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags | __GFP_ZERO, node); } +extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); +static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) +{ + return kvmalloc_node(size, flags, NUMA_NO_NODE); +} +static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) +{ + return kvmalloc_node(size, flags | __GFP_ZERO, node); +} +static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + +static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + + return kvmalloc(bytes, flags); +} + +static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) +{ + return kvmalloc_array(n, size, flags | __GFP_ZERO); +} + +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + __alloc_size(3); +extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); + unsigned int kmem_cache_size(struct kmem_cache *s); void __init kmem_cache_init_late(void); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 85499f0586b0..0fa751b946fa 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,6 +99,8 @@ struct kmem_cache { #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; + /* Number of per cpu partial pages to keep around */ + unsigned int cpu_partial_pages; #endif struct kmem_cache_order_objects oo; 
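The __alloc_size(N) annotations spreading through this header tell the compiler which parameter carries the allocation size, so __builtin_object_size()-based checking (FORTIFY_SOURCE, -Warray-bounds) can see through custom allocators, not just libc malloc(). A standalone illustration, assuming a GCC or Clang toolchain that honors the attribute::

    #include <stdio.h>
    #include <stdlib.h>

    /* same idea as the kernel's __alloc_size(1) annotation */
    __attribute__((malloc, alloc_size(1)))
    static void *my_alloc(size_t size)
    {
        return malloc(size);
    }

    int main(void)
    {
        char *p = my_alloc(16);

        /* the attribute lets the compiler track p's size: prints 16 when
         * built with optimization (-O2); (size_t)-1 means "unknown" */
        printf("%zu\n", __builtin_object_size(p, 0));
        free(p);
        return 0;
    }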
@@ -141,17 +143,6 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_cpu_partial(s) ((s)->cpu_partial) -#define slub_set_cpu_partial(s, n) \ -({ \ - slub_cpu_partial(s) = (n); \ -}) -#else -#define slub_cpu_partial(s) (0) -#define slub_set_cpu_partial(s, n) -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS void sysfs_slab_unlink(struct kmem_cache *);
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 6bb4bc1a5f54..d29860966bc9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,16 +11,20 @@ #ifndef _LINUX_STACKDEPOT_H #define _LINUX_STACKDEPOT_H +#include <linux/gfp.h> + typedef u32 depot_stack_handle_t; +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); - #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); #else
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9edecb494e9e..bef158815e83 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, unsigned int size, unsigned int skipnr); unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); /* Internal interfaces. Do not use in generic code */ #ifdef CONFIG_ARCH_STACKWALK
diff --git a/include/linux/swap.h b/include/linux/swap.h index cdf0957a88a4..d1ea44b31f19 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -341,7 +341,6 @@ void workingset_update_node(struct xa_node *node); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; -extern unsigned long nr_free_buffer_pages(void); /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 082f1d51957a..be24056ac00f 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -19,6 +19,7 @@ #define SWITCHTEC_EVENT_EN_CLI BIT(2) #define SWITCHTEC_EVENT_EN_IRQ BIT(3) #define SWITCHTEC_EVENT_FATAL BIT(4) +#define SWITCHTEC_EVENT_NOT_SUPP BIT(31) #define SWITCHTEC_DMA_MRPC_EN BIT(0)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 671d402c3778..6e022cc712e6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,7 +22,7 @@ struct notifier_block; /* in notifier.h */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ -#define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ @@ -136,21 +136,21 @@ static inline void
vmalloc_init(void) static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -extern void *vmalloc(unsigned long size); -extern void *vzalloc(unsigned long size); -extern void *vmalloc_user(unsigned long size); -extern void *vmalloc_node(unsigned long size, int node); -extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_32(unsigned long size); -extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); +extern void *vmalloc(unsigned long size) __alloc_size(1); +extern void *vzalloc(unsigned long size) __alloc_size(1); +extern void *vmalloc_user(unsigned long size) __alloc_size(1); +extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vmalloc_32(unsigned long size) __alloc_size(1); +extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, - const void *caller); + const void *caller) __alloc_size(1); void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, - int node, const void *caller); -void *vmalloc_no_huge(unsigned long size); + int node, const void *caller) __alloc_size(1); +void *vmalloc_no_huge(unsigned long size) __alloc_size(1); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index 0abff67b96f0..14db8044c1ff 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -13,7 +13,7 @@ struct mm_struct; extern int trace_mmap_lock_reg(void); extern void trace_mmap_lock_unreg(void); -TRACE_EVENT_FN(mmap_lock_start_locking, +DECLARE_EVENT_CLASS(mmap_lock, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), @@ -32,15 +32,23 @@ TRACE_EVENT_FN(mmap_lock_start_locking, ), TP_printk( - "mm=%p memcg_path=%s write=%s\n", + "mm=%p memcg_path=%s write=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg + ) ); +#define DEFINE_MMAP_LOCK_EVENT(name) \ + DEFINE_EVENT_FN(mmap_lock, name, \ + TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ + bool write), \ + TP_ARGS(mm, memcg_path, write), \ + trace_mmap_lock_reg, trace_mmap_lock_unreg) + +DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); +DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); + TRACE_EVENT_FN(mmap_lock_acquire_returned, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, @@ -63,7 +71,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s\n", + "mm=%p memcg_path=%s write=%s success=%s", __entry->mm, __get_str(memcg_path), __entry->write ? 
"true" : "false", @@ -73,34 +81,6 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, trace_mmap_lock_reg, trace_mmap_lock_unreg ); -TRACE_EVENT_FN(mmap_lock_released, - - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), - - TP_ARGS(mm, memcg_path, write), - - TP_STRUCT__entry( - __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) - __field(bool, write) - ), - - TP_fast_assign( - __entry->mm = mm; - __assign_str(memcg_path, memcg_path); - __entry->write = write; - ), - - TP_printk( - "mm=%p memcg_path=%s write=%s\n", - __entry->mm, - __get_str(memcg_path), - __entry->write ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg -); - #endif /* _TRACE_MMAP_LOCK_H */ /* This part must be outside protection */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 88faf2400ec2..f25a6149d3ba 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -27,6 +27,18 @@ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" +#define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) +#define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) +#define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS) + +#define show_throttle_flags(flags) \ + (flags) ? __print_flags(flags, "|", \ + {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ + {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \ + {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"} \ + ) : "VMSCAN_THROTTLE_NONE" + + #define trace_reclaim_flags(file) ( \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ @@ -454,6 +466,32 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, TP_ARGS(nr_reclaimed) ); +TRACE_EVENT(mm_vmscan_throttled, + + TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason), + + TP_ARGS(nid, usec_timeout, usec_delayed, reason), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, usec_timeout) + __field(int, usec_delayed) + __field(int, reason) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->usec_timeout = usec_timeout; + __entry->usec_delayed = usec_delayed; + __entry->reason = 1U << reason; + ), + + TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s", + __entry->nid, + __entry->usec_timeout, + __entry->usec_delayed, + show_throttle_flags(__entry->reason)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7dccb66474f7..a345b1e12daf 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -763,13 +763,6 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_ARGS(usec_timeout, usec_delayed) ); -DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, - - TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), - - TP_ARGS(usec_timeout, usec_delayed) -); - DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 64553df9d735..bd1932c2074d 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -20,6 +20,7 @@ #define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ +#define FAN_FS_ERROR 0x00008000 /* Filesystem error */ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 
0x00020000 /* File accessed in perm check */ @@ -125,6 +126,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_DFID_NAME 2 #define FAN_EVENT_INFO_TYPE_DFID 3 #define FAN_EVENT_INFO_TYPE_PIDFD 4 +#define FAN_EVENT_INFO_TYPE_ERROR 5 /* Variable length info record following event metadata */ struct fanotify_event_info_header { @@ -159,6 +161,12 @@ struct fanotify_event_info_pidfd { __s32 pidfd; }; +struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; +}; + struct fanotify_response { __s32 fd; __u32 response; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index e709ae8235e7..ff6ccbc6efe9 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -504,6 +504,12 @@ #define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */ #define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */ #define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */ +#define PCI_EXP_DEVCTL_PAYLOAD_128B 0x0000 /* 128 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_256B 0x0020 /* 256 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_512B 0x0040 /* 512 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_1024B 0x0060 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_2048B 0x0080 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_PAYLOAD_4096B 0x00a0 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ #define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */ #define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ diff --git a/init/Kconfig b/init/Kconfig index 9b94cc1b5546..5e82f7f77abd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -901,7 +901,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. 
The mechanism is quite primitive and is based on migrating memory when
diff --git a/init/initramfs.c b/init/initramfs.c index a842c0544745..2f3d96dc3db6 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -607,7 +607,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); - memblock_free(__pa(aligned_start), aligned_end - aligned_start); + memblock_free((void *)aligned_start, aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
diff --git a/init/main.c b/init/main.c index 183f861707e5..0c4a6e0d8234 100644 --- a/init/main.c +++ b/init/main.c @@ -381,7 +381,7 @@ static char * __init xbc_make_cmdline(const char *key) ret = xbc_snprint_cmdline(new_cmdline, len + 1, root); if (ret < 0 || ret > len) { pr_err("Failed to print extra kernel cmdline.\n"); - memblock_free_ptr(new_cmdline, len + 1); + memblock_free(new_cmdline, len + 1); return NULL; } @@ -916,7 +916,7 @@ static void __init print_unknown_bootoptions(void) end += sprintf(end, " %s", *p); pr_notice("Unknown command line parameters:%s\n", unknown_options); - memblock_free_ptr(unknown_options, len); + memblock_free(unknown_options, len); } asmlinkage __visible void __init __no_sanitize_address start_kernel(void) @@ -1497,6 +1497,8 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + + system_state = SYSTEM_FREEING_INITMEM; kprobe_free_init_mem(); ftrace_free_init_mem(); kgdb_free_init_mem();
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 60739d5e3373..02348b48447c 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); - if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) || - WARN_ON_ONCE(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 698b62b4a2ec..713b256be944 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -473,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, parent = container_of(inode_mark, struct audit_parent, mark); - if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) || - WARN_ON_ONCE(!inode)) + if (WARN_ON_ONCE(inode_mark->group != audit_watch_group)) return 0; if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2a9695ccb65f..d0e163a02099 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -69,6 +69,13 @@ DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); +/* + * There could be abnormal cpuset configurations for cpu or memory + * node binding; add this key to provide a quick, low-cost judgement + * of the situation. + */ +DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); + /* See "Frequency meter" comments, below.
*/ struct fmeter { @@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); +static inline void check_insane_mems_config(nodemask_t *nodes) +{ + if (!cpusets_insane_config() && + movable_only_nodes(nodes)) { + static_branch_enable(&cpusets_insane_config_key); + pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" + "Cpuset allocations might fail even with a lot of memory available.\n", + nodemask_pr_args(nodes)); + } +} + /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting @@ -1870,6 +1888,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; + check_insane_mems_config(&trialcs->mems_allowed); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; spin_unlock_irq(&callback_lock); @@ -3173,6 +3193,9 @@ update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); + if (mems_updated) + check_insane_mems_config(&new_mems); + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6fab31ea47d6..8e840fbbed7c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -247,7 +247,7 @@ swiotlb_init(int verbose) return; fail_free_mem: - memblock_free_early(__pa(tlb), bytes); + memblock_free(tlb, bytes); fail: pr_warn("Cannot allocate buffer"); } diff --git a/kernel/extable.c b/kernel/extable.c index b0ea5eb0c3b4..290661f68e6b 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -76,7 +76,7 @@ int notrace core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state < SYSTEM_RUNNING && + if (system_state < SYSTEM_FREEING_INITMEM && init_kernel_text(addr)) return 1; return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4d8fc65cf38f..bf38c546aa25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -744,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d, return 0; } -static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, - unsigned int count, - struct irq_fwspec *fwspec) +void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec) { int i; @@ -756,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, for (i = 0; i < count; i++) fwspec->param[i] = args[i]; } +EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { @@ -1502,6 +1502,7 @@ out_free_desc: irq_free_descs(virq, nr_irqs); return ret; } +EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 33400ff051a8..8347fc158d2b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, if (kbuf->image->type == KEXEC_TYPE_CRASH) return func(&crashk_res, kbuf); + /* + * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See + * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in + * locate_mem_hole_callback(). 
+ */ if (kbuf->top_down) { for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE, &mstart, &mend, NULL) {
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7096384dc60f..74d371665747 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ +/* + * Check if an address is part of freed initmem. After initmem is freed, + * memory can be allocated from it, and such allocations would then have + * addresses within the range [_stext, _end]. + */ +#ifndef arch_is_kernel_initmem_freed +static int arch_is_kernel_initmem_freed(unsigned long addr) +{ + if (system_state < SYSTEM_FREEING_INITMEM) + return 0; + + return init_section_contains((void *)addr, 1); +} +#endif + static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9e5dfb1896a9..013bfd6dcc34 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free_ptr(new_descs, new_descs_size); + memblock_free(new_descs, new_descs_size); err_free_log_buf: - memblock_free_ptr(new_log_buf, new_log_buf_len); + memblock_free(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 30169c7685b6..d201a7052a29 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1492,7 +1492,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9f8117c7cfdd..9c625257023d 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/interrupt.h> /** * stack_trace_print - Print the entries in the stack trace @@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) #endif /* CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* !CONFIG_ARCH_STACKWALK */ + +static inline bool in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +/** + * filter_irq_stacks - Find first IRQ stack entry in trace + * @entries: Pointer to stack trace array + * @nr_entries: Number of entries in the storage array + * + * Return: Number of trace entries until IRQ stack starts. + */ +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries) +{ + unsigned int i; + + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { + /* Include the irqentry function into the stack. */ + return i + 1; + } } return nr_entries; } EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 257ffb993ea2..f00de83d0246 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk, * the rest of the math is done in xacct_add_tsk.
*/ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; + tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10; } /**
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1a7df882f55e..613917bbc4e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1351,7 +1351,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct worker_pool *pool = pwq->pool; /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6fdbf9613aec..9ef7ce18b4f5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -877,7 +877,7 @@ config DEBUG_MEMORY_INIT config MEMORY_NOTIFIER_ERROR_INJECT tristate "Memory hotplug notifier error injection module" - depends on MEMORY_HOTPLUG_SPARSE && NOTIFIER_ERROR_INJECTION + depends on MEMORY_HOTPLUG && NOTIFIER_ERROR_INJECTION help This option provides the ability to inject artificial errors to memory hotplug notifier chain callbacks. It is controlled through
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index e641add33947..912f252a41fc 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -25,17 +25,6 @@ menuconfig KFENCE if KFENCE -config KFENCE_STATIC_KEYS - bool "Use static keys to set up allocations" - default y - depends on JUMP_LABEL # To ensure performance, require jump labels - help - Use static keys (static branches) to set up KFENCE allocations. Using - static keys is normally recommended, because it avoids a dynamic - branch in the allocator's fast path. However, with very low sample - intervals, or on systems that do not support jump labels, a dynamic - branch may still be an acceptable performance trade-off. - config KFENCE_SAMPLE_INTERVAL int "Default sample interval in milliseconds" default 100 @@ -56,6 +45,21 @@ config KFENCE_NUM_OBJECTS pages are required; with one containing the object and two adjacent ones used as guard pages. +config KFENCE_STATIC_KEYS + bool "Use static keys to set up allocations" if EXPERT + depends on JUMP_LABEL + help + Use static keys (static branches) to set up KFENCE allocations. This + option is only recommended when using very large sample intervals, or + when performance has been carefully evaluated with this option. + + Using static keys comes with trade-offs that need to be carefully + evaluated given target workloads and system architectures. Notably, + enabling and disabling static keys invoke IPI broadcasts, the latency + and impact of which are much harder to predict than a dynamic branch. + + Say N if you are unsure.
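The workqueue hunk above switches to kasan_record_aux_stack_noalloc() because insert_work() may run under a raw spinlock, where allocating stack depot space is unsafe; it relies on the can_alloc plumbing that __stack_depot_save() introduces further below. A standalone sketch of the underlying lookup-or-allocate pattern (hypothetical entry cache, not the depot's real data structures)::

    #include <stdbool.h>
    #include <stdlib.h>
    #include <string.h>

    struct entry { char key[32]; struct entry *next; };
    static struct entry *head;

    /* find key; insert it only when allocation is permitted */
    static struct entry *save(const char *key, bool can_alloc)
    {
        struct entry *e;

        for (e = head; e; e = e->next)
            if (strcmp(e->key, key) == 0)
                return e;       /* hit: no allocation needed */
        if (!can_alloc)
            return NULL;        /* miss in an atomic context: just fail */
        e = calloc(1, sizeof(*e));
        if (!e)
            return NULL;
        strncpy(e->key, key, sizeof(e->key) - 1);
        e->next = head;
        head = e;
        return e;
    }

    int main(void)
    {
        struct entry *e = save("swapper/0", true);           /* miss: allocates */
        return (e && save("swapper/0", false) == e) ? 0 : 1; /* hit: lookup only */
    }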
+ config KFENCE_STRESS_TEST_FAULTS int "Stress testing of fault handling and error reporting" if EXPERT default 0
diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 70e0d52ffd24..74f3201ab8e5 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -50,7 +50,7 @@ static inline void * __init xbc_alloc_mem(size_t size) static inline void __init xbc_free_mem(void *addr, size_t size) { - memblock_free_ptr(addr, size); + memblock_free(addr, size); } #else /* !__KERNEL__ */
diff --git a/lib/cpumask.c b/lib/cpumask.c index c3c76b833384..a971a82d2f43 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - memblock_free_early(__pa(mask), cpumask_size()); + memblock_free(mask, cpumask_size()); } #endif
diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 0a2e417f83cb..09485dc5bd12 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -20,7 +20,6 @@ */ #include -#include <linux/interrupt.h> #include #include #include @@ -102,8 +101,8 @@ static bool init_stack_slab(void **prealloc) } /* Allocation of a new stack in raw storage */ -static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, - u32 hash, void **prealloc, gfp_t alloc_flags) +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) { struct stack_record *stack; size_t required_size = struct_size(stack, entries, size); @@ -248,17 +247,28 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, EXPORT_SYMBOL_GPL(stack_depot_fetch); /** - * stack_depot_save - Save a stack trace from an array + * __stack_depot_save - Save a stack trace from an array * * @entries: Pointer to storage array * @nr_entries: Size of the storage array * @alloc_flags: Allocation gfp flags + * @can_alloc: Allocate stack slabs (increased chance of failure if false) * - * Return: The handle of the stack struct stored in depot + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, it is allowed to replenish the stack slab pool in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and will fail if no space is left to store the stack trace. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case from contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: The handle of the stack struct stored in depot, 0 on failure. */ -depot_stack_handle_t stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t alloc_flags) +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; depot_stack_handle_t retval = 0; @@ -291,7 +301,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). */ - if (unlikely(!smp_load_acquire(&next_slab_inited))) { + if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { /* * Zero out zone modifiers, as we don't have specific zone * requirements.
Keep the flags related to allocation in atomic @@ -309,9 +319,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = - depot_alloc_stack(entries, nr_entries, - hash, &prealloc, alloc_flags); + struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + if (new) { new->next = *bucket; /* @@ -340,27 +349,24 @@ exit: fast_exit: return retval; } +EXPORT_SYMBOL_GPL(__stack_depot_save); + +/** + * stack_depot_save - Save a stack trace from an array + * + * @entries: Pointer to storage array + * @nr_entries: Size of the storage array + * @alloc_flags: Allocation gfp flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: The handle of the stack struct stored in depot, 0 on failure. + */ +depot_stack_handle_t stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags) +{ + return __stack_depot_save(entries, nr_entries, alloc_flags, true); +} EXPORT_SYMBOL_GPL(stack_depot_save); - -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -unsigned int filter_irq_stacks(unsigned long *entries, - unsigned int nr_entries) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (in_irqentry_text(entries[i])) { - /* Include the irqentry function into the stack. */ - return i + 1; - } - } - return nr_entries; -} -EXPORT_SYMBOL_GPL(filter_irq_stacks); diff --git a/lib/test_kasan.c b/lib/test_kasan.c index ebed755ebf34..67ed689a0b1b 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -440,6 +440,7 @@ static void kmalloc_oob_memset_2(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); kfree(ptr); } @@ -452,6 +453,7 @@ static void kmalloc_oob_memset_4(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); kfree(ptr); } @@ -464,6 +466,7 @@ static void kmalloc_oob_memset_8(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); kfree(ptr); } @@ -476,6 +479,7 @@ static void kmalloc_oob_memset_16(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); kfree(ptr); } @@ -488,16 +492,17 @@ static void kmalloc_oob_in_memset(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + KASAN_GRANULE_SIZE)); kfree(ptr); } -static void kmalloc_memmove_invalid_size(struct kunit *test) +static void kmalloc_memmove_negative_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = -2; + size_t invalid_size = -2; /* * Hardware tag-based mode doesn't check memmove for negative size. 
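OPTIMIZER_HIDE_VAR() is sprinkled through these tests so the compiler cannot constant-fold the size and either elide the intentionally out-of-bounds access or warn about it at build time. A standalone approximation of the same empty-asm idiom (HIDE_VAR here is a stand-in, not the kernel macro itself)::

    #include <stdio.h>

    /* an empty asm that claims to read and write the variable,
     * hiding its value from the optimizer */
    #define HIDE_VAR(var) __asm__ volatile("" : "+r"(var))

    int main(void)
    {
        int size = 16;

        HIDE_VAR(size);           /* compiler no longer "knows" size == 16 */
        printf("%d\n", size * 2); /* must be computed at run time */
        return 0;
    }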
@@ -509,6 +514,22 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(invalid_size); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + +static void kmalloc_memmove_invalid_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + volatile size_t invalid_size = size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + memset((char *)ptr, 0, 64); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); @@ -1129,6 +1150,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_memset_4), KUNIT_CASE(kmalloc_oob_memset_8), KUNIT_CASE(kmalloc_oob_memset_16), + KUNIT_CASE(kmalloc_memmove_negative_size), KUNIT_CASE(kmalloc_memmove_invalid_size), KUNIT_CASE(kmalloc_uaf), KUNIT_CASE(kmalloc_uaf_memset), diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index 7ebf433edef3..b112cbc835e9 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -35,6 +35,8 @@ static noinline void __init copy_user_test(void) return; } + OPTIMIZER_HIDE_VAR(size); + pr_info("out-of-bounds in copy_from_user()\n"); unused = copy_from_user(kmem, usermem, size + 1); diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index e14993bc84d2..cf41fd6df42a 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -393,7 +393,7 @@ static struct test_driver { static void shuffle_array(int *arr, int n) { unsigned int rnd; - int i, j, x; + int i, j; for (i = n - 1; i > 0; i--) { get_random_bytes(&rnd, sizeof(rnd)); @@ -402,9 +402,7 @@ static void shuffle_array(int *arr, int n) j = rnd % i; /* Swap indexes. 
*/ - x = arr[i]; - arr[i] = arr[j]; - arr[j] = x; + swap(arr[i], arr[j]); } } diff --git a/mm/Kconfig b/mm/Kconfig index d16ba9249bc5..ae1f151c2924 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -123,15 +123,11 @@ config ARCH_ENABLE_MEMORY_HOTPLUG config MEMORY_HOTPLUG bool "Allow for memory hot-add" select MEMORY_ISOLATION - depends on SPARSEMEM || X86_64_ACPI_NUMA + depends on SPARSEMEM depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on 64BIT || BROKEN + depends on 64BIT select NUMA_KEEP_MEMINFO if NUMA -config MEMORY_HOTPLUG_SPARSE - def_bool y - depends on SPARSEMEM && MEMORY_HOTPLUG - config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" depends on MEMORY_HOTPLUG @@ -371,7 +367,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION select XARRAY_MULTI help diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c878d995af06..1eead4761011 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -292,8 +292,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); - if (wb != &bdi->wb) - bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); @@ -317,7 +315,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - goto out_put_bdi; + return err; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); @@ -331,9 +329,6 @@ out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); -out_put_bdi: - if (wb != &bdi->wb) - bdi_put(bdi); return err; } @@ -374,8 +369,6 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); - if (wb != &wb->bdi->wb) - bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -398,6 +391,7 @@ static void cgwb_release_workfn(struct work_struct *work) struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); + struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); @@ -417,6 +411,7 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); + bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); } @@ -498,6 +493,7 @@ static int cgwb_create(struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); + bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and @@ -529,6 +525,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; err_fprop_exit: + bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); @@ -959,14 +956,14 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi->owner = NULL; } } +EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); - if (test_bit(WB_registered, &bdi->wb.state)) - bdi_unregister(bdi); + WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); @@ -1058,51 +1055,3 @@ long congestion_wait(int 
sync, long timeout) return ret; } EXPORT_SYMBOL(congestion_wait); - -/** - * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes - * @sync: SYNC or ASYNC IO - * @timeout: timeout in jiffies - * - * In the event of a congested backing_dev (any backing_dev) this waits - * for up to @timeout jiffies for either a BDI to exit congestion of the - * given @sync queue or a write to complete. - * - * The return value is 0 if the sleep is for the full timeout. Otherwise, - * it is the number of jiffies that were still remaining when the function - * returned. return_value == timeout implies the function did not sleep. - */ -long wait_iff_congested(int sync, long timeout) -{ - long ret; - unsigned long start = jiffies; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[sync]; - - /* - * If there is no congestion, yield if necessary instead - * of sleeping on the congestion queue - */ - if (atomic_read(&nr_wb_congested[sync]) == 0) { - cond_resched(); - - /* In case we scheduled, work out time remaining */ - ret = timeout - (jiffies - start); - if (ret < 0) - ret = 0; - - goto out; - } - - /* Sleep until uncongested or a write happens */ - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - -out: - trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), - jiffies_to_usecs(jiffies - start)); - - return ret; -} -EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/cma.c b/mm/cma.c index 588f7e7885cf..9fa9a485eb3a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -384,7 +384,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, return 0; free_mem: - memblock_free(base, size); + memblock_phys_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; @@ -534,6 +534,25 @@ out: return page; } +bool cma_pages_valid(struct cma *cma, const struct page *pages, + unsigned long count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) { + pr_debug("%s(page %p, count %lu)\n", __func__, + (void *)pages, count); + return false; + } + + return true; +} + /** * cma_release() - release allocated pages * @cma: Contiguous memory region for which the allocation is performed. 
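The new cma_pages_valid() factors the "does (pages, count) belong to this CMA area" check out of cma_release(), so callers can test a range without freeing it. A hypothetical caller sketch (the fallback branch is illustrative and not part of this patch):

	/* Release via CMA only when the range really came from the CMA area. */
	if (cma_pages_valid(cma, page, nr_pages))
		cma_release(cma, page, nr_pages);
	else
		free_contig_range(page_to_pfn(page), nr_pages);
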
@@ -549,16 +568,13 @@ bool cma_release(struct cma *cma, const struct page *pages,
 {
 	unsigned long pfn;
 
-	if (!cma || !pages)
+	if (!cma_pages_valid(cma, pages, count))
 		return false;
 
 	pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
 
 	pfn = page_to_pfn(pages);
 
-	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
-		return false;
-
 	VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
 
 	free_contig_range(pfn, count);
diff --git a/mm/compaction.c b/mm/compaction.c
index fbc60f964c38..6e446094ce90 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -761,6 +761,8 @@ isolate_freepages_range(struct compact_control *cc,
 /* Similar to reclaim, but different enough that they don't share logic */
 static bool too_many_isolated(pg_data_t *pgdat)
 {
+	bool too_many;
+
 	unsigned long active, inactive, isolated;
 
 	inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@@ -770,7 +772,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
 	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
 			node_page_state(pgdat, NR_ISOLATED_ANON);
 
-	return isolated > (inactive + active) / 2;
+	too_many = isolated > (inactive + active) / 2;
+	if (!too_many)
+		wake_throttle_isolated(pgdat);
+
+	return too_many;
 }
 
 /**
@@ -822,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (cc->mode == MIGRATE_ASYNC)
 			return -EAGAIN;
 
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
 
 		if (fatal_signal_pending(current))
 			return -EINTR;
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 37024798a97c..5bcf05851ad0 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -30,7 +30,15 @@ config DAMON_VADDR
 	select PAGE_IDLE_FLAG
 	help
 	  This builds the default data access monitoring primitives for DAMON
-	  that works for virtual address spaces.
+	  that work for virtual address spaces.
+
+config DAMON_PADDR
+	bool "Data access monitoring primitives for the physical address space"
+	depends on DAMON && MMU
+	select PAGE_IDLE_FLAG
+	help
+	  This builds the default data access monitoring primitives for DAMON
+	  that work for the physical address space.
 
 config DAMON_VADDR_KUNIT_TEST
 	bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
@@ -46,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST
 
 config DAMON_DBGFS
 	bool "DAMON debugfs interface"
-	depends on DAMON_VADDR && DEBUG_FS
+	depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
 	help
 	  This builds the debugfs interface for DAMON.  The user space admins
 	  can use the interface for arbitrary data access monitoring.
@@ -65,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST
 
 	  If unsure, say N.
 
+config DAMON_RECLAIM
+	bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
+	depends on DAMON_PADDR
+	help
+	  This builds the DAMON-based reclamation subsystem.  It finds pages
+	  that have not been accessed for a long time (cold) using DAMON and
+	  reclaims them.
+
+	  This is suggested to be used as a proactive and lightweight
+	  reclamation mechanism under light memory pressure, while the
+	  traditional page scanning-based reclamation is used for heavy
+	  pressure.
+ endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index fed4be3bace3..f7d5ac377a2b 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DAMON) := core.o -obj-$(CONFIG_DAMON_VADDR) += vaddr.o +obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o +obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 30e9211f494a..c381b3c525d0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -45,6 +47,9 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->nr_accesses = 0; INIT_LIST_HEAD(®ion->list); + region->age = 0; + region->last_nr_accesses = 0; + return region; } @@ -82,6 +87,74 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) +{ + struct damos *scheme; + + scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); + if (!scheme) + return NULL; + scheme->min_sz_region = min_sz_region; + scheme->max_sz_region = max_sz_region; + scheme->min_nr_accesses = min_nr_accesses; + scheme->max_nr_accesses = max_nr_accesses; + scheme->min_age_region = min_age_region; + scheme->max_age_region = max_age_region; + scheme->action = action; + scheme->stat_count = 0; + scheme->stat_sz = 0; + INIT_LIST_HEAD(&scheme->list); + + scheme->quota.ms = quota->ms; + scheme->quota.sz = quota->sz; + scheme->quota.reset_interval = quota->reset_interval; + scheme->quota.weight_sz = quota->weight_sz; + scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; + scheme->quota.weight_age = quota->weight_age; + scheme->quota.total_charged_sz = 0; + scheme->quota.total_charged_ns = 0; + scheme->quota.esz = 0; + scheme->quota.charged_sz = 0; + scheme->quota.charged_from = 0; + scheme->quota.charge_target_from = NULL; + scheme->quota.charge_addr_from = 0; + + scheme->wmarks.metric = wmarks->metric; + scheme->wmarks.interval = wmarks->interval; + scheme->wmarks.high = wmarks->high; + scheme->wmarks.mid = wmarks->mid; + scheme->wmarks.low = wmarks->low; + scheme->wmarks.activated = true; + + return scheme; +} + +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) +{ + list_add_tail(&s->list, &ctx->schemes); +} + +static void damon_del_scheme(struct damos *s) +{ + list_del(&s->list); +} + +static void damon_free_scheme(struct damos *s) +{ + kfree(s); +} + +void damon_destroy_scheme(struct damos *s) +{ + damon_del_scheme(s); + damon_free_scheme(s); +} + /* * Construct a damon_target struct * @@ -107,6 +180,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) list_add_tail(&t->list, &ctx->adaptive_targets); } +bool damon_targets_empty(struct damon_ctx *ctx) +{ + return list_empty(&ctx->adaptive_targets); +} + static void damon_del_target(struct damon_target *t) { list_del(&t->list); @@ -153,6 +231,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); + INIT_LIST_HEAD(&ctx->schemes); return ctx; } @@ -172,7 +251,13 @@ static void 
damon_destroy_targets(struct damon_ctx *ctx) void damon_destroy_ctx(struct damon_ctx *ctx) { + struct damos *s, *next_s; + damon_destroy_targets(ctx); + + damon_for_each_scheme_safe(s, next_s, ctx) + damon_destroy_scheme(s); + kfree(ctx); } @@ -247,6 +332,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, return 0; } +/** + * damon_set_schemes() - Set data access monitoring based operation schemes. + * @ctx: monitoring context + * @schemes: array of the schemes + * @nr_schemes: number of entries in @schemes + * + * This function should not be called while the kdamond of the context is + * running. + * + * Return: 0 if success, or negative error code otherwise. + */ +int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, + ssize_t nr_schemes) +{ + struct damos *s, *next; + ssize_t i; + + damon_for_each_scheme_safe(s, next, ctx) + damon_destroy_scheme(s); + for (i = 0; i < nr_schemes; i++) + damon_add_scheme(ctx, schemes[i]); + return 0; +} + /** * damon_nr_running_ctxs() - Return number of currently running contexts. */ @@ -281,17 +390,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) return sz; } -static bool damon_kdamond_running(struct damon_ctx *ctx) -{ - bool running; - - mutex_lock(&ctx->kdamond_lock); - running = ctx->kdamond != NULL; - mutex_unlock(&ctx->kdamond_lock); - - return running; -} - static int kdamond_fn(void *data); /* @@ -309,12 +407,11 @@ static int __damon_start(struct damon_ctx *ctx) mutex_lock(&ctx->kdamond_lock); if (!ctx->kdamond) { err = 0; - ctx->kdamond_stop = false; ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { err = PTR_ERR(ctx->kdamond); - ctx->kdamond = 0; + ctx->kdamond = NULL; } } mutex_unlock(&ctx->kdamond_lock); @@ -365,13 +462,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs) */ static int __damon_stop(struct damon_ctx *ctx) { + struct task_struct *tsk; + mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ctx->kdamond_stop = true; + tsk = ctx->kdamond; + if (tsk) { + get_task_struct(tsk); mutex_unlock(&ctx->kdamond_lock); - while (damon_kdamond_running(ctx)) - usleep_range(ctx->sample_interval, - ctx->sample_interval * 2); + kthread_stop(tsk); + put_task_struct(tsk); return 0; } mutex_unlock(&ctx->kdamond_lock); @@ -444,11 +543,203 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(t, r, damon_nr_regions(t)); + r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } } } +static void damon_split_region_at(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + unsigned long sz_r); + +static bool __damos_valid_target(struct damon_region *r, struct damos *s) +{ + unsigned long sz; + + sz = r->ar.end - r->ar.start; + return s->min_sz_region <= sz && sz <= s->max_sz_region && + s->min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->max_nr_accesses && + s->min_age_region <= r->age && r->age <= s->max_age_region; +} + +static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + bool ret = __damos_valid_target(r, s); + + if (!ret || !s->quota.esz || !c->primitive.get_scheme_score) + return ret; + + return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score; +} + +static void damon_do_apply_schemes(struct damon_ctx *c, + struct damon_target *t, + struct damon_region *r) +{ + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; 
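+		/*
+		 * What follows for each region: bail out if this scheme's
+		 * quota is already exhausted, skip regions charged in the
+		 * current window, then apply the action and charge the
+		 * region's size against the quota.
+		 */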
+ unsigned long sz = r->ar.end - r->ar.start; + struct timespec64 begin, end; + + if (!s->wmarks.activated) + continue; + + /* Check the quota */ + if (quota->esz && quota->charged_sz >= quota->esz) + continue; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + continue; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + continue; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + continue; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz) { + if (r->ar.end - r->ar.start <= + DAMON_MIN_REGION) + continue; + sz = DAMON_MIN_REGION; + } + damon_split_region_at(c, t, r, sz); + r = damon_next_region(r); + sz = r->ar.end - r->ar.start; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + + if (!damos_valid_target(c, t, r, s)) + continue; + + /* Apply the scheme */ + if (c->primitive.apply_scheme) { + if (quota->esz && + quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(c, t, r, sz); + } + ktime_get_coarse_ts64(&begin); + c->primitive.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat_count++; + s->stat_sz += sz; + } +} + +/* Shouldn't be called if quota->ms and quota->sz are zero */ +static void damos_set_effective_quota(struct damos_quota *quota) +{ + unsigned long throughput; + unsigned long esz; + + if (!quota->ms) { + quota->esz = quota->sz; + return; + } + + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + esz = throughput * quota->ms; + + if (quota->sz && quota->sz < esz) + esz = quota->sz; + quota->esz = esz; +} + +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; + + if (!s->wmarks.activated) + continue; + + if (!quota->ms && !quota->sz) + continue; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies( + quota->reset_interval))) { + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->primitive.get_scheme_score) + continue; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->primitive.get_scheme_score( + c, t, r, s); + quota->histogram[score] += + r->ar.end - r->ar.start; + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = 
score; + } + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next_r, t) + damon_do_apply_schemes(c, t, r); + } +} + #define sz_damon_region(r) (r->ar.end - r->ar.start) /* @@ -461,6 +752,7 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); + l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; damon_destroy_region(r, t); } @@ -480,6 +772,11 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { + if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + r->age = 0; + else + r->age++; + if (prev && prev->ar.end == r->ar.start && diff_of(prev->nr_accesses, r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) @@ -527,6 +824,9 @@ static void damon_split_region_at(struct damon_ctx *ctx, r->ar.end = new->ar.start; + new->age = r->age; + new->last_nr_accesses = r->last_nr_accesses; + damon_insert_region(new, r, damon_next_region(r), t); } @@ -615,12 +915,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx) static bool kdamond_need_stop(struct damon_ctx *ctx) { struct damon_target *t; - bool stop; - mutex_lock(&ctx->kdamond_lock); - stop = ctx->kdamond_stop; - mutex_unlock(&ctx->kdamond_lock); - if (stop) + if (kthread_should_stop()) return true; if (!ctx->primitive.target_valid) @@ -634,11 +930,81 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } -static void set_kdamond_stop(struct damon_ctx *ctx) +static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) { - mutex_lock(&ctx->kdamond_lock); - ctx->kdamond_stop = true; - mutex_unlock(&ctx->kdamond_lock); + struct sysinfo i; + + switch (metric) { + case DAMOS_WMARK_FREE_MEM_RATE: + si_meminfo(&i); + return i.freeram * 1000 / i.totalram; + default: + break; + } + return -EINVAL; +} + +/* + * Returns zero if the scheme is active. Else, returns time to wait for next + * watermark check in micro-seconds. + */ +static unsigned long damos_wmark_wait_us(struct damos *scheme) +{ + unsigned long metric; + + if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + return 0; + + metric = damos_wmark_metric_value(scheme->wmarks.metric); + /* higher than high watermark or lower than low watermark */ + if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { + if (scheme->wmarks.activated) + pr_debug("deactivate a scheme (%d) for %s wmark\n", + scheme->action, + metric > scheme->wmarks.high ? 
+ "high" : "low"); + scheme->wmarks.activated = false; + return scheme->wmarks.interval; + } + + /* inactive and higher than middle watermark */ + if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) && + !scheme->wmarks.activated) + return scheme->wmarks.interval; + + if (!scheme->wmarks.activated) + pr_debug("activate a scheme (%d)\n", scheme->action); + scheme->wmarks.activated = true; + return 0; +} + +static void kdamond_usleep(unsigned long usecs) +{ + if (usecs > 100 * 1000) + schedule_timeout_interruptible(usecs_to_jiffies(usecs)); + else + usleep_range(usecs, usecs + 1); +} + +/* Returns negative error code if it's not activated but should return */ +static int kdamond_wait_activation(struct damon_ctx *ctx) +{ + struct damos *s; + unsigned long wait_time; + unsigned long min_wait_time = 0; + + while (!kdamond_need_stop(ctx)) { + damon_for_each_scheme(s, ctx) { + wait_time = damos_wmark_wait_us(s); + if (!min_wait_time || wait_time < min_wait_time) + min_wait_time = wait_time; + } + if (!min_wait_time) + return 0; + + kdamond_usleep(min_wait_time); + } + return -EBUSY; } /* @@ -651,24 +1017,26 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; + bool done = false; - mutex_lock(&ctx->kdamond_lock); - pr_info("kdamond (%d) starts\n", ctx->kdamond->pid); - mutex_unlock(&ctx->kdamond_lock); + pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->primitive.init) ctx->primitive.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - set_kdamond_stop(ctx); + done = true; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx)) { + while (!kdamond_need_stop(ctx) && !done) { + if (kdamond_wait_activation(ctx)) + continue; + if (ctx->primitive.prepare_access_checks) ctx->primitive.prepare_access_checks(ctx); if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) - set_kdamond_stop(ctx); + done = true; usleep_range(ctx->sample_interval, ctx->sample_interval + 1); @@ -681,7 +1049,8 @@ static int kdamond_fn(void *data) sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) - set_kdamond_stop(ctx); + done = true; + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->primitive.reset_aggregated) @@ -699,13 +1068,12 @@ static int kdamond_fn(void *data) damon_destroy_region(r, t); } - if (ctx->callback.before_terminate && - ctx->callback.before_terminate(ctx)) - set_kdamond_stop(ctx); + if (ctx->callback.before_terminate) + ctx->callback.before_terminate(ctx); if (ctx->primitive.cleanup) ctx->primitive.cleanup(ctx); - pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid); + pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); @@ -714,7 +1082,7 @@ static int kdamond_fn(void *data) nr_running_ctxs--; mutex_unlock(&damon_lock); - do_exit(0); + return 0; } #include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 4eddcfa73996..86b9f9528231 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) dbgfs_destroy_ctx(ctx); } +static void damon_dbgfs_test_set_init_regions(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long ids[] = {1, 2, 3}; + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", 
+ "2 10 20\n", + "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + ""}; + /* Reading the file again will show sorted, clean output */ + char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", + "2 10 20\n", + "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + ""}; + char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ + "2 10 20\n 2 14 26\n", /* regions overlap */ + "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char *input, *expect; + int i, rc; + char buf[256]; + + damon_set_targets(ctx, ids, 3); + + /* Put valid inputs and check the results */ + for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { + input = valid_inputs[i]; + expect = valid_expects[i]; + + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, 0); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, expect); + } + /* Put invalid inputs and check the return error code */ + for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { + input = invalid_inputs[i]; + pr_info("input: %s\n", input); + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, -EINVAL); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, ""); + } + + damon_set_targets(ctx, NULL, 0); + damon_destroy_ctx(ctx); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), KUNIT_CASE(damon_dbgfs_test_set_targets), + KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, }; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index faee070977d8..eccc14b34901 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, struct damon_ctx *ctx = file->private_data; unsigned long s, a, r, minr, maxr; char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -88,9 +87,9 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - err = damon_set_attrs(ctx, s, a, r, minr, maxr); - if (err) - ret = err; + ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + if (!ret) + ret = count; unlock_out: mutex_unlock(&ctx->kdamond_lock); out: @@ -98,6 +97,177 @@ out: return ret; } +static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damos *s; + int written = 0; + int rc; + + damon_for_each_scheme(s, c) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + s->min_sz_region, s->max_sz_region, + s->min_nr_accesses, s->max_nr_accesses, + s->min_age_region, s->max_age_region, + s->action, + s->quota.ms, s->quota.sz, + s->quota.reset_interval, + s->quota.weight_sz, + s->quota.weight_nr_accesses, + s->quota.weight_age, + s->wmarks.metric, s->wmarks.interval, + s->wmarks.high, s->wmarks.mid, s->wmarks.low, + s->stat_count, s->stat_sz); + if (!rc) + return -ENOMEM; + + written += rc; + } + return written; +} + +static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_schemes(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static void free_schemes_arr(struct damos 
**schemes, ssize_t nr_schemes)
+{
+	ssize_t i;
+
+	for (i = 0; i < nr_schemes; i++)
+		kfree(schemes[i]);
+	kfree(schemes);
+}
+
+static bool damos_action_valid(int action)
+{
+	switch (action) {
+	case DAMOS_WILLNEED:
+	case DAMOS_COLD:
+	case DAMOS_PAGEOUT:
+	case DAMOS_HUGEPAGE:
+	case DAMOS_NOHUGEPAGE:
+	case DAMOS_STAT:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Converts a string into an array of struct damos pointers
+ *
+ * Returns an array of struct damos pointers if the conversion succeeds, or
+ * NULL otherwise.
+ */
+static struct damos **str_to_schemes(const char *str, ssize_t len,
+				ssize_t *nr_schemes)
+{
+	struct damos *scheme, **schemes;
+	const int max_nr_schemes = 256;
+	int pos = 0, parsed, ret;
+	unsigned long min_sz, max_sz;
+	unsigned int min_nr_a, max_nr_a, min_age, max_age;
+	unsigned int action;
+
+	schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
+			GFP_KERNEL);
+	if (!schemes)
+		return NULL;
+
+	*nr_schemes = 0;
+	while (pos < len && *nr_schemes < max_nr_schemes) {
+		struct damos_quota quota = {};
+		struct damos_watermarks wmarks;
+
+		ret = sscanf(&str[pos],
+				"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
+				&min_sz, &max_sz, &min_nr_a, &max_nr_a,
+				&min_age, &max_age, &action, &quota.ms,
+				&quota.sz, &quota.reset_interval,
+				&quota.weight_sz, &quota.weight_nr_accesses,
+				&quota.weight_age, &wmarks.metric,
+				&wmarks.interval, &wmarks.high, &wmarks.mid,
+				&wmarks.low, &parsed);
+		if (ret != 18)
+			break;
+		if (!damos_action_valid(action)) {
+			pr_err("wrong action %d\n", action);
+			goto fail;
+		}
+
+		pos += parsed;
+		scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
+				min_age, max_age, action, &quota, &wmarks);
+		if (!scheme)
+			goto fail;
+
+		schemes[*nr_schemes] = scheme;
+		*nr_schemes += 1;
+	}
+	return schemes;
+fail:
+	free_schemes_arr(schemes, *nr_schemes);
+	return NULL;
+}
+
+static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct damon_ctx *ctx = file->private_data;
+	char *kbuf;
+	struct damos **schemes;
+	ssize_t nr_schemes = 0, ret;
+
+	kbuf = user_input_str(buf, count, ppos);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	schemes = str_to_schemes(kbuf, count, &nr_schemes);
+	if (!schemes) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&ctx->kdamond_lock);
+	if (ctx->kdamond) {
+		ret = -EBUSY;
+		goto unlock_out;
+	}
+
+	ret = damon_set_schemes(ctx, schemes, nr_schemes);
+	if (!ret) {
+		ret = count;
+		nr_schemes = 0;
+	}
+
+unlock_out:
+	mutex_unlock(&ctx->kdamond_lock);
+	free_schemes_arr(schemes, nr_schemes);
+out:
+	kfree(kbuf);
+	return ret;
+}
+
 static inline bool targetid_is_pid(const struct damon_ctx *ctx)
 {
 	return ctx->primitive.target_valid == damon_va_target_valid;
 }
@@ -185,26 +355,31 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
 		const char __user *buf, size_t count, loff_t *ppos)
 {
 	struct damon_ctx *ctx = file->private_data;
+	bool id_is_pid = true;
 	char *kbuf, *nrs;
 	unsigned long *targets;
 	ssize_t nr_targets;
-	ssize_t ret = count;
+	ssize_t ret;
 	int i;
-	int err;
 
 	kbuf = user_input_str(buf, count, ppos);
 	if (IS_ERR(kbuf))
 		return PTR_ERR(kbuf);
 
 	nrs = kbuf;
+	if (!strncmp(kbuf, "paddr\n", count)) {
+		id_is_pid = false;
+		/* target id is meaningless here, but we set it just for fun */
+		scnprintf(kbuf, count, "42 ");
+	}
 
-	targets = str_to_target_ids(nrs, ret, &nr_targets);
+	targets = str_to_target_ids(nrs, count, &nr_targets);
 	if (!targets) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	if (targetid_is_pid(ctx)) {
+	if (id_is_pid) {
 		for (i = 
0; i < nr_targets; i++) { targets[i] = (unsigned long)find_get_pid( (int)targets[i]); @@ -218,17 +393,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file, mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { - if (targetid_is_pid(ctx)) + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); ret = -EBUSY; goto unlock_out; } - err = damon_set_targets(ctx, targets, nr_targets); - if (err) { - if (targetid_is_pid(ctx)) + /* remove targets with previously-set primitive */ + damon_set_targets(ctx, NULL, 0); + + /* Configure the context for the address space type */ + if (id_is_pid) + damon_va_set_primitives(ctx); + else + damon_pa_set_primitives(ctx); + + ret = damon_set_targets(ctx, targets, nr_targets); + if (ret) { + if (id_is_pid) dbgfs_put_pids(targets, nr_targets); - ret = err; + } else { + ret = count; } unlock_out: @@ -240,6 +425,152 @@ out: return ret; } +static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r; + int written = 0; + int rc; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %lu\n", + t->id, r->ar.start, r->ar.end); + if (!rc) + return -ENOMEM; + written += rc; + } + } + return written; +} + +static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + mutex_unlock(&ctx->kdamond_lock); + len = -EBUSY; + goto out; + } + + len = sprint_init_regions(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int add_init_region(struct damon_ctx *c, + unsigned long target_id, struct damon_addr_range *ar) +{ + struct damon_target *t; + struct damon_region *r, *prev; + unsigned long id; + int rc = -EINVAL; + + if (ar->start >= ar->end) + return -EINVAL; + + damon_for_each_target(t, c) { + id = t->id; + if (targetid_is_pid(c)) + id = (unsigned long)pid_vnr((struct pid *)id); + if (id == target_id) { + r = damon_new_region(ar->start, ar->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + rc = 0; + } + } + return rc; +} + +static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r, *next; + int pos = 0, parsed, ret; + unsigned long target_id; + struct damon_addr_range ar; + int err; + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + while (pos < len) { + ret = sscanf(&str[pos], "%lu %lu %lu%n", + &target_id, &ar.start, &ar.end, &parsed); + if (ret != 3) + break; + err = add_init_region(c, target_id, &ar); + if (err) + goto fail; + pos += parsed; + } + + return 0; + +fail: + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + return err; +} + +static ssize_t dbgfs_init_regions_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); 
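+	/*
+	 * user_input_str() returns a kmalloc()'d, NUL-terminated copy of the
+	 * user buffer, or an ERR_PTR().  'ret' still equals 'count' here, so
+	 * the full input length is passed to set_init_regions() below.
+	 */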
+ if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = set_init_regions(ctx, kbuf, ret); + if (err) + ret = err; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(kbuf); + return ret; +} + static ssize_t dbgfs_kdamond_pid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -279,12 +610,24 @@ static const struct file_operations attrs_fops = { .write = dbgfs_attrs_write, }; +static const struct file_operations schemes_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_schemes_read, + .write = dbgfs_schemes_write, +}; + static const struct file_operations target_ids_fops = { .open = damon_dbgfs_open, .read = dbgfs_target_ids_read, .write = dbgfs_target_ids_write, }; +static const struct file_operations init_regions_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_init_regions_read, + .write = dbgfs_init_regions_write, +}; + static const struct file_operations kdamond_pid_fops = { .open = damon_dbgfs_open, .read = dbgfs_kdamond_pid_read, @@ -292,28 +635,27 @@ static const struct file_operations kdamond_pid_fops = { static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) { - const char * const file_names[] = {"attrs", "target_ids", - "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops, - &kdamond_pid_fops}; + const char * const file_names[] = {"attrs", "schemes", "target_ids", + "init_regions", "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, + &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; int i; for (i = 0; i < ARRAY_SIZE(file_names); i++) debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); } -static int dbgfs_before_terminate(struct damon_ctx *ctx) +static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; if (!targetid_is_pid(ctx)) - return 0; + return; damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } - return 0; } static struct damon_ctx *dbgfs_new_ctx(void) @@ -388,8 +730,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file, { char *kbuf; char *ctx_name; - ssize_t ret = count; - int err; + ssize_t ret; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -407,9 +748,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_mk_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_mk_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -478,8 +819,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *kbuf; - ssize_t ret = count; - int err; + ssize_t ret; char *ctx_name; kbuf = user_input_str(buf, count, ppos); @@ -498,9 +838,9 @@ static ssize_t dbgfs_rm_context_write(struct file *file, } mutex_lock(&damon_dbgfs_lock); - err = dbgfs_rm_context(ctx_name); - if (err) - ret = err; + ret = dbgfs_rm_context(ctx_name); + if (!ret) + ret = count; mutex_unlock(&damon_dbgfs_lock); out: @@ -524,9 +864,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file, static ssize_t dbgfs_monitor_on_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - ssize_t ret = count; + ssize_t ret; char *kbuf; - int err; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -538,15 +877,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } - if (!strncmp(kbuf, "on", 
count)) - err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); - else if (!strncmp(kbuf, "off", count)) - err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - else - err = -EINVAL; + if (!strncmp(kbuf, "on", count)) { + int i; - if (err) - ret = err; + for (i = 0; i < dbgfs_nr_ctxs; i++) { + if (damon_targets_empty(dbgfs_ctxs[i])) { + kfree(kbuf); + return -EINVAL; + } + } + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs); + } else if (!strncmp(kbuf, "off", count)) { + ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + } else { + ret = -EINVAL; + } + + if (!ret) + ret = count; kfree(kbuf); return ret; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c new file mode 100644 index 000000000000..a496d6f203d6 --- /dev/null +++ b/mm/damon/paddr.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for The Physical Address Space + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-pa: " fmt + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "prmtv-common.h" + +static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr); + } + return true; +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct rmap_walk_control rwc = { + .rmap_one = __damon_pa_mkold, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return; + + if (!page_mapped(page) || !page_rmapping(page)) { + set_page_idle(page); + goto out; + } + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + goto out; + + rmap_walk(page, &rwc); + + if (need_lock) + unlock_page(page); + +out: + put_page(page); +} + +static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, + struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_pa_mkold(r->sampling_addr); +} + +void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + __damon_pa_prepare_access_check(ctx, r); + } +} + +struct damon_pa_access_chk_result { + unsigned long page_sz; + bool accessed; +}; + +static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct damon_pa_access_chk_result *result = arg; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; + + result->accessed = false; + result->page_sz = PAGE_SIZE; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + result->accessed = pte_young(*pvmw.pte) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + result->accessed = pmd_young(*pvmw.pmd) || + !page_is_idle(page) || + mmu_notifier_test_young(vma->vm_mm, addr); + result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (result->accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return !result->accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +{ + struct page *page = 
damon_get_page(PHYS_PFN(paddr));
+	struct damon_pa_access_chk_result result = {
+		.page_sz = PAGE_SIZE,
+		.accessed = false,
+	};
+	struct rmap_walk_control rwc = {
+		.arg = &result,
+		.rmap_one = __damon_pa_young,
+		.anon_lock = page_lock_anon_vma_read,
+	};
+	bool need_lock;
+
+	if (!page)
+		return false;
+
+	if (!page_mapped(page) || !page_rmapping(page)) {
+		if (page_is_idle(page))
+			result.accessed = false;
+		else
+			result.accessed = true;
+		put_page(page);
+		goto out;
+	}
+
+	need_lock = !PageAnon(page) || PageKsm(page);
+	if (need_lock && !trylock_page(page)) {
+		put_page(page);
+		return false;
+	}
+
+	rmap_walk(page, &rwc);
+
+	if (need_lock)
+		unlock_page(page);
+	put_page(page);
+
+out:
+	*page_sz = result.page_sz;
+	return result.accessed;
+}
+
+static void __damon_pa_check_access(struct damon_ctx *ctx,
+				    struct damon_region *r)
+{
+	static unsigned long last_addr;
+	static unsigned long last_page_sz = PAGE_SIZE;
+	static bool last_accessed;
+
+	/* If the region is in the last checked page, reuse the result */
+	if (ALIGN_DOWN(last_addr, last_page_sz) ==
+				ALIGN_DOWN(r->sampling_addr, last_page_sz)) {
+		if (last_accessed)
+			r->nr_accesses++;
+		return;
+	}
+
+	last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz);
+	if (last_accessed)
+		r->nr_accesses++;
+
+	last_addr = r->sampling_addr;
+}
+
+unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
+{
+	struct damon_target *t;
+	struct damon_region *r;
+	unsigned int max_nr_accesses = 0;
+
+	damon_for_each_target(t, ctx) {
+		damon_for_each_region(r, t) {
+			__damon_pa_check_access(ctx, r);
+			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+		}
+	}
+
+	return max_nr_accesses;
+}
+
+bool damon_pa_target_valid(void *t)
+{
+	return true;
+}
+
+int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
+		struct damon_region *r, struct damos *scheme)
+{
+	unsigned long addr;
+	LIST_HEAD(page_list);
+
+	if (scheme->action != DAMOS_PAGEOUT)
+		return -EINVAL;
+
+	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+		struct page *page = damon_get_page(PHYS_PFN(addr));
+
+		if (!page)
+			continue;
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (isolate_lru_page(page)) {
+			put_page(page);
+			continue;
+		}
+		if (PageUnevictable(page)) {
+			putback_lru_page(page);
+		} else {
+			list_add(&page->lru, &page_list);
+			put_page(page);
+		}
+	}
+	reclaim_pages(&page_list);
+	cond_resched();
+	return 0;
+}
+
+int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
+		struct damon_region *r, struct damos *scheme)
+{
+	switch (scheme->action) {
+	case DAMOS_PAGEOUT:
+		return damon_pageout_score(context, r, scheme);
+	default:
+		break;
+	}
+
+	return DAMOS_MAX_SCORE;
+}
+
+void damon_pa_set_primitives(struct damon_ctx *ctx)
+{
+	ctx->primitive.init = NULL;
+	ctx->primitive.update = NULL;
+	ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks;
+	ctx->primitive.check_accesses = damon_pa_check_accesses;
+	ctx->primitive.reset_aggregated = NULL;
+	ctx->primitive.target_valid = damon_pa_target_valid;
+	ctx->primitive.cleanup = NULL;
+	ctx->primitive.apply_scheme = damon_pa_apply_scheme;
+	ctx->primitive.get_scheme_score = damon_pa_scheme_score;
+}
diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c
new file mode 100644
index 000000000000..92a04f5831d6
--- /dev/null
+++ b/mm/damon/prmtv-common.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park
+ */
+
+#include
+#include +#include +#include + +#include "prmtv-common.h" + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. + */ +struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (pte_young(*pte)) { + referenced = true; + *pte = pte_mkold(*pte); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmd_young(*pmd)) { + referenced = true; + *pmd = pmd_mkold(*pmd); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + ((1UL) << HPAGE_PMD_SHIFT))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +#define DAMON_MAX_SUBSCORE (100) +#define DAMON_MAX_AGE_IN_LOG (32) + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. 
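+	 * For example, with DAMON_MAX_AGE_IN_LOG of 32, age_in_log of -32
+	 * scales to 0, age_in_log of 0 scales to 50, and age_in_log of 32
+	 * scales to 100.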
+	 */
+	age_in_log += DAMON_MAX_AGE_IN_LOG;
+	age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+		DAMON_MAX_AGE_IN_LOG / 2;
+
+	hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+	if (freq_weight + age_weight)
+		hotness /= freq_weight + age_weight;
+	/*
+	 * Transform it to fit in [0, DAMOS_MAX_SCORE]
+	 */
+	hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+	/* Return coldness of the region */
+	return DAMOS_MAX_SCORE - hotness;
+}
diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h
new file mode 100644
index 000000000000..61f27037603e
--- /dev/null
+++ b/mm/damon/prmtv-common.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park
+ */
+
+#include
+#include
+
+/* Get a random number in [l, r) */
+#define damon_rand(l, r) (l + prandom_u32_max(r - l))
+
+struct page *damon_get_page(unsigned long pfn);
+
+void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
+
+int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+			struct damos *s);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
new file mode 100644
index 000000000000..dc1485044eaf
--- /dev/null
+++ b/mm/damon/reclaim.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based page reclamation
+ *
+ * Author: SeongJae Park
+ */
+
+#define pr_fmt(fmt) "damon-reclaim: " fmt
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_reclaim."
+
+/*
+ * Enable or disable DAMON_RECLAIM.
+ *
+ * You can enable DAMON_RECLAIM by setting the value of this parameter as
+ * ``Y``.  Setting it as ``N`` disables DAMON_RECLAIM.  Note that
+ * DAMON_RECLAIM could do no real monitoring and reclamation due to the
+ * watermarks-based activation condition.  Refer to the descriptions of the
+ * watermark parameters below for this.
+ */
+static bool enabled __read_mostly;
+module_param(enabled, bool, 0600);
+
+/*
+ * Time threshold for cold memory regions identification in microseconds.
+ *
+ * If a memory region is not accessed for this or longer time, DAMON_RECLAIM
+ * identifies the region as cold, and reclaims it.  120 seconds by default.
+ */
+static unsigned long min_age __read_mostly = 120000000;
+module_param(min_age, ulong, 0600);
+
+/*
+ * Limit of time for trying the reclamation in milliseconds.
+ *
+ * DAMON_RECLAIM tries to use only up to this time within a time window
+ * (quota_reset_interval_ms) for trying reclamation of cold pages.  This can
+ * be used for limiting CPU consumption of DAMON_RECLAIM.  If the value is
+ * zero, the limit is disabled.
+ *
+ * 10 ms by default.
+ */
+static unsigned long quota_ms __read_mostly = 10;
+module_param(quota_ms, ulong, 0600);
+
+/*
+ * Limit of size of memory for the reclamation in bytes.
+ *
+ * DAMON_RECLAIM charges the amount of memory which it tried to reclaim
+ * within a time window (quota_reset_interval_ms) and ensures no more than
+ * this limit is tried.  This can be used for limiting consumption of CPU
+ * and IO.  If this value is zero, the limit is disabled.
+ *
+ * 128 MiB by default.
+ */
+static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024;
+module_param(quota_sz, ulong, 0600);
+
+/*
+ * The time/size quota charge reset interval in milliseconds.
+ *
+ * The charge reset interval for the quota of time (quota_ms) and size
+ * (quota_sz).
That is, DAMON_RECLAIM does not try reclamation for more than + * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms + * milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is + * enabled but inactive due to its watermarks rule. 5 seconds by default. + */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically + * checks the watermarks. 500 (50%) by default. + */ +static unsigned long wmarks_high __read_mostly = 500; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring + * and the reclaiming. 400 (40%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 400; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks + * the watermarks. In the case, the system falls back to the LRU-based page + * granularity reclamation logic. 200 (20%) by default. + */ +static unsigned long wmarks_low __read_mostly = 200; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the cold memory monitoring. Please refer + * to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. + * + * The aggregation interval of DAMON for the cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. + * + * The minimal number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set lower-bound of the monitoring quality. + * But, setting this too high could result in increased monitoring overhead. + * Please refer to the DAMON documentation for more detail. 10 by default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the cold memory + * monitoring. This can be used to set upper-bound of the monitoring overhead. + * However, setting this too low could result in bad monitoring quality. + * Please refer to the DAMON documentation for more detail. 1000 by default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600); + +/* + * Start of the target memory region in physical address. 
+ *
+ * The start physical address of memory region that DAMON_RECLAIM will do
+ * work against.  By default, the biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of memory region that DAMON_RECLAIM will do
+ * work against.  By default, the biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * PID of the DAMON thread.
+ *
+ * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+struct damon_reclaim_ram_walk_arg {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+	struct damon_reclaim_ram_walk_arg *a = arg;
+
+	if (a->end - a->start < res->end - res->start) {
+		a->start = res->start;
+		a->end = res->end;
+	}
+	return 0;
+}
+
+/*
+ * Find the biggest 'System RAM' resource and store its start and end address
+ * in @start and @end, respectively.  If no System RAM is found, returns
+ * false.
+ */
+static bool get_monitoring_region(unsigned long *start, unsigned long *end)
+{
+	struct damon_reclaim_ram_walk_arg arg = {};
+
+	walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+	if (arg.end <= arg.start)
+		return false;
+
+	*start = arg.start;
+	*end = arg.end;
+	return true;
+}
+
+static struct damos *damon_reclaim_new_scheme(void)
+{
+	struct damos_watermarks wmarks = {
+		.metric = DAMOS_WMARK_FREE_MEM_RATE,
+		.interval = wmarks_interval,
+		.high = wmarks_high,
+		.mid = wmarks_mid,
+		.low = wmarks_low,
+	};
+	struct damos_quota quota = {
+		/*
+		 * Do not try reclamation for more than quota_ms milliseconds
+		 * or quota_sz bytes within quota_reset_interval_ms.
+		 */
+		.ms = quota_ms,
+		.sz = quota_sz,
+		.reset_interval = quota_reset_interval_ms,
+		/* Within the quota, page out older regions first. */
+		.weight_sz = 0,
+		.weight_nr_accesses = 0,
+		.weight_age = 1
+	};
+	struct damos *scheme = damon_new_scheme(
+			/* Find regions having PAGE_SIZE or larger size */
+			PAGE_SIZE, ULONG_MAX,
+			/* and not accessed at all */
+			0, 0,
+			/* for min_age or more micro-seconds, and */
+			min_age / aggr_interval, UINT_MAX,
+			/* page out those, as soon as found */
+			DAMOS_PAGEOUT,
+			/* under the quota. */
+			&quota,
+			/* (De)activate this according to the watermarks.
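+			 * Hysteresis: reclaim turns on when the free memory
+			 * rate drops below wmarks_mid and keeps running until
+			 * it leaves the [wmarks_low, wmarks_high] band.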
*/ + &wmarks); + + return scheme; +} + +static int damon_reclaim_turn(bool on) +{ + struct damon_region *region; + struct damos *scheme; + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + /* DAMON will free this on its own when it finishes monitoring */ + region = damon_new_region(monitor_region_start, monitor_region_end); + if (!region) + return -ENOMEM; + damon_add_region(region, target); + + /* Will be freed by 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) { + err = -ENOMEM; + goto free_region_out; + } + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + goto free_scheme_out; + + err = damon_start(&ctx, 1); + if (!err) { + kdamond_pid = ctx->kdamond->pid; + return 0; + } + +free_scheme_out: + damon_destroy_scheme(scheme); +free_region_out: + damon_destroy_region(region, target); + return err; +} + +#define ENABLE_CHECK_INTERVAL_MS 1000 +static struct delayed_work damon_reclaim_timer; +static void damon_reclaim_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_reclaim_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } + + schedule_delayed_work(&damon_reclaim_timer, + msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); +} +static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); + +static int __init damon_reclaim_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + damon_pa_set_primitives(ctx); + + /* 4242 means nothing but fun */ + target = damon_new_target(4242); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_reclaim_timer, 0); + return 0; +} + +module_init(damon_reclaim_init); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 1f5c13257dba..ecfd0b2ed222 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -233,7 +233,7 @@ static void damon_test_apply_three_regions3(struct kunit *test) * and 70-100) has totally freed and mapped to different area (30-32 and * 65-68). The target regions which were in the old second and third big * regions should now be removed and new target regions covering the new second - * and third big regions should be crated. + * and third big regions should be created. */ static void damon_test_apply_three_regions4(struct kunit *test) { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 58c1fb2aafa9..35fe49080ee9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -7,25 +7,20 @@ #define pr_fmt(fmt) "damon-va: " fmt -#include -#include -#include -#include +#include #include +#include +#include #include #include -#include -#include -#include + +#include "prmtv-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST #undef DAMON_MIN_REGION #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - /* * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL.
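Stepping back to the DAMON_RECLAIM module above: because every knob is an ordinary module parameter, the scheme can be tuned and (de)activated at runtime without a reboot. A minimal usage sketch, assuming the usual /sys/module layout and that the module is named damon_reclaim (paths are standard module_param behavior; the values are illustrative, not taken from the patch)::

    # echo 500 > /sys/module/damon_reclaim/parameters/wmarks_high
    # echo Y > /sys/module/damon_reclaim/parameters/enabled
    # cat /sys/module/damon_reclaim/parameters/kdamond_pid
    6883

Note that damon_reclaim_timer_fn() only polls the enabled flag every ENABLE_CHECK_INTERVAL_MS (one second), so a write to enabled can take up to a second to take effect; kdamond_pid then reports the worker thread's PID, or -1 while it is off.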
@@ -311,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, struct damon_addr_range bregions[3]) { struct damon_region *r, *next; - unsigned int i = 0; + unsigned int i; /* Remove regions which are not in the three big regions now */ damon_for_each_region_safe(r, next, t) { @@ -372,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx) } } -/* - * Get an online page for a pfn if it's in the LRU list. Otherwise, returns - * NULL. - * - * The body of this function is stolen from the 'page_idle_get_page()'. We - * steal rather than reuse it because the code is quite simple. - */ -static struct page *damon_get_page(unsigned long pfn) -{ - struct page *page = pfn_to_online_page(pfn); - - if (!page || !PageLRU(page) || !get_page_unless_zero(page)) - return NULL; - - if (unlikely(!PageLRU(page))) { - put_page(page); - page = NULL; - } - return page; -} - -static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, - unsigned long addr) -{ - bool referenced = false; - struct page *page = damon_get_page(pte_pfn(*pte)); - - if (!page) - return; - - if (pte_young(*pte)) { - referenced = true; - *pte = pte_mkold(*pte); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -} - -static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, - unsigned long addr) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; - struct page *page = damon_get_page(pmd_pfn(*pmd)); - - if (!page) - return; - - if (pmd_young(*pmd)) { - referenced = true; - *pmd = pmd_mkold(*pmd); - } - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(mm, addr, - addr + ((1UL) << HPAGE_PMD_SHIFT))) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) - set_page_young(page); - - set_page_idle(page); - put_page(page); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -} - static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -475,7 +394,7 @@ out: return 0; } -static struct mm_walk_ops damon_mkold_ops = { +static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, }; @@ -571,7 +490,7 @@ out: return 0; } -static struct mm_walk_ops damon_young_ops = { +static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, }; @@ -658,6 +577,76 @@ bool damon_va_target_valid(void *target) return false; } +#ifndef CONFIG_ADVISE_SYSCALLS +static int damos_madvise(struct damon_target *target, struct damon_region *r, + int behavior) +{ + return -EINVAL; +} +#else +static int damos_madvise(struct damon_target *target, struct damon_region *r, + int behavior) +{ + struct mm_struct *mm; + int ret = -ENOMEM; + + mm = damon_get_mm(target); + if (!mm) + goto out; + + ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), + PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + mmput(mm); +out: + return ret; +} +#endif /* CONFIG_ADVISE_SYSCALLS */ + +int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + int madv_action; + + switch (scheme->action) { + case DAMOS_WILLNEED: + madv_action = MADV_WILLNEED; + break; + case DAMOS_COLD: + madv_action = MADV_COLD; + break; + case DAMOS_PAGEOUT: + madv_action = MADV_PAGEOUT; + break; + case DAMOS_HUGEPAGE: + madv_action = MADV_HUGEPAGE; + break; + case DAMOS_NOHUGEPAGE: + madv_action = MADV_NOHUGEPAGE; + break; 
+ case DAMOS_STAT: + return 0; + default: + pr_warn("Wrong action %d\n", scheme->action); + return -EINVAL; + } + + return damos_madvise(t, r, madv_action); +} + +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme) +{ + + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pageout_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + void damon_va_set_primitives(struct damon_ctx *ctx) { ctx->primitive.init = damon_va_init; @@ -667,6 +656,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx) ctx->primitive.reset_aggregated = NULL; ctx->primitive.target_valid = damon_va_target_valid; ctx->primitive.cleanup = NULL; + ctx->primitive.apply_scheme = damon_va_apply_scheme; + ctx->primitive.get_scheme_score = damon_va_scheme_score; } #include "vaddr-test.h" diff --git a/mm/debug.c b/mm/debug.c index d0020fc58202..a05a39ff8fe4 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -16,17 +16,19 @@ #include #include "internal.h" +#include + +/* + * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can + * be used to populate migrate_reason_names[]. + */ +#undef EM +#undef EMe +#define EM(a, b) b, +#define EMe(a, b) b const char *migrate_reason_names[MR_TYPES] = { - "compaction", - "memory_failure", - "memory_hotplug", - "syscall_or_cpuset", - "mempolicy_mbind", - "numa_misplaced", - "contig_range", - "longterm_pin", - "demotion", + MIGRATE_REASON }; const struct trace_print_flags pageflag_names[] = { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1403639302e4..228e3954b90c 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1104,13 +1104,14 @@ static int __init init_args(struct pgtable_debug_args *args) /* * Initialize the debugging data. * - * __P000 (or even __S000) will help create page table entries with - * PROT_NONE permission as required for pxx_protnone_tests(). + * protection_map[0] (or even protection_map[8]) will help create + * page table entries with PROT_NONE permission as required for + * pxx_protnone_tests(). 
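A bit of context for the __P000 replacement: protection_map[] is indexed by the four vm_flags permission bits (VM_READ = 0x1, VM_WRITE = 0x2, VM_EXEC = 0x4, VM_SHARED = 0x8), so protection_map[0] is the private PROT_NONE entry that the old __P000 macro named, and protection_map[8] is its shared counterpart, the old __S000. The test only needs a pgprot_t with no access bits set, which either entry provides.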
*/ memset(args, 0, sizeof(*args)); args->vaddr = get_random_vaddr(); args->page_prot = vm_get_page_prot(VMFLAGS); - args->page_prot_none = __P000; + args->page_prot_none = protection_map[0]; args->is_contiguous_page = false; args->pud_pfn = ULONG_MAX; args->pmd_pfn = ULONG_MAX; diff --git a/mm/filemap.c b/mm/filemap.c index bfcef6ff7a27..615512caa0b5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -638,6 +638,30 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } +static bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; + struct page *page; + + if (end_byte < start_byte) + return false; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + break; + } + rcu_read_unlock(); + return page != NULL; + +} + /** * filemap_range_needs_writeback - check if range potentially needs writeback * @mapping: address space within which to check @@ -655,29 +679,12 @@ static bool mapping_needs_writeback(struct address_space *mapping) bool filemap_range_needs_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); - pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; - if (!mapping_needs_writeback(mapping)) return false; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) return false; - if (end_byte < start_byte) - return false; - - rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) - continue; - if (xa_is_value(page)) - continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) - break; - } - rcu_read_unlock(); - return page != NULL; + return filemap_range_has_writeback(mapping, start_byte, end_byte); } EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); @@ -1592,6 +1599,7 @@ void folio_end_writeback(struct folio *folio) smp_mb__after_atomic(); folio_wake(folio, PG_writeback); + acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -2088,7 +2096,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, if (!xa_is_value(page)) { if (page->index < start) goto put; - VM_BUG_ON_PAGE(page->index != xas.xa_index, page); if (page->index + thp_nr_pages(page) - 1 > end) goto put; if (!trylock_page(page)) @@ -2621,6 +2628,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; + if (unlikely(iocb->ki_pos >= i_size_read(inode))) + break; + error = filemap_get_pages(iocb, iter, &pvec); if (error < 0) break; @@ -2733,9 +2743,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - loff_t size; - size = i_size_read(inode); if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, iocb->ki_pos, iocb->ki_pos + count - 1)) @@ -2767,8 +2775,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. 
*/ - if (retval < 0 || !count || iocb->ki_pos >= size || - IS_DAX(inode)) + if (retval < 0 || !count || IS_DAX(inode)) + return retval; + if (iocb->ki_pos >= i_size_read(inode)) return retval; } @@ -3193,23 +3202,16 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) } if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { - vm_fault_t ret = do_set_pmd(vmf, page); - if (!ret) { - /* The page is mapped successfully, reference consumed. */ - unlock_page(page); - return true; - } + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. */ + unlock_page(page); + return true; + } } - if (pmd_none(*vmf->pmd)) { - vmf->ptl = pmd_lock(mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(mm); - pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } + if (pmd_none(*vmf->pmd)) + pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); /* See comment in handle_pte_fault() */ if (pmd_devmap_trans_unstable(vmf->pmd)) { diff --git a/mm/gup.c b/mm/gup.c index e1c7e4bde11f..2c51e9748a6a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2365,7 +2365,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; - int ret = 1; do { struct page *page = pfn_to_page(pfn); @@ -2373,14 +2372,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - ret = 0; break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - ret = 0; break; } (*nr)++; @@ -2388,7 +2385,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); - return ret; + return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, diff --git a/mm/highmem.c b/mm/highmem.c index 471d9779a7f4..88f65f155845 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -382,7 +382,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); if (end1 > start1) { - kaddr = kmap_atomic(page + i); + kaddr = kmap_local_page(page + i); memset(kaddr + start1, 0, this_end - start1); } end1 -= this_end; @@ -397,7 +397,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, if (end2 > start2) { if (!kaddr) - kaddr = kmap_atomic(page + i); + kaddr = kmap_local_page(page + i); memset(kaddr + start2, 0, this_end - start2); } end2 -= this_end; @@ -405,7 +405,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, } if (kaddr) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); flush_dcache_page(page + i); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6378c1066459..e09159c957e3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -50,6 +50,17 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; +static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; +static bool hugetlb_cma_page(struct page *page, unsigned int order) +{ + return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + 1 << order); +} +#else +static bool hugetlb_cma_page(struct page *page, unsigned int order) +{ + return false; +} #endif static unsigned long hugetlb_cma_size __initdata; @@ -66,6 +77,7 @@ static struct hstate * 
__initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; +static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -321,8 +333,7 @@ static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB - return rg && org && - rg->reservation_counter == org->reservation_counter && + return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else @@ -435,7 +446,6 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); - VM_BUG_ON(add < 0); return add; } @@ -1004,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) vma->vm_private_data = (void *)0; } +/* + * Reset and decrement one ref on hugepage private reservation. + * Called with mm->mmap_sem writer semaphore held. + * This function should only be used by move_vma() and operates on + * vmas of the same size. It should never be called with the last ref on the + * reservation. + */ +void clear_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + /* + * Clear the old hugetlb private page reservation. + * It has already been transferred to new_vma. + * + * During a mremap() operation of a hugetlb vma we call move_vma() + * which copies vma into new_vma and unmaps vma. After the copy + * operation both new_vma and vma share a reference to the resv_map + * struct, and at that point vma is about to be unmapped. We don't + * want to return the reservation to the pool at unmap of vma because + * the reservation still lives on in new_vma, so simply decrement the + * ref here and remove the resv_map reference from this vma.
+ */ + struct resv_map *reservations = vma_resv_map(vma); + + if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&reservations->refs, resv_map_release); + + reset_vma_resv_huge_pages(vma); +} + /* Returns true if the VMA has associated reserve pages */ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { @@ -1260,9 +1299,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_page(struct page *page, - unsigned int order) +/* used to demote non-gigantic_huge pages as well */ +static void __destroy_compound_gigantic_page(struct page *page, + unsigned int order, bool demote) { int i; int nr_pages = 1 << order; @@ -1272,8 +1311,10 @@ static void destroy_compound_gigantic_page(struct page *page, atomic_set(compound_pincount_ptr(page), 0); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + p->mapping = NULL; clear_compound_head(p); - set_page_refcounted(p); + if (!demote) + set_page_refcounted(p); } set_compound_order(page, 0); @@ -1281,6 +1322,19 @@ static void destroy_compound_gigantic_page(struct page *page, __ClearPageHead(page); } +static void destroy_compound_hugetlb_page_for_demote(struct page *page, + unsigned int order) +{ + __destroy_compound_gigantic_page(page, order, true); +} + +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static void destroy_compound_gigantic_page(struct page *page, + unsigned int order) +{ + __destroy_compound_gigantic_page(page, order, false); +} + static void free_gigantic_page(struct page *page, unsigned int order) { /* @@ -1353,12 +1407,15 @@ static inline void destroy_compound_gigantic_page(struct page *page, /* * Remove hugetlb page from lists, and update dtor so that page appears - * as just a compound page. A reference is held on the page. + * as just a compound page. + * + * A reference is held on the page, except in the case of demote. * * Must be called with hugetlb lock held. */ -static void remove_hugetlb_page(struct hstate *h, struct page *page, - bool adjust_surplus) +static void __remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus, + bool demote) { int nid = page_to_nid(page); @@ -1396,8 +1453,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, * * This handles the case where more than one ref is held when and * after update_and_free_page is called. + * + * In the case of demote we do not ref count the page as it will soon + * be turned into a page of smaller size. 
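To make the "page of smaller size" above concrete: on x86-64 with 4 KiB base pages, demoting one free 1 GiB huge page (262144 base pages) to the 2 MiB size (512 base pages each) produces 262144 / 512 = 512 smaller huge pages; the pages_per_huge_page() ratio is exactly how many pages the demote loop later creates. (The sizes are an illustrative example; the actual demote target is configurable.)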
*/ - set_page_refcounted(page); + if (!demote) + set_page_refcounted(page); if (hstate_is_gigantic(h)) set_compound_page_dtor(page, NULL_COMPOUND_DTOR); else @@ -1407,6 +1468,18 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->nr_huge_pages_node[nid]--; } +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + __remove_hugetlb_page(h, page, adjust_surplus, false); +} + +static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + __remove_hugetlb_page(h, page, adjust_surplus, true); +} + static void add_hugetlb_page(struct hstate *h, struct page *page, bool adjust_surplus) { @@ -1476,7 +1549,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - if (hstate_is_gigantic(h)) { + + /* + * Non-gigantic pages demoted from CMA allocated gigantic pages + * need to be given back to CMA in free_gigantic_page. + */ + if (hstate_is_gigantic(h) || + hugetlb_cma_page(page, huge_page_order(h))) { destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { @@ -1664,7 +1743,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_unlock_irq(&hugetlb_lock); } -static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, + bool demote) { int i, j; int nr_pages = 1 << order; @@ -1702,12 +1782,17 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order) * the set of pages can not be converted to a gigantic page. * The caller who allocated the pages should then discard the * pages using the appropriate free interface. + * + * In the case of demote, the ref count will be zero. */ - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; + if (!demote) { + if (!page_ref_freeze(p, 1)) { + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + goto out_error; + } + } else { + VM_BUG_ON_PAGE(page_count(p), p); } - set_page_count(p, 0); set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); @@ -1730,6 +1815,17 @@ out_error: return false; } +static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +{ + return __prep_compound_gigantic_page(page, order, false); +} + +static bool prep_compound_gigantic_page_for_demote(struct page *page, + unsigned int order) +{ + return __prep_compound_gigantic_page(page, order, true); +} + /* * PageHuge() only returns true for hugetlbfs pages, but not for normal or * transparent huge pages. 
See the PageTransHuge() documentation for more @@ -2868,33 +2964,39 @@ out_subpool_put: return ERR_PTR(-ENOSPC); } -int alloc_bootmem_huge_page(struct hstate *h) +int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); -int __alloc_bootmem_huge_page(struct hstate *h) +int __alloc_bootmem_huge_page(struct hstate *h, int nid) { - struct huge_bootmem_page *m; + struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node; + if (nid >= nr_online_nodes) + return 0; + /* do node specific alloc */ + if (nid != NUMA_NO_NODE) { + m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!m) + return 0; + goto found; + } + /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { - void *addr; - - addr = memblock_alloc_try_nid_raw( + m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); - if (addr) { - /* - * Use the beginning of the huge page to store the - * huge_bootmem_page struct (until gather_bootmem - * puts them into the mem_map). - */ - m = addr; - goto found; - } + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + if (!m) + return 0; + goto found; } - return 0; found: - BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -2934,12 +3036,61 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } +static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) +{ + unsigned long i; + char buf[32]; + + for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { + if (hstate_is_gigantic(h)) { + if (!alloc_bootmem_huge_page(h, nid)) + break; + } else { + struct page *page; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + + page = alloc_fresh_huge_page(h, gfp_mask, nid, + &node_states[N_MEMORY], NULL); + if (!page) + break; + put_page(page); /* free it into the hugepage allocator */ + } + cond_resched(); + } + if (i == h->max_huge_pages_node[nid]) + return; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", + h->max_huge_pages_node[nid], buf, nid, i); + h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); + h->max_huge_pages_node[nid] = i; +} static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; nodemask_t *node_alloc_noretry; + bool node_specific_alloc = false; + /* skip gigantic hugepages allocation if hugetlb_cma enabled */ + if (hstate_is_gigantic(h) && hugetlb_cma_size) { + pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); + return; + } + + /* do node specific alloc */ + for (i = 0; i < nr_online_nodes; i++) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + if (node_specific_alloc) + return; + + /* below will do all node balanced alloc */ if (!hstate_is_gigantic(h)) { /* * Bit mask controlling how hard we retry per-node allocations. 
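The per-node allocation path above is driven by an extended hugepages= boot syntax, handled by the hugepages_setup() parsing added later in this patch. An illustrative example (the counts are invented)::

    hugepagesz=2M hugepages=0:1024,1:512

This requests 1024 huge pages on node 0 and 512 on node 1 at boot, instead of spreading a single global count across nodes; a plain hugepages=N still works and is distributed across nodes as before.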
@@ -2960,11 +3111,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { - if (hugetlb_cma_size) { - pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); - goto free; - } - if (!alloc_bootmem_huge_page(h)) + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY], @@ -2980,13 +3127,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } -free: kfree(node_alloc_noretry); } static void __init hugetlb_init_hstates(void) { - struct hstate *h; + struct hstate *h, *h2; for_each_hstate(h) { if (minimum_order > huge_page_order(h)) @@ -2995,6 +3141,26 @@ static void __init hugetlb_init_hstates(void) /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); + + /* + * Set demote order for each hstate. Note that + * h->demote_order is initially 0. + * - We can not demote gigantic pages if runtime freeing + * is not supported, so skip this. + * - If CMA allocation is possible, we can not demote + * HUGETLB_PAGE_ORDER or smaller size pages. + */ + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + continue; + if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) + continue; + for_each_hstate(h2) { + if (h2 == h) + continue; + if (h2->order < h->order && + h2->order > h->demote_order) + h->demote_order = h2->order; + } } VM_BUG_ON(minimum_order == UINT_MAX); } @@ -3235,9 +3401,100 @@ out: return 0; } +static int demote_free_huge_page(struct hstate *h, struct page *page) +{ + int i, nid = page_to_nid(page); + struct hstate *target_hstate; + int rc = 0; + + target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); + + remove_hugetlb_page_for_demote(h, page, false); + spin_unlock_irq(&hugetlb_lock); + + rc = alloc_huge_page_vmemmap(h, page); + if (rc) { + /* Allocation of vmemmap failed, we can not demote the page */ + spin_lock_irq(&hugetlb_lock); + set_page_refcounted(page); + add_hugetlb_page(h, page, false); + return rc; + } + + /* + * Use destroy_compound_hugetlb_page_for_demote for all huge page + * sizes as it will not ref count pages. + */ + destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + + /* + * Taking target hstate mutex synchronizes with set_max_huge_pages. + * Without the mutex, pages added to target hstate could be marked + * as surplus. + * + * Note that we already hold h->resize_lock. To prevent deadlock, + * use the convention of always taking larger size hstate mutex first. + */ + mutex_lock(&target_hstate->resize_lock); + for (i = 0; i < pages_per_huge_page(h); + i += pages_per_huge_page(target_hstate)) { + if (hstate_is_gigantic(target_hstate)) + prep_compound_gigantic_page_for_demote(page + i, + target_hstate->order); + else + prep_compound_page(page + i, target_hstate->order); + set_page_private(page + i, 0); + set_page_refcounted(page + i); + prep_new_huge_page(target_hstate, page + i, nid); + put_page(page + i); + } + mutex_unlock(&target_hstate->resize_lock); + + spin_lock_irq(&hugetlb_lock); + + /* + * Not absolutely necessary, but for consistency update max_huge_pages + * based on pool changes for the demoted page.
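The demote_order computed above is what backs the new demote and demote_size sysfs attributes added further below. A usage sketch, assuming x86-64 default sizes and the standard hugepages sysfs hierarchy (output illustrative)::

    # cat /sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
    2048kB
    # echo 2 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote

The write asks demote_store() to demote up to two free 1 GiB pages (reserved pages are excluded from the count) into 2 MiB pages, one page per demote_pool_huge_page() call.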
+ */ + h->max_huge_pages--; + target_hstate->max_huge_pages += pages_per_huge_page(h); + + return rc; +} + +static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) + __must_hold(&hugetlb_lock) +{ + int nr_nodes, node; + struct page *page; + int rc = 0; + + lockdep_assert_held(&hugetlb_lock); + + /* We should never get here if no demote order */ + if (!h->demote_order) { + pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); + return -EINVAL; /* internal error */ + } + + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (!list_empty(&h->hugepage_freelists[node])) { + page = list_entry(h->hugepage_freelists[node].next, + struct page, lru); + rc = demote_free_huge_page(h, page); + break; + } + } + + return rc; +} + #define HSTATE_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define HSTATE_ATTR_WO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_WO(_name) + #define HSTATE_ATTR(_name) \ static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) @@ -3433,6 +3690,103 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj, } HSTATE_ATTR_RO(surplus_hugepages); +static ssize_t demote_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_demote; + unsigned long nr_available; + nodemask_t nodes_allowed, *n_mask; + struct hstate *h; + int err = 0; + int nid; + + err = kstrtoul(buf, 10, &nr_demote); + if (err) + return err; + h = kobj_to_hstate(kobj, &nid); + + if (nid != NUMA_NO_NODE) { + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } else { + n_mask = &node_states[N_MEMORY]; + } + + /* Synchronize with other sysfs operations modifying huge pages */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); + + while (nr_demote) { + /* + * Check for available pages to demote each time through the
+ */ + if (nid != NUMA_NO_NODE) + nr_available = h->free_huge_pages_node[nid]; + else + nr_available = h->free_huge_pages; + nr_available -= h->resv_huge_pages; + if (!nr_available) + break; + + err = demote_pool_huge_page(h, n_mask); + if (err) + break; + + nr_demote--; + } + + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + + if (err) + return err; + return len; +} +HSTATE_ATTR_WO(demote); + +static ssize_t demote_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int nid; + struct hstate *h = kobj_to_hstate(kobj, &nid); + unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; + + return sysfs_emit(buf, "%lukB\n", demote_size); +} + +static ssize_t demote_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hstate *h, *demote_hstate; + unsigned long demote_size; + unsigned int demote_order; + int nid; + + demote_size = (unsigned long)memparse(buf, NULL); + + demote_hstate = size_to_hstate(demote_size); + if (!demote_hstate) + return -EINVAL; + demote_order = demote_hstate->order; + if (demote_order < HUGETLB_PAGE_ORDER) + return -EINVAL; + + /* demote order must be smaller than hstate order */ + h = kobj_to_hstate(kobj, &nid); + if (demote_order >= h->order) + return -EINVAL; + + /* resize_lock synchronizes access to demote size and writes */ + mutex_lock(&h->resize_lock); + h->demote_order = demote_order; + mutex_unlock(&h->resize_lock); + + return count; +} +HSTATE_ATTR(demote_size); + static struct attribute *hstate_attrs[] = { &nr_hugepages_attr.attr, &nr_overcommit_hugepages_attr.attr, @@ -3449,6 +3803,16 @@ static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; +static struct attribute *hstate_demote_attrs[] = { + &demote_size_attr.attr, + &demote_attr.attr, + NULL, +}; + +static const struct attribute_group hstate_demote_attr_group = { + .attrs = hstate_demote_attrs, +}; + static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, const struct attribute_group *hstate_attr_group) @@ -3466,6 +3830,12 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, hstate_kobjs[hi] = NULL; } + if (h->demote_order) { + if (sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group)) + pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + } + return retval; } @@ -3671,6 +4041,10 @@ static int __init hugetlb_init(void) } default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + for (i = 0; i < nr_online_nodes; i++) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; } } @@ -3731,6 +4105,10 @@ void __init hugetlb_add_hstate(unsigned int order) parsed_hstate = h; } +bool __init __weak hugetlb_node_alloc_supported(void) +{ + return true; +} /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz @@ -3742,6 +4120,10 @@ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; + int node = NUMA_NO_NODE; + int count; + unsigned long tmp; + char *p = s; if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); @@ -3765,8 +4147,40 @@ static int __init hugepages_setup(char *s) return 0; } - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; + while (*p) { + count = 0; + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + /* Parameter is node format */ + if (p[count] == ':') { + if 
(!hugetlb_node_alloc_supported()) { + pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); + return 0; + } + node = tmp; + p += count + 1; + if (node < 0 || node >= nr_online_nodes) + goto invalid; + /* Parse hugepages */ + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + if (!hugetlb_max_hstate) + default_hugepages_in_node[node] = tmp; + else + parsed_hstate->max_huge_pages_node[node] = tmp; + *mhp += tmp; + /* Go to parse next node*/ + if (p[count] == ',') + p += count + 1; + else + break; + } else { + if (p != s) + goto invalid; + *mhp = tmp; + break; + } + } /* * Global state is always initialized later in hugetlb_init. @@ -3779,6 +4193,10 @@ static int __init hugepages_setup(char *s) last_mhp = mhp; return 1; + +invalid: + pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); + return 0; } __setup("hugepages=", hugepages_setup); @@ -3840,6 +4258,7 @@ __setup("hugepagesz=", hugepagesz_setup); static int __init default_hugepagesz_setup(char *s) { unsigned long size; + int i; parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { @@ -3868,6 +4287,9 @@ static int __init default_hugepagesz_setup(char *s) */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; + for (i = 0; i < nr_online_nodes; i++) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; if (hstate_is_gigantic(&default_hstate)) hugetlb_hstate_alloc_pages(&default_hstate); default_hstate_max_huge_pages = 0; @@ -4426,9 +4848,85 @@ again: return ret; } -void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct page *ref_page) +static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pte_t *src_pte) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + pte_t *dst_pte, pte; + spinlock_t *src_ptl, *dst_ptl; + + dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h)); + dst_ptl = huge_pte_lock(h, mm, dst_pte); + src_ptl = huge_pte_lockptr(h, mm, src_pte); + + /* + * We don't have to worry about the ordering of src and dst ptlocks + * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock. 
+ */ + if (src_ptl != dst_ptl) + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + set_huge_pte_at(mm, new_addr, dst_pte, pte); + + if (src_ptl != dst_ptl) + spin_unlock(src_ptl); + spin_unlock(dst_ptl); +} + +int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, unsigned long new_addr, + unsigned long len) +{ + struct hstate *h = hstate_vma(vma); + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + unsigned long old_end = old_addr + len; + unsigned long old_addr_copy; + pte_t *src_pte, *dst_pte; + struct mmu_notifier_range range; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + old_end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); + /* Prevent race with file truncation */ + i_mmap_lock_write(mapping); + for (; old_addr < old_end; old_addr += sz, new_addr += sz) { + src_pte = huge_pte_offset(mm, old_addr, sz); + if (!src_pte) + continue; + if (huge_pte_none(huge_ptep_get(src_pte))) + continue; + + /* old_addr arg to huge_pmd_unshare() is a pointer and so the + * arg may be modified. Pass a copy instead to preserve the + * value in old_addr. + */ + old_addr_copy = old_addr; + + if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) + continue; + + dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); + if (!dst_pte) + break; + + move_huge_pte(vma, old_addr, new_addr, src_pte); + } + i_mmap_unlock_write(mapping); + flush_tlb_range(vma, old_end - len, old_end); + mmu_notifier_invalidate_range_end(&range); + + return len + old_addr - old_end; +} + +static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -4616,7 +5114,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, /* * Hugetlb_cow() should be called with page lock of the original hugepage held. - * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ @@ -5965,12 +6463,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * sharing is possible. For hugetlbfs, this prevents removal of any page * table entries associated with the address space. This is important as we * are setting up sharing based on existing page table entries (mappings). - * - * NOTE: This routine is only called from huge_pte_alloc. Some callers of - * huge_pte_alloc know that sharing is not possible and do not take - * i_mmap_rwsem as a performance optimization. This is handled by the - * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is - * only required for subsequent processing. 
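move_hugetlb_page_tables() above is what lets mremap() work on hugetlb-backed mappings by relocating the huge PTEs instead of copying page contents. A minimal userspace sketch of the operation (my own illustration, not part of the patch, assuming a 2 MiB default huge page size and free pages in the hugetlb pool)::

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t sz = 2UL << 20;	/* one 2 MiB huge page */
        void *old = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (old == MAP_FAILED)
            return 1;
        /* Old and new sizes (and any fixed target address) must stay
         * multiples of the huge page size; MREMAP_MAYMOVE lets the
         * kernel relocate the mapping, exercising the PTE-moving
         * path above.
         */
        void *new = mremap(old, sz, 2 * sz, MREMAP_MAYMOVE);
        printf("%p -> %p\n", old, new == MAP_FAILED ? old : new);
        return new == MAP_FAILED;
    }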
*/ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -6371,7 +6863,38 @@ static bool cma_reserve_called __initdata; static int __init cmdline_parse_hugetlb_cma(char *p) { - hugetlb_cma_size = memparse(p, &p); + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + nid = tmp; + if (nid < 0 || nid >= MAX_NUMNODES) + break; + + s += count + 1; + tmp = memparse(s, &s); + hugetlb_cma_size_in_node[nid] = tmp; + hugetlb_cma_size += tmp; + + /* + * Skip the separator if we have one, otherwise + * stop the parsing. + */ + if (*s == ',') + s++; + else + break; + } else { + hugetlb_cma_size = memparse(p, &p); + break; + } + } + return 0; } @@ -6380,37 +6903,80 @@ early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); void __init hugetlb_cma_reserve(int order) { unsigned long size, reserved, per_node; + bool node_specific_cma_alloc = false; int nid; cma_reserve_called = true; + if (!hugetlb_cma_size) + return; + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + if (!node_state(nid, N_ONLINE)) { + pr_warn("hugetlb_cma: invalid node %d specified\n", nid); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + continue; + } + + if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", + nid, (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + } else { + node_specific_cma_alloc = true; + } + } + + /* Validate the CMA size again in case some invalid nodes were specified. */ if (!hugetlb_cma_size) return; if (hugetlb_cma_size < (PAGE_SIZE << order)) { pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size = 0; return; } - /* - * If 3 GB area is requested on a machine with 4 numa nodes, - * let's allocate 1 GB on first three nodes and ignore the last one. - */ - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); - pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", - hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + if (!node_specific_cma_alloc) { + /* + * If 3 GB area is requested on a machine with 4 numa nodes, + * let's allocate 1 GB on first three nodes and ignore the last one. + */ + per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); + pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", + hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + } reserved = 0; for_each_node_state(nid, N_ONLINE) { int res; char name[CMA_MAX_NAME]; - size = min(per_node, hugetlb_cma_size - reserved); + if (node_specific_cma_alloc) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + size = hugetlb_cma_size_in_node[nid]; + } else { + size = min(per_node, hugetlb_cma_size - reserved); + } + size = round_up(size, PAGE_SIZE << order); snprintf(name, sizeof(name), "hugetlb%d", nid); - res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, + /* + * Note that 'order per bit' is based on the smallest size that + * may be returned to the CMA allocator in the case of + * huge page demotion.
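For the command-line side of this, the parser above accepts both the old global form and a new per-node form; an illustrative line (sizes invented)::

    hugetlb_cma=0:2G,2:2G

reserves a 2 GiB CMA area on nodes 0 and 2 only, while a plain hugetlb_cma=4G keeps the old behavior of splitting the reservation evenly across online nodes. Per-node requests naming offline nodes, or areas smaller than one gigantic page, are dropped with a warning by hugetlb_cma_reserve() as shown above.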
+ */ + res = cma_declare_contiguous_nid(0, size, 0, + PAGE_SIZE << HUGETLB_PAGE_ORDER, 0, false, name, &hugetlb_cma[nid], nid); if (res) { @@ -6426,6 +6992,13 @@ void __init hugetlb_cma_reserve(int order) if (reserved >= hugetlb_cma_size) break; } + + if (!reserved) + /* + * hugetlb_cma_size is used to determine if allocations from + * cma are possible. Set to zero if no cma regions are set up. + */ + hugetlb_cma_size = 0; } void __init hugetlb_cma_check(void) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 5383023d0cca..79d93534ef1e 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -27,9 +27,6 @@ #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -#define hugetlb_cgroup_from_counter(counter, idx) \ - container_of(counter, struct hugetlb_cgroup, hugepage[idx]) - static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct page_counter * diff --git a/mm/internal.h b/mm/internal.h index b1001ebeb286..3b79a5c9427a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -41,12 +41,33 @@ static inline void *folio_raw_mapping(struct folio *folio) return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } +void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, + int nr_throttled); +static inline void acct_reclaim_writeback(struct folio *folio) +{ + pg_data_t *pgdat = folio_pgdat(folio); + int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); + + if (nr_throttled) + __acct_reclaim_writeback(pgdat, folio, nr_throttled); +} + +static inline void wake_throttle_isolated(pg_data_t *pgdat) +{ + wait_queue_head_t *wqh; + + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; + if (waitqueue_active(wqh)) + wake_up(wqh); +} + vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { @@ -129,6 +150,7 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2baf121fb8c5..8428da2aaf17 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,20 +30,20 @@ #include "kasan.h" #include "../slab.h" -depot_stack_handle_t kasan_save_stack(gfp_t flags) +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); nr_entries = filter_irq_stacks(entries, nr_entries); - return stack_depot_save(entries, nr_entries, flags); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = kasan_save_stack(flags); + track->stack = kasan_save_stack(flags, true); } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) @@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache, /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB /* For SLAB assign tags based on the object index in the freelist. 
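The virt_to_head_page() change just below matters for SLAB caches whose slabs span multiple pages: virt_to_page() on an object that sits in a tail page returns that tail page, while obj_to_index() computes the index relative to the slab's base address kept with the head page, so tag assignment could previously derive indices from the wrong page.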
*/ - return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); + return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object); #else /* * For SLUB assign a random tag during slab creation, otherwise reuse diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index c3f5ba7a294a..84a038b07c6f 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,7 +328,7 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); -void kasan_record_aux_stack(void *addr) +static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct page *page = kasan_addr_to_page(addr); struct kmem_cache *cache; @@ -345,7 +345,17 @@ void kasan_record_aux_stack(void *addr) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); + alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc); +} + +void kasan_record_aux_stack(void *addr) +{ + return __kasan_record_aux_stack(addr, true); +} + +void kasan_record_aux_stack_noalloc(void *addr) +{ + return __kasan_record_aux_stack(addr, false); } void kasan_set_free_info(struct kmem_cache *cache, diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b495e17445ad..aebd8df86a1f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -266,7 +266,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip); struct page *kasan_addr_to_page(const void *addr); -depot_stack_handle_t kasan_save_stack(gfp_t flags); +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 8d95ee52d019..4a4929b29a23 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init); #ifdef CONFIG_KASAN_VMALLOC +void __init __weak kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ +} + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 7a97db8bc8e7..09945784df9e 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -10,12 +10,15 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = { }; module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); +/* Pool usage% threshold when currently covered allocations are skipped. */ +static unsigned long kfence_skip_covered_thresh __read_mostly = 75; +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); + /* The pool of pages used for guard pages and objects. */ char *__kfence_pool __ro_after_init; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ @@ -97,14 +104,41 @@ struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ -#ifdef CONFIG_KFENCE_STATIC_KEYS -/* The static key to set up a KFENCE allocation. */ +/* + * The static key to set up a KFENCE allocation; or if static keys are not used + * to gate allocations, to avoid a load and compare if KFENCE is disabled. 
+ */ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); -#endif /* Gates the allocation, ensuring only one succeeds in a given period. */ atomic_t kfence_allocation_gate = ATOMIC_INIT(1); +/* + * A Counting Bloom filter of allocation coverage: limits currently covered + * allocations of the same source filling up the pool. + * + * Assuming a range of 15%-85% unique allocations in the pool at any point in + * time, the below parameters provide a probability of 0.02-0.33 for false + * positive hits respectively: + * + * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM + */ +#define ALLOC_COVERED_HNUM 2 +#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) +#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) +#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) +static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; + +/* Stack depth used to determine uniqueness of an allocation. */ +#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) + +/* + * Randomness for stack hashes, making the same collisions across reboots and + * different machines less likely. + */ +static u32 stack_hash_seed __ro_after_init; + /* Statistics counters for debugfs. */ enum kfence_counter_id { KFENCE_COUNTER_ALLOCATED, @@ -112,6 +146,9 @@ enum kfence_counter_id { KFENCE_COUNTER_FREES, KFENCE_COUNTER_ZOMBIES, KFENCE_COUNTER_BUGS, + KFENCE_COUNTER_SKIP_INCOMPAT, + KFENCE_COUNTER_SKIP_CAPACITY, + KFENCE_COUNTER_SKIP_COVERED, KFENCE_COUNTER_COUNT, }; static atomic_long_t counters[KFENCE_COUNTER_COUNT]; @@ -121,11 +158,59 @@ static const char *const counter_names[] = { [KFENCE_COUNTER_FREES] = "total frees", [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", [KFENCE_COUNTER_BUGS] = "total bugs", + [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", + [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", + [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", }; static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); /* === Internals ============================================================ */ +static inline bool should_skip_covered(void) +{ + unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100; + + return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; +} + +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) +{ + num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); + num_entries = filter_irq_stacks(stack_entries, num_entries); + return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); +} + +/* + * Adds (or subtracts) count @val for allocation stack trace hash + * @alloc_stack_hash from the Counting Bloom filter. + */ +static void alloc_covered_add(u32 alloc_stack_hash, int val) +{ + int i; + + for (i = 0; i < ALLOC_COVERED_HNUM; i++) { + atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); + alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); + } +} + +/* + * Returns true if the allocation stack trace hash @alloc_stack_hash is + * currently contained (non-zero count) in the Counting Bloom filter.
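For readers who have not met the data structure: a counting Bloom filter is a Bloom filter whose bits are replaced by counters, so that elements can also be removed again (here: on free). A tiny self-contained userspace rendition of the same two-hash scheme, purely illustrative (names and constants are made up; the kernel code above is the authoritative version)::

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define CB_ORDER 10                 /* 1024 counters */
    #define CB_SIZE  (1u << CB_ORDER)
    #define CB_MASK  (CB_SIZE - 1)
    #define CB_HNUM  2                  /* counters touched per element */

    static atomic_int covered[CB_SIZE];

    /* Derive the next index, in the spirit of ALLOC_COVERED_HNEXT()'s
     * hash_32(): multiply by the 32-bit golden ratio, keep top bits. */
    static uint32_t cb_next(uint32_t h)
    {
        return (h * 2654435769u) >> (32 - CB_ORDER);
    }

    static void cb_add(uint32_t h, int val)  /* val = +1 insert, -1 remove */
    {
        for (int i = 0; i < CB_HNUM; i++) {
            atomic_fetch_add(&covered[h & CB_MASK], val);
            h = cb_next(h);
        }
    }

    static bool cb_contains(uint32_t h)
    {
        for (int i = 0; i < CB_HNUM; i++) {
            if (!atomic_load(&covered[h & CB_MASK]))
                return false;   /* a zero counter proves absence */
            h = cb_next(h);
        }
        return true;            /* all counters non-zero: probably present */
    }

As with any Bloom filter, cb_contains() can return false positives but never false negatives, which is the right trade-off here: the worst case is skipping a KFENCE allocation that was not actually redundant.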
+ */ +static bool alloc_covered_contains(u32 alloc_stack_hash) +{ + int i; + + for (i = 0; i < ALLOC_COVERED_HNUM; i++) { + if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) + return false; + alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); + } + + return true; +} + static bool kfence_protect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); @@ -183,19 +268,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. */ -static noinline void metadata_update_state(struct kfence_metadata *meta, - enum kfence_object_state next) +static noinline void +metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, + unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; lockdep_assert_held(&meta->lock); - /* - * Skip over 1 (this) functions; noinline ensures we do not accidentally - * skip over the caller by never inlining. - */ - track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + if (stack_entries) { + memcpy(track->stack_entries, stack_entries, + num_stack_entries * sizeof(stack_entries[0])); + } else { + /* + * Skip over 1 (this) functions; noinline ensures we do not + * accidentally skip over the caller by never inlining. + */ + num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + } + track->num_stack_entries = num_stack_entries; track->pid = task_pid_nr(current); track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ @@ -218,12 +310,19 @@ static inline bool set_canary_byte(u8 *addr) /* Check canary byte at @addr. */ static inline bool check_canary_byte(u8 *addr) { + struct kfence_metadata *meta; + unsigned long flags; + if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr), - KFENCE_ERROR_CORRUPTION); + + meta = addr_to_metadata((unsigned long)addr); + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); + raw_spin_unlock_irqrestore(&meta->lock, flags); + return false; } @@ -233,8 +332,6 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta, const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr; - lockdep_assert_held(&meta->lock); - /* * We'll iterate over each canary byte per-side until fn() returns * false. 
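Worth noting about the lockdep_assert_held() dropped above: with this patch the canary bytes are written and verified outside meta->lock. kfence_guarded_alloc() now releases the lock before calling for_each_canary(meta, set_canary_byte), and kfence_guarded_free() checks the canaries after updating the metadata and unlocking, which is why check_canary_byte() was taught to take meta->lock itself just around the error report.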
However, we'll still iterate over the canary bytes to the @@ -257,7 +354,9 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta, } } -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp) +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, + unsigned long *stack_entries, size_t num_stack_entries, + u32 alloc_stack_hash) { struct kfence_metadata *meta = NULL; unsigned long flags; @@ -271,8 +370,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g list_del_init(&meta->list); } raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); - if (!meta) + if (!meta) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); return NULL; + } if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { /* @@ -314,11 +415,14 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g addr = (void *)meta->addr; /* Update remaining metadata. */ - metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED); + metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ WRITE_ONCE(meta->cache, cache); meta->size = size; - for_each_canary(meta, set_canary_byte); + meta->alloc_stack_hash = alloc_stack_hash; + raw_spin_unlock_irqrestore(&meta->lock, flags); + + alloc_covered_add(alloc_stack_hash, 1); /* Set required struct page fields. */ page = virt_to_page(meta->addr); @@ -328,9 +432,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g if (IS_ENABLED(CONFIG_SLAB)) page->s_mem = addr; - raw_spin_unlock_irqrestore(&meta->lock, flags); - /* Memory initialization. */ + for_each_canary(meta, set_canary_byte); /* * We check slab_want_init_on_alloc() ourselves, rather than letting @@ -355,6 +458,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z { struct kcsan_scoped_access assert_page_exclusive; unsigned long flags; + bool init; raw_spin_lock_irqsave(&meta->lock, flags); @@ -382,6 +486,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z meta->unprotected_page = 0; } + /* Mark the object as freed. */ + metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); + init = slab_want_init_on_free(meta->cache); + raw_spin_unlock_irqrestore(&meta->lock, flags); + + alloc_covered_add(meta->alloc_stack_hash, -1); + /* Check canary bytes for memory corruption. */ for_each_canary(meta, check_canary_byte); @@ -390,14 +501,9 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z * data is still there, and after a use-after-free is detected, we * unprotect the page, so the data is still accessible. */ - if (!zombie && unlikely(slab_want_init_on_free(meta->cache))) + if (!zombie && unlikely(init)) memzero_explicit(addr, meta->size); - /* Mark the object as freed. */ - metadata_update_state(meta, KFENCE_OBJECT_FREED); - - raw_spin_unlock_irqrestore(&meta->lock, flags); - /* Protect to detect use-after-frees. 
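+ * Any later access to the protected page faults, and the fault handler
+ * reports it as a use-after-free.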
*/ kfence_protect((unsigned long)addr); @@ -663,11 +769,14 @@ void __init kfence_init(void) if (!kfence_sample_interval) return; + stack_hash_seed = (u32)random_get_entropy(); if (!kfence_init_pool()) { pr_err("%s failed\n", __func__); return; } + if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) + static_branch_enable(&kfence_allocation_key); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, @@ -736,12 +845,18 @@ void kfence_shutdown_cache(struct kmem_cache *s) void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { + unsigned long stack_entries[KFENCE_STACK_DEPTH]; + size_t num_stack_entries; + u32 alloc_stack_hash; + /* * Perform size check before switching kfence_allocation_gate, so that * we don't disable KFENCE without making an allocation. */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; + } /* * Skip allocations from non-default zones, including DMA. We cannot @@ -749,15 +864,12 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || - (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) + (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; + } - /* - * allocation_gate only needs to become non-zero, so it doesn't make - * sense to continue writing to it and pay the associated contention - * cost, in case we have a large number of concurrent allocations. - */ - if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) + if (atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* @@ -776,7 +888,25 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) if (!READ_ONCE(kfence_enabled)) return NULL; - return kfence_guarded_alloc(s, size, flags); + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); + + /* + * Do expensive check for coverage of allocation in slow-path after + * allocation_gate has already become non-zero, even though it might + * mean not making any allocation within a given sample interval. + * + * This ensures reasonable allocation coverage when the pool is almost + * full, including avoiding long-lived allocations of the same source + * filling up the pool (e.g. pagecache allocations). + */ + alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); + if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); + return NULL; + } + + return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, + alloc_stack_hash); } size_t kfence_ksize(const void *addr) diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index c1f23c61e5f9..2a2d5de9d379 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -87,6 +87,8 @@ struct kfence_metadata { /* Allocation and free stack information. */ struct kfence_track alloc_track; struct kfence_track free_track; + /* For updating alloc_covered on frees. 
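+ * Saved at allocation time so that kfence_guarded_free() can decrement
+ * the same Counting Bloom filter slots (alloc_covered) that the
+ * allocation incremented.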
*/ + u32 alloc_stack_hash; }; extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index f1690cf54199..695030c1fff8 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -32,6 +32,11 @@ #define arch_kfence_test_address(addr) (addr) #endif +#define KFENCE_TEST_REQUIRES(test, cond) do { \ + if (!(cond)) \ + kunit_skip((test), "Test requires: " #cond); \ +} while (0) + /* Report as observed from console. */ static struct { spinlock_t lock; @@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test) }; int i; - if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)) - return; + KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)); /* Assume it hasn't been disabled on command line. */ setup_test_cache(test, size, 0, NULL); @@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test) char *buf1, *buf2; int i; - if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) { - kunit_warn(test, "skipping ... would take too long\n"); - return; - } + /* Skip if we think it'd take too long. */ + KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100); setup_test_cache(test, size, 0, NULL); buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5f02fda6f265..e99101162f1a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2299,6 +2299,11 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; + if (!khugepaged_enabled()) { + calculate_min_free_kbytes(); + goto update_wmarks; + } + for_each_populated_zone(zone) { /* * We don't need to worry about fragmentation of @@ -2334,6 +2339,8 @@ static void set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } + +update_wmarks: setup_per_zone_wmarks(); } @@ -2355,12 +2362,11 @@ int start_stop_khugepaged(void) if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); - - set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } + set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; diff --git a/mm/list_lru.c b/mm/list_lru.c index cd58790d0fb3..0cd5e89ca063 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,18 +15,29 @@ #include "slab.h" #ifdef CONFIG_MEMCG_KMEM -static LIST_HEAD(list_lrus); +static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return lru->memcg_aware; +} + static void list_lru_register(struct list_lru *lru) { + if (!list_lru_memcg_aware(lru)) + return; + mutex_lock(&list_lrus_mutex); - list_add(&lru->list, &list_lrus); + list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { + if (!list_lru_memcg_aware(lru)) + return; + mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); @@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru) return lru->shrinker_id; } -static inline bool list_lru_memcg_aware(struct list_lru *lru) -{ - return lru->memcg_aware; -} - static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { @@ -176,13 +182,16 @@ unsigned long list_lru_count_one(struct list_lru *lru, { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - unsigned long count; + long count; rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, 
memcg_cache_id(memcg)); count = READ_ONCE(l->nr_items); rcu_read_unlock(); + if (unlikely(count < 0)) + count = 0; + return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); @@ -354,8 +363,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + - size * sizeof(void *), GFP_KERNEL); + memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL); if (!memcg_lrus) return -ENOMEM; @@ -389,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, old = rcu_dereference_protected(nlru->memcg_lrus, lockdep_is_held(&list_lrus_mutex)); - new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); + new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL); if (!new) return -ENOMEM; @@ -398,19 +406,8 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); - - /* - * The locking below allows readers that hold nlru->lock avoid taking - * rcu_read_lock (see list_lru_from_memcg_idx). - * - * Since list_lru_{add,del} may be called under an IRQ-safe lock, - * we have to use IRQ-safe primitives here to avoid deadlock. - */ - spin_lock_irq(&nlru->lock); + memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size)); rcu_assign_pointer(nlru->memcg_lrus, new); - spin_unlock_irq(&nlru->lock); - kvfree_rcu(old, rcu); return 0; } @@ -466,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return 0; - for_each_node(i) { if (memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) @@ -491,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return; - for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); @@ -506,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size) int old_size = memcg_nr_cache_ids; mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &list_lrus, list) { + list_for_each_entry(lru, &memcg_list_lrus, list) { ret = memcg_update_list_lru(lru, old_size, new_size); if (ret) goto fail; @@ -515,7 +506,7 @@ out: mutex_unlock(&list_lrus_mutex); return ret; fail: - list_for_each_entry_continue_reverse(lru, &list_lrus, list) + list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list) memcg_cancel_update_list_lru(lru, old_size, new_size); goto out; } @@ -552,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return; - for_each_node(i) memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); } @@ -564,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) struct list_lru *lru; mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &list_lrus, list) + list_for_each_entry(lru, &memcg_list_lrus, list) memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } diff --git a/mm/memblock.c b/mm/memblock.c index 5096500b2647..659bf0ffb086 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -366,14 +366,14 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - __memblock_free_late(addr, size); + memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * 
memblock.memory.max); - __memblock_free_late(addr, size); + memblock_free_late(addr, size); } memblock_memory = NULL; @@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free_ptr(old_array, old_alloc_size); + memblock_free(old_array, old_alloc_size); /* * Reserve the new array if that comes from the memblock. Otherwise, we @@ -655,6 +655,7 @@ repeat: * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base, @base + @size) to the "memory" * type. See memblock_add_range() description for mode details @@ -663,14 +664,14 @@ repeat: * 0 on success, -errno on failure. */ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, - int nid) + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__, - &base, &end, nid, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.memory, base, size, nid, 0); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } /** @@ -796,28 +797,28 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) } /** - * memblock_free_ptr - free boot memory allocation + * memblock_free - free boot memory allocation * @ptr: starting address of the boot memory allocation * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ -void __init_memblock memblock_free_ptr(void *ptr, size_t size) +void __init_memblock memblock_free(void *ptr, size_t size) { if (ptr) - memblock_free(__pa(ptr), size); + memblock_phys_free(__pa(ptr), size); } /** - * memblock_free - free boot memory block + * memblock_phys_free - free boot memory block * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. */ -int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; @@ -981,6 +982,10 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) return true; + /* skip driver-managed memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) + return true; + return false; } @@ -1589,7 +1594,7 @@ void * __init memblock_alloc_try_nid( } /** - * __memblock_free_late - free pages directly to buddy allocator + * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * @@ -1597,7 +1602,7 @@ void * __init memblock_alloc_try_nid( * down, but we are still initializing the system. Pages are released directly * to the buddy allocator. 
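+ *
+ * (The free family after this rename: memblock_free() frees by virtual
+ * pointer, memblock_phys_free() frees a physical address range, and
+ * memblock_free_late() releases pages directly to the buddy allocator
+ * once it is up.)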
*/ -void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +void __init memblock_free_late(phys_addr_t base, phys_addr_t size) { phys_addr_t cursor, end; @@ -1937,7 +1942,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) * memmap array. */ if (pg < pgend) - memblock_free(pg, pgend - pg); + memblock_phys_free(pg, pgend - pg); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dab23a71fc4..508bcea7df56 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,11 +103,6 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } -/* memcg and lruvec stats flushing */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_SPINLOCK(stats_flush_lock); - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -239,7 +234,7 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool should_force_charge(void) +static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || (current->flags & PF_EXITING); @@ -613,6 +608,58 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing, the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * the rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on the reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization can leave the + * stats out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) update events, but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +{ + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) + atomic_inc(&stats_flush_threshold); +} + +static void __mem_cgroup_flush_stats(void) +{ + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock_irqrestore(&stats_flush_lock, flag); +} + +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -625,7 +672,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } /* idx can be of type enum memcg_stat_item or node_stat_item.
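+ * (enum memcg_stat_item extends enum node_stat_item into one index space
+ * with memcg-only counters such as MEMCG_SWAP, which is why a single
+ * @idx can name either kind of statistic.)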
*/ @@ -653,10 +700,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* Update memcg */ - __mod_memcg_state(memcg, idx, val); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg); } /** @@ -758,7 +807,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -1415,7 +1464,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -1576,7 +1625,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = should_force_charge() || out_of_memory(&oc); + ret = task_is_dying() || out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -2544,6 +2593,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; enum oom_status oom_status; unsigned long nr_reclaimed; + bool passed_oom = false; bool may_swap = true; bool drained = false; unsigned long pflags; @@ -2578,15 +2628,6 @@ retry: if (gfp_mask & __GFP_ATOMIC) goto force; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(should_force_charge())) - goto force; - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. 
This might exceed the limits temporarily, @@ -2644,8 +2685,9 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (fatal_signal_pending(current)) - goto force; + /* Avoid endless loop for tasks bypassed by the oom killer */ + if (passed_oom && task_is_dying()) + goto nomem; /* * keep retrying as long as the memcg oom killer is able to make @@ -2654,14 +2696,10 @@ retry: */ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); - switch (oom_status) { - case OOM_SUCCESS: + if (oom_status == OOM_SUCCESS) { + passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; - case OOM_FAILED: - goto force; - default: - goto nomem; } nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2736,8 +2774,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) -static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2746,7 +2783,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); } -#endif static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { @@ -2965,7 +3001,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, unsigned int nr_pages) { - struct page_counter *counter; struct mem_cgroup *memcg; int ret; @@ -2975,21 +3010,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && - !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { - - /* - * Enforce __GFP_NOFAIL allocation because callers are not - * prepared to see failures and likely do not have any failure - * handling code. 
- */ - if (gfp & __GFP_NOFAIL) { - page_counter_charge(&memcg->kmem, nr_pages); - goto out; - } - cancel_charge(memcg, nr_pages); - ret = -ENOMEM; - } + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_charge(&memcg->kmem, nr_pages); out: css_put(&memcg->css); @@ -3481,19 +3503,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { - int progress; - if (signal_pending(current)) return -EINTR; - progress = try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, true); - if (!progress) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) nr_retries--; - /* maybe some writeback is necessary */ - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - } return 0; @@ -3534,8 +3548,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - /* mem_cgroup_threshold() calls here from irqsafe context */ - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3610,7 +3623,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) @@ -3627,22 +3639,18 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); memcg->kmemcg_id = memcg_id; - memcg->kmem_state = KMEM_ONLINE; return 0; } static void memcg_offline_kmem(struct mem_cgroup *memcg) { - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; + struct mem_cgroup *parent; int kmemcg_id; - if (memcg->kmem_state != KMEM_ONLINE) + if (memcg->kmemcg_id == -1) return; - memcg->kmem_state = KMEM_ALLOCATED; - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; @@ -3653,31 +3661,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) BUG_ON(kmemcg_id < 0); /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by + * After we have finished memcg_reparent_objcgs(), all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. + * The ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). 
*/ - rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - } - rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - /* css_alloc() failed, offlining didn't happen */ - if (unlikely(memcg->kmem_state == KMEM_ONLINE)) - memcg_offline_kmem(memcg); + memcg->kmemcg_id = -1; } #else static int memcg_online_kmem(struct mem_cgroup *memcg) @@ -3687,22 +3679,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static int memcg_update_kmem_max(struct mem_cgroup *memcg, - unsigned long max) -{ - int ret; - - mutex_lock(&memcg_max_mutex); - ret = page_counter_set_max(&memcg->kmem, max); - mutex_unlock(&memcg_max_mutex); - return ret; -} - static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; @@ -3768,10 +3746,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); - ret = memcg_update_kmem_max(memcg, nr_pages); + /* kmem.limit_in_bytes is deprecated. */ + ret = -EOPNOTSUPP; break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); @@ -3916,7 +3892,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -3988,7 +3964,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4491,7 +4467,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5324,7 +5300,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - memcg_free_kmem(memcg); + + /* Need to offline kmem if online_css() fails */ + memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } @@ -5357,21 +5335,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (!spin_trylock(&stats_flush_lock)) - return; - - cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); - spin_unlock(&stats_flush_lock); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); -} - static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup 
*memcg = mem_cgroup_from_css(css); @@ -5561,7 +5524,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, #endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + unsigned long addr, pte_t ptent) { if (!vma->vm_file) /* anonymous vma */ return NULL; @@ -5736,7 +5699,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent, &ent); + page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; @@ -6391,7 +6354,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ff51edd6e992..f64ebb6226cb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -673,7 +675,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, #define hwpoison_hugetlb_range NULL #endif -static struct mm_walk_ops hwp_walk_ops = { +static const struct mm_walk_ops hwp_walk_ops = { .pmd_entry = hwpoison_pte_range, .hugetlb_entry = hwpoison_hugetlb_range, }; @@ -806,12 +808,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn, return ret; } +struct page_state { + unsigned long mask; + unsigned long res; + enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ + int (*action)(struct page_state *ps, struct page *p); +}; + +/* + * Return true if page is still referenced by others, otherwise return + * false. + * + * The extra_pins is true when one extra refcount is expected. + */ +static bool has_extra_refcount(struct page_state *ps, struct page *p, + bool extra_pins) +{ + int count = page_count(p) - 1; + + if (extra_pins) + count -= 1; + + if (count > 0) { + pr_err("Memory failure: %#lx: %s still referenced by %d users\n", + page_to_pfn(p), action_page_types[ps->type], count); + return true; + } + + return false; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we * could be more sophisticated. */ -static int me_kernel(struct page *p, unsigned long pfn) +static int me_kernel(struct page_state *ps, struct page *p) { unlock_page(p); return MF_IGNORED; @@ -820,9 +854,9 @@ static int me_kernel(struct page *p, unsigned long pfn) /* * Page in unknown state. Do nothing. */ -static int me_unknown(struct page *p, unsigned long pfn) +static int me_unknown(struct page_state *ps, struct page *p) { - pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p)); unlock_page(p); return MF_FAILED; } @@ -830,10 +864,11 @@ static int me_unknown(struct page *p, unsigned long pfn) /* * Clean (or cleaned) page cache page. 
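+ * (A clean page cache page can simply be dropped: the data still exists
+ * on the backing store and can be re-read, so truncating the page
+ * isolates the error without data loss.)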
*/ -static int me_pagecache_clean(struct page *p, unsigned long pfn) +static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -862,14 +897,24 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) goto out; } + /* + * A shmem page is kept in the page cache instead of being truncated, + * so it is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ - ret = truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); + return ret; } @@ -878,7 +923,7 @@ out: * Issues: when the error hit a hole page the error is not properly * propagated. */ -static int me_pagecache_dirty(struct page *p, unsigned long pfn) +static int me_pagecache_dirty(struct page_state *ps, struct page *p) { struct address_space *mapping = page_mapping(p); @@ -922,7 +967,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) mapping_set_error(mapping, -EIO); } - return me_pagecache_clean(p, pfn); + return me_pagecache_clean(ps, p); } /* @@ -944,9 +989,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * Clean swap cache pages can be directly isolated. A later page fault will * bring in the known good data from disk. */ -static int me_swapcache_dirty(struct page *p, unsigned long pfn) +static int me_swapcache_dirty(struct page_state *ps, struct page *p) { int ret; + bool extra_pins = false; ClearPageDirty(p); /* Trigger EIO in shmem: */ @@ -954,10 +1000,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; unlock_page(p); + + if (ret == MF_DELAYED) + extra_pins = true; + + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + return ret; } -static int me_swapcache_clean(struct page *p, unsigned long pfn) +static int me_swapcache_clean(struct page_state *ps, struct page *p) { int ret; @@ -965,6 +1018,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; unlock_page(p); + + if (has_extra_refcount(ps, p, false)) + ret = MF_FAILED; + return ret; } @@ -974,7 +1031,7 @@ * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd.
*/ -static int me_huge_page(struct page *p, unsigned long pfn) +static int me_huge_page(struct page_state *ps, struct page *p) { int res; struct page *hpage = compound_head(p); @@ -985,7 +1042,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { - res = truncate_error_page(hpage, pfn, mapping); + res = truncate_error_page(hpage, page_to_pfn(p), mapping); unlock_page(hpage); } else { res = MF_FAILED; @@ -1003,6 +1060,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) } } + if (has_extra_refcount(ps, p, false)) + res = MF_FAILED; + return res; } @@ -1028,14 +1088,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) -static struct page_state { - unsigned long mask; - unsigned long res; - enum mf_action_page_type type; - - /* Callback ->action() has to unlock the relevant page inside it. */ - int (*action)(struct page *p, unsigned long pfn); -} error_states[] = { +static struct page_state error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: @@ -1095,19 +1148,10 @@ static int page_action(struct page_state *ps, struct page *p, unsigned long pfn) { int result; - int count; /* page p should be unlocked after returning from ps->action(). */ - result = ps->action(p, pfn); + result = ps->action(ps, p); - count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == MF_DELAYED) - count--; - if (count > 0) { - pr_err("Memory failure: %#lx: %s still referenced by %d users\n", - pfn, action_page_types[ps->type], count); - result = MF_FAILED; - } action_result(pfn, ps->type, result); /* Could do more checks here if page looks ok */ @@ -1400,14 +1444,11 @@ static int identify_page_state(unsigned long pfn, struct page *p, static int try_to_split_thp_page(struct page *page, const char *msg) { lock_page(page); - if (!PageAnon(page) || unlikely(split_huge_page(page))) { + if (unlikely(split_huge_page(page))) { unsigned long pfn = page_to_pfn(page); unlock_page(page); - if (!PageAnon(page)) - pr_info("%s: %#lx: non anonymous thp\n", msg, pfn); - else - pr_info("%s: %#lx: thp split failed\n", msg, pfn); + pr_info("%s: %#lx: thp split failed\n", msg, pfn); put_page(page); return -EBUSY; } diff --git a/mm/memory.c b/mm/memory.c index bcc4b0727a63..8f1de811a1dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -433,35 +433,39 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) +{ + spinlock_t *ptl = pmd_lock(mm, pmd); + + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + mm_inc_nr_ptes(mm); + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_rmb() barriers in page table walking code. 
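+ *
+ * A sketch of the publish/consume pairing this relies on (illustrative,
+ * not extra code):
+ *
+ *	CPU 0 (pmd_install)		CPU 1 (lockless page table walk)
+ *	initialize pte page		pmd = *pmdp
+ *	smp_wmb()			loads of the pte page depend on pmd,
+ *	pmd_populate(mm, pmd, page)	so most CPUs order them for free;
+ *					alpha needs the smp_rmb() noted above.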
+ */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + pmd_populate(mm, pmd, *pte); + *pte = NULL; + } + spin_unlock(ptl); +} + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { - spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; - /* - * Ensure all pte setup (eg. pte page lock and page clearing) are - * visible before the pte is made visible to other CPUs by being - * put into page tables. - * - * The other side of the story is the pointer chasing in the page - * table walking code (when walking the page table without locking; - * ie. most of the time). Fortunately, these data accesses consist - * of a chain of data-dependent loads, meaning most CPUs (alpha - * being the notable exception) will already guarantee loads are - * seen in-order. See the alpha page table accessors for the - * smp_rmb() barriers in page table walking code. - */ - smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - - ptl = pmd_lock(mm, pmd); - if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm_inc_nr_ptes(mm); - pmd_populate(mm, pmd, new); - new = NULL; - } - spin_unlock(ptl); + pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; @@ -473,10 +477,9 @@ int __pte_alloc_kernel(pmd_t *pmd) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } @@ -1333,16 +1336,8 @@ again: struct page *page; page = vm_normal_page(vma, addr, ptent); - if (unlikely(details) && page) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping && - details->check_mapping != page_rmapping(page)) - continue; - } + if (unlikely(zap_skip_check_mapping(details, page))) + continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); @@ -1375,17 +1370,8 @@ again: is_device_exclusive_entry(entry)) { struct page *page = pfn_swap_entry_to_page(entry); - if (unlikely(details && details->check_mapping)) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping != - page_rmapping(page)) - continue; - } - + if (unlikely(zap_skip_check_mapping(details, page))) + continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; @@ -2724,19 +2710,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range); * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). 
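+ *
+ * (The recheck below only matters where a pte cannot be loaded with a
+ * single atomic access, i.e. when sizeof(pte_t) > sizeof(unsigned long),
+ * and only on SMP or preemptible kernels; hence the guards in the body.)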
*/ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) +static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); + spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(ptl); - same = pte_same(*page_table, orig_pte); + same = pte_same(*vmf->pte, vmf->orig_pte); spin_unlock(ptl); } #endif - pte_unmap(page_table); + pte_unmap(vmf->pte); + vmf->pte = NULL; return same; } @@ -3321,20 +3307,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, + pgoff_t first_index, + pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; - vma_interval_tree_foreach(vma, root, - details->first_index, details->last_index) { - + vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - zba = details->first_index; + zba = first_index; if (zba < vba) zba = vba; - zea = details->last_index; + zea = last_index; if (zea > vea) zea = vea; @@ -3360,18 +3346,22 @@ void unmap_mapping_page(struct page *page) { struct address_space *mapping = page->mapping; struct zap_details details = { }; + pgoff_t first_index; + pgoff_t last_index; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageTail(page)); - details.check_mapping = mapping; - details.first_index = page->index; - details.last_index = page->index + thp_nr_pages(page) - 1; + first_index = page->index; + last_index = page->index + thp_nr_pages(page) - 1; + + details.zap_mapping = mapping; details.single_page = page; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); i_mmap_unlock_write(mapping); } @@ -3391,16 +3381,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; + pgoff_t first_index = start; + pgoff_t last_index = start + nr - 1; - details.check_mapping = even_cows ? NULL : mapping; - details.first_index = start; - details.last_index = start + nr - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; + details.zap_mapping = even_cows ? 
NULL : mapping; + if (last_index < first_index) + last_index = ULONG_MAX; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); @@ -3488,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vm_fault_t ret = 0; void *shadow = NULL; - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); @@ -3853,7 +3844,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } ret = vma->vm_ops->fault(vmf); @@ -3924,7 +3914,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -4037,17 +4026,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { + if (vmf->prealloc_pte) + pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); + else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; - } } /* See comment in handle_pte_fault() */ @@ -4156,7 +4138,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); @@ -4831,13 +4812,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ + if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); - else + } else { + smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); + } spin_unlock(&mm->page_table_lock); return 0; } @@ -4854,11 +4835,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); @@ -4879,14 +4859,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); - } else /* Another has populated it */ + } else { /* Another has populated it */ pmd_free(mm, new); + } spin_unlock(ptl); return 0; } @@ -5423,7 +5403,6 @@ long copy_huge_page_from_user(struct page *dst_page, unsigned int pages_per_huge_page, bool allow_pagefault) { - void *src = (void *)usr_src; void *page_kaddr; unsigned long i, rc = 
0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; @@ -5436,8 +5415,7 @@ long copy_huge_page_from_user(struct page *dst_page, else page_kaddr = kmap_atomic(subpage); rc = copy_from_user(page_kaddr, - (const void __user *)(src + i * PAGE_SIZE), - PAGE_SIZE); + usr_src + i * PAGE_SIZE, PAGE_SIZE); if (allow_pagefault) kunmap(subpage); else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fd0be32a281..852041f6be41 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -57,7 +56,7 @@ enum { ONLINE_POLICY_AUTO_MOVABLE, }; -const char *online_policy_to_str[] = { +static const char * const online_policy_to_str[] = { [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", }; @@ -220,7 +219,6 @@ static void release_memory_resource(struct resource *res) kfree(res); } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, const char *reason) { @@ -586,10 +584,6 @@ void generic_online_page(struct page *page, unsigned int order) debug_pagealloc_map_pages(page, 1 << order); __free_pages_core(page, order); totalram_pages_add(1UL << order); -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages_add(1UL << order); -#endif } EXPORT_SYMBOL_GPL(generic_online_page); @@ -626,16 +620,11 @@ static void node_states_check_changes_online(unsigned long nr_pages, arg->status_change_nid = NUMA_NO_NODE; arg->status_change_nid_normal = NUMA_NO_NODE; - arg->status_change_nid_high = NUMA_NO_NODE; if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; -#ifdef CONFIG_HIGHMEM - if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) - arg->status_change_nid_high = nid; -#endif } static void node_states_set_node(int node, struct memory_notify *arg) @@ -643,9 +632,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_set_state(node, N_NORMAL_MEMORY); - if (arg->status_change_nid_high >= 0) - node_set_state(node, N_HIGH_MEMORY); - if (arg->status_change_nid >= 0) node_set_state(node, N_MEMORY); } @@ -1163,7 +1149,6 @@ failed_addition: mem_hotplug_done(); return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static void reset_node_present_pages(pg_data_t *pgdat) { @@ -1357,6 +1342,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + enum memblock_flags memblock_flags = MEMBLOCK_NONE; struct vmem_altmap mhp_altmap = {}; struct memory_group *group = NULL; u64 start, size; @@ -1384,8 +1370,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) mem_hotplug_begin(); - if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) + memblock_flags = MEMBLOCK_DRIVER_MANAGED; + ret = memblock_add_node(start, size, nid, memblock_flags); + if (ret) + goto error_mem_hotplug_end; + } ret = __try_online_node(nid, false); if (ret < 0) @@ -1458,6 +1449,7 @@ error: rollback_node_hotadd(nid); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); +error_mem_hotplug_end: mem_hotplug_done(); return ret; } @@ -1803,7 +1795,6 @@ 
static void node_states_check_changes_offline(unsigned long nr_pages, arg->status_change_nid = NUMA_NO_NODE; arg->status_change_nid_normal = NUMA_NO_NODE; - arg->status_change_nid_high = NUMA_NO_NODE; /* * Check whether node_states[N_NORMAL_MEMORY] will be changed. @@ -1818,24 +1809,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages, if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); -#ifdef CONFIG_HIGHMEM /* - * node_states[N_HIGH_MEMORY] contains nodes which - * have normal memory or high memory. - * Here we add the present_pages belonging to ZONE_HIGHMEM. - * If the zone is within the range of [0..ZONE_HIGHMEM), and - * we determine that the zones in that range become empty, - * we need to clear the node for N_HIGH_MEMORY. - */ - present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; - if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) - arg->status_change_nid_high = zone_to_nid(zone); -#endif - - /* - * We have accounted the pages from [0..ZONE_NORMAL), and - * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM - * as well. + * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM + * does not apply as we don't support 32bit. * Here we count the possible pages from ZONE_MOVABLE. * If after having accounted all the pages, we see that the nr_pages * to be offlined is over or equal to the accounted pages, @@ -1853,9 +1829,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg) if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if (arg->status_change_nid_high >= 0) - node_clear_state(node, N_HIGH_MEMORY); - if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } @@ -2204,7 +2177,7 @@ static int __ref try_remove_memory(u64 start, u64 size) arch_remove_memory(start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { - memblock_free(start, size); + memblock_phys_free(start, size); memblock_remove(start, size); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f4b4be7af4d3..10e9c87260ed 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2206,6 +2206,88 @@ struct folio *folio_alloc(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(folio_alloc); +static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + int nodes; + unsigned long nr_pages_per_node; + int delta; + int i; + unsigned long nr_allocated; + unsigned long total_allocated = 0; + + nodes = nodes_weight(pol->nodes); + nr_pages_per_node = nr_pages / nodes; + delta = nr_pages - nodes * nr_pages_per_node; + + for (i = 0; i < nodes; i++) { + if (delta) { + nr_allocated = __alloc_pages_bulk(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node + 1, NULL, + page_array); + delta--; + } else { + nr_allocated = __alloc_pages_bulk(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node, NULL, page_array); + } + + page_array += nr_allocated; + total_allocated += nr_allocated; + } + + return total_allocated; +} + +static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + gfp_t preferred_gfp; + unsigned long nr_allocated = 0; + + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + + nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_pages, NULL, page_array); + + if (nr_allocated < nr_pages) + 
nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_pages - nr_allocated, NULL, + page_array + nr_allocated); + return nr_allocated; +} + +/* + * Allocating pages in bulk and applying the mempolicy must be considered + * at the same time in some situations, such as vmalloc. + * + * This can accelerate memory allocation, especially for interleaved + * allocations (e.g. a 10-page bulk request over 4 interleave nodes is + * served by four bulk calls of 3, 3, 2 and 2 pages). + */ +unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + unsigned long nr_pages, struct page **page_array) +{ + struct mempolicy *pol = &default_policy; + + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + if (pol->mode == MPOL_INTERLEAVE) + return alloc_pages_bulk_array_interleave(gfp, pol, + nr_pages, page_array); + + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_bulk_array_preferred_many(gfp, + numa_node_id(), pol, nr_pages, page_array); + + return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol), nr_pages, NULL, + page_array); +} + int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(vma_policy(src)); @@ -2985,64 +3067,3 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } - -bool numa_demotion_enabled = false; - -#ifdef CONFIG_SYSFS -static ssize_t numa_demotion_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%s\n", - numa_demotion_enabled? "true" : "false"); -} - -static ssize_t numa_demotion_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - numa_demotion_enabled = true; - else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - numa_demotion_enabled = false; - else - return -EINVAL; - - return count; -} - -static struct kobj_attribute numa_demotion_enabled_attr = - __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, - numa_demotion_enabled_store); - -static struct attribute *numa_attrs[] = { - &numa_demotion_enabled_attr.attr, - NULL, -}; - -static const struct attribute_group numa_attr_group = { - .attrs = numa_attrs, -}; - -static int __init numa_init_sysfs(void) -{ - int err; - struct kobject *numa_kobj; - - numa_kobj = kobject_create_and_add("numa", mm_kobj); - if (!numa_kobj) { - pr_err("failed to create numa kobject\n"); - return -ENOMEM; - } - err = sysfs_create_group(numa_kobj, &numa_attr_group); - if (err) { - pr_err("failed to register numa group\n"); - goto delete_obj; - } - return 0; - -delete_obj: - kobject_put(numa_kobj); - return err; -} -subsys_initcall(numa_init_sysfs); -#endif diff --git a/mm/migrate.c b/mm/migrate.c index efa9941ebe03..a11e948593df 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -3305,3 +3305,64 @@ static int __init migrate_on_reclaim_init(void) } late_initcall(migrate_on_reclaim_init); #endif /* CONFIG_HOTPLUG_CPU */ + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled ?
"true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + numa_demotion_enabled = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + numa_demotion_enabled = false; + else + return -EINVAL; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif diff --git a/mm/mmap.c b/mm/mmap.c index 88dcc5c25225..b22a07f5e761 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3332,7 +3332,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { - mm->total_vm += npages; + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; diff --git a/mm/mprotect.c b/mm/mprotect.c index 883e2cc85cad..e552f5e0ccbd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - prev = vma->vm_prev; + if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; @@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } } + if (start > vma->vm_start) prev = vma; + else + prev = vma->vm_prev; for (nstart = start ; ; ) { unsigned long mask_off_old_flags; diff --git a/mm/mremap.c b/mm/mremap.c index badfe17ade1f..002eec83e91e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); + if (is_vm_hugetlb_page(vma)) + return move_hugetlb_page_tables(vma, new_vma, old_addr, + new_addr, len); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); @@ -565,6 +569,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, bool *locked, unsigned long flags, struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap) { + long to_account = new_len - old_len; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; unsigned long vm_flags = vma->vm_flags; @@ -583,6 +588,9 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (unlikely(flags & MREMAP_DONTUNMAP)) + to_account = new_len; + if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); @@ -604,8 +612,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { - if 
(security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + if (vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT)) return -ENOMEM; } @@ -613,8 +621,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); if (!new_vma) { - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) - vm_unacct_memory(new_len >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT) + vm_unacct_memory(to_account >> PAGE_SHIFT); return -ENOMEM; } @@ -642,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, mremap_userfaultfd_prep(new_vma, uf); } + if (is_vm_hugetlb_page(vma)) { + clear_vma_resv_huge_pages(vma); + } + /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; @@ -708,8 +720,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } static struct vm_area_struct *vma_to_resize(unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long flags, - unsigned long *p) + unsigned long old_len, unsigned long new_len, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -736,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return ERR_PTR(-EINVAL); - if (is_vm_hugetlb_page(vma)) - return ERR_PTR(-EINVAL); - /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) return ERR_PTR(-EFAULT); @@ -768,13 +776,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); - if (vma->vm_flags & VM_ACCOUNT) { - unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return ERR_PTR(-ENOMEM); - *p = charged; - } - return vma; } @@ -787,7 +788,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; unsigned long map_flags = 0; if (offset_in_page(new_addr)) @@ -830,7 +830,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -853,7 +853,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); if (IS_ERR_VALUE(ret)) - goto out1; + goto out; /* We got a new mapping */ if (!(flags & MREMAP_FIXED)) @@ -862,12 +862,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, uf_unmap); - if (!(offset_in_page(ret))) - goto out; - -out1: - vm_unacct_memory(charged); - out: return ret; } @@ -899,7 +893,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; bool locked = false; bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; @@ -949,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (mmap_write_lock_killable(current->mm)) return -EINTR; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) { + ret = -EFAULT; + 
goto out; + } + + if (is_vm_hugetlb_page(vma)) { + struct hstate *h __maybe_unused = hstate_vma(vma); + + old_len = ALIGN(old_len, huge_page_size(h)); + new_len = ALIGN(new_len, huge_page_size(h)); + + /* addrs must be huge page aligned */ + if (addr & ~huge_page_mask(h)) + goto out; + if (new_addr & ~huge_page_mask(h)) + goto out; + + /* + * Don't allow remap expansion, because the underlying hugetlb + * reservation is not yet capable of handling split reservations. + */ + if (new_len > old_len) + goto out; + } if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { ret = mremap_to(addr, old_len, new_addr, new_len, @@ -981,7 +999,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -992,10 +1010,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + long pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma->vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, pages)) { + ret = -ENOMEM; + goto out; + } + } if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { + vm_unacct_memory(pages); ret = -ENOMEM; goto out; } @@ -1034,10 +1060,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, &locked, flags, &uf, &uf_unmap); } out: - if (offset_in_page(ret)) { - vm_unacct_memory(charged); + if (offset_in_page(ret)) locked = false; - } if (downgraded) mmap_read_unlock(current->mm); else diff --git a/mm/nommu.c b/mm/nommu.c index 41ef204e7482..55a9e48a7a02 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1638,12 +1638,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - return -ENOMEM; -} - vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 50b984d048ce..195b3661da3d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -641,6 +641,8 @@ done: static int oom_reaper(void *unused) { + set_freezable(); + while (true) { struct task_struct *tsk = NULL; @@ -1120,27 +1122,24 @@ bool out_of_memory(struct oom_control *oc) } /* - * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If oom_lock is held by somebody else, a parallel oom - * killing is already in progress so do nothing. + * The pagefault handler calls here because some allocation has failed. We have + * to take care of the memcg OOM here because this is the only safe context without + * any locks held but let the oom killer triggered from the allocation context care + * about the global OOM. */ void pagefault_out_of_memory(void) { - struct oom_control oc = { - .zonelist = NULL, - .nodemask = NULL, - .memcg = NULL, - .gfp_mask = 0, - .order = 0, - }; + static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (mem_cgroup_oom_synchronize(true)) return; - if (!mutex_trylock(&oom_lock)) + if (fatal_signal_pending(current)) return; - out_of_memory(&oc); - mutex_unlock(&oom_lock); + + if (__ratelimit(&pfoom_rs)) + pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. 
Retrying PF\n"); } SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9c64490171e0..2d498bb62248 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2366,8 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) ret = generic_writepages(mapping, wbc); if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) break; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + + /* + * Lacking an allocation context or the locality or writeback + * state of any of the inode's pages, throttle based on + * writeback activity on the local node. It's as good a + * guess as any. + */ + reclaim_throttle(NODE_DATA(numa_node_id()), + VMSCAN_THROTTLE_WRITEBACK); } /* * Usually few pages are written by now from those we've just submitted diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fee18ada46a2..c5952749ad40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -677,10 +677,8 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (order > PAGE_ALLOC_COSTLY_ORDER) order = pageblock_order; - VM_BUG_ON(order != pageblock_order); - } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif @@ -1430,14 +1428,8 @@ static inline void prefetch_buddy(struct page *page) /* * Frees a number of pages from the PCP lists - * Assumes all pages on list are in same zone, and of same order. + * Assumes all pages on list are in same zone. * count is the number of pages to free. - * - * If the zone was previously in an "all pages pinned" state then look to - * see if this freeing clears that state. - * - * And clear the zone's pages_scanned counter, to hold off the "all pages are - * pinned" detection logic. */ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) @@ -1591,7 +1583,7 @@ static void __meminit init_reserved_page(unsigned long pfn) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct zone *zone = &pgdat->node_zones[zid]; - if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + if (zone_spans_pfn(zone, pfn)) break; } __init_single_page(pfn_to_page(pfn), pfn, zid, nid); @@ -3149,9 +3141,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is alright but we also have to make sure to not move to * a different one. 
*/ - preempt_disable(); + migrate_disable(); drain_local_pages(drain->zone); - preempt_enable(); + migrate_enable(); } /* @@ -3968,6 +3960,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= @@ -4797,30 +4791,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { - /* - * If we didn't make any progress and have a lot of - * dirty + writeback pages then we should wait for - * an IO to complete to slow down the reclaim and - * prevent from pre mature OOM - */ - if (!did_some_progress) { - unsigned long write_pending; - - write_pending = zone_page_state_snapshot(zone, - NR_ZONE_WRITE_PENDING); - - if (2 * write_pending > reclaimable) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - return true; - } - } - ret = true; - goto out; + break; } } -out: /* * Memory allocation/reclaim might be called from a WQ context and the * current implementation of the WQ concurrency control doesn't @@ -4916,6 +4891,19 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; + /* + * Check for insane configurations where the cpuset doesn't contain + * any suitable zone to satisfy the request - e.g. non-movable + * GFP_HIGHUSER allocations from MOVABLE nodes only. + */ + if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { + struct zoneref *z = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, + &cpuset_current_mems_allowed); + if (!z->zone) + goto nopage; + } + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -5630,8 +5618,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); unsigned long addr; - if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) - gfp_mask &= ~__GFP_COMP; + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); @@ -5655,8 +5643,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); struct page *p; - if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) - gfp_mask &= ~__GFP_COMP; + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); p = alloc_pages_node(nid, gfp_mask, order); if (!p) @@ -5998,6 +5986,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "%s" " free:%lukB" + " boost:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" @@ -6018,6 +6007,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->watermark_boost), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -6273,7 +6263,7 @@ static void build_zonelists(pg_data_t *pgdat) */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] = load; + node_load[node] += load; node_order[nr_nodes++] = node; prev_node = node; @@ -6282,6 +6272,10 @@ static void build_zonelists(pg_data_t *pgdat) build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); + pr_info("Fallback order for Node %d: ", local_node); + for (node = 0; node < nr_nodes; node++) + pr_cont("%d ", 
node_order[node]); + pr_cont("\n"); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES @@ -7407,6 +7401,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { + int i; + pgdat_resize_init(pgdat); pgdat_init_split_queue(pgdat); @@ -7415,6 +7411,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); + for (i = 0; i < NR_VMSCAN_THROTTLE; i++) + init_waitqueue_head(&pgdat->reclaim_wait[i]); + pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); } @@ -8144,8 +8143,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char } if (pages && s) - pr_info("Freeing %s memory: %ldK\n", - s, pages << (PAGE_SHIFT - 10)); + pr_info("Freeing %s memory: %ldK\n", s, K(pages)); return pages; } @@ -8190,14 +8188,13 @@ void __init mem_init_print_info(void) ", %luK highmem" #endif ")\n", - nr_free_pages() << (PAGE_SHIFT - 10), - physpages << (PAGE_SHIFT - 10), + K(nr_free_pages()), K(physpages), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10) + K(physpages - totalram_pages() - totalcma_pages), + K(totalcma_pages) #ifdef CONFIG_HIGHMEM - , totalhigh_pages() << (PAGE_SHIFT - 10) + , K(totalhigh_pages()) #endif ); } @@ -8470,7 +8467,7 @@ void setup_per_zone_wmarks(void) * 8192MB: 11584k * 16384MB: 16384k */ -int __meminit init_per_zone_wmark_min(void) +void calculate_min_free_kbytes(void) { unsigned long lowmem_kbytes; int new_min_free_kbytes; @@ -8478,16 +8475,17 @@ int __meminit init_per_zone_wmark_min(void) lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (new_min_free_kbytes > user_min_free_kbytes) { - min_free_kbytes = new_min_free_kbytes; - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 262144) - min_free_kbytes = 262144; - } else { + if (new_min_free_kbytes > user_min_free_kbytes) + min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); + else pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); - } + +} + +int __meminit init_per_zone_wmark_min(void) +{ + calculate_min_free_kbytes(); setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -8774,7 +8772,8 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; - huge = is_vm_area_hugepages(table); + if (table) + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -9371,21 +9370,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif +/* + * This function returns a stable result only if called under zone lock. 
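+ * Lockless callers get a best-effort answer only: PageBuddy() and the + * order read via buddy_order_unsafe() can race with merging and + * splitting, so a stale result must be tolerated.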
+ */ bool is_free_buddy_page(struct page *page) { - struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); - unsigned long flags; unsigned int order; - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && buddy_order(page_head) >= order) + if (PageBuddy(page_head) && + buddy_order_unsafe(page_head) >= order) break; } - spin_unlock_irqrestore(&zone->lock, flags); return order < MAX_ORDER; } diff --git a/mm/page_ext.c b/mm/page_ext.c index 2a52fd9ed464..6242afb24d84 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -201,7 +201,7 @@ fail: panic("Out of memory"); } -#else /* CONFIG_FLATMEM */ +#else /* CONFIG_SPARSEMEM */ struct page_ext *lookup_page_ext(const struct page *page) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a95c2c6562d0..f67c4c70f17f 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) buddy = page + (buddy_pfn - pfn); if (!is_migrate_isolate_page(buddy)) { - __isolate_free_page(page, order); - isolated_page = true; + isolated_page = !!__isolate_free_page(page, order); + /* + * Isolating a free page in an isolated pageblock + * is expected to always work as watermarks don't + * apply here. + */ + VM_WARN_ON(!isolated_page); } } } @@ -183,7 +188,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, int flags) { unsigned long pfn; - unsigned long undo_pfn; struct page *page; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); @@ -193,25 +197,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page) { - if (set_migratetype_isolate(page, migratetype, flags)) { - undo_pfn = pfn; - goto undo; - } + if (page && set_migratetype_isolate(page, migratetype, flags)) { + undo_isolate_page_range(start_pfn, pfn, migratetype); + return -EBUSY; } } return 0; -undo: - for (pfn = start_pfn; - pfn < undo_pfn; - pfn += pageblock_nr_pages) { - struct page *page = pfn_to_online_page(pfn); - if (!page) - continue; - unset_migratetype_isolate(page, migratetype); - } - - return -EBUSY; } /* diff --git a/mm/percpu.c b/mm/percpu.c index e0a986818903..f5b2c2ea5a54 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - memblock_free_early(__pa(ai), ai->__ai_size); + memblock_free(ai, ai->__ai_size); } /** @@ -3134,7 +3134,7 @@ out_free_areas: out_free: pcpu_free_alloc_info(ai); if (areas) - memblock_free_early(__pa(areas), areas_size); + memblock_free(areas, areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -3256,7 +3256,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - memblock_free_early(__pa(pages), pages_size); + memblock_free(pages, pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -3286,7 +3286,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(ptr, size); } void __init setup_per_cpu_areas(void) diff --git a/mm/readahead.c b/mm/readahead.c index e71e719e36c9..6ae5693de28c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ 
-308,7 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl, * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra - * 1-8 page = 32k initial, > 8 page = 128k initial + * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { diff --git a/mm/rmap.c b/mm/rmap.c index 3a1059c284c3..163ac4e6bcee 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1807,6 +1807,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (is_zone_device_page(page)) { + unsigned long pfn = page_to_pfn(page); swp_entry_t entry; pte_t swp_pte; @@ -1815,8 +1816,11 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - entry = make_readable_migration_entry( - page_to_pfn(page)); + entry = pte_to_swp_entry(pteval); + if (is_writable_device_private_entry(entry)) + entry = make_writable_migration_entry(pfn); + else + entry = make_readable_migration_entry(pfn); swp_pte = swp_entry_to_pte(entry); /* diff --git a/mm/shmem.c b/mm/shmem.c index 17e344e26e73..23c91a8beb78 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -855,9 +855,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ - return shmem_partial_swap_usage(mapping, - linear_page_index(vma, vma->vm_start), - linear_page_index(vma, vma->vm_end)); + return shmem_partial_swap_usage(mapping, vma->vm_pgoff, + vma->vm_pgoff + vma_pages(vma)); } /* @@ -2426,7 +2425,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - SetPageDirty(page); unlock_page(page); return 0; out_delete_from_cache: @@ -2458,6 +2456,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2468,7 +2467,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (*pagep && PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + ret = -EIO; + } + + return ret; } static int @@ -2555,6 +2562,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -3116,7 +3129,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3124,6 +3138,11 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (page && PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3774,6 +3793,13 @@ static void shmem_destroy_inodecache(void) 
kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@ -3784,7 +3810,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -4195,6 +4221,10 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, page = ERR_PTR(error); else unlock_page(page); + + if (PageHWPoison(page)) + page = ERR_PTR(-EIO); + return page; #else /* diff --git a/mm/slab.c b/mm/slab.c index 874b3f8fe80d..da132a9ae6f8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3900,8 +3900,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (err) goto end; - if (limit && shared && batchcount) - goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -3944,7 +3942,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) limit = 32; #endif batchcount = (limit + 1) / 2; -skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); end: if (err) @@ -4207,19 +4204,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, n <= cachep->useroffset - offset + cachep->usersize) return; - /* - * If the copy is still within the allocated object, produce - * a warning instead of rejecting the copy. This is intended - * to be a temporary method to find any missing usercopy - * whitelists. 
- */ - if (usercopy_fallback && - offset <= cachep->object_size && - n <= cachep->object_size - offset) { - usercopy_warn("SLAB object", cachep->name, to_user, offset, n); - return; - } - usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ diff --git a/mm/slab_common.c b/mm/slab_common.c index ec2bb0beed75..e5d080a93009 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,14 +37,6 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; -#ifdef CONFIG_HARDENED_USERCOPY -bool usercopy_fallback __ro_after_init = - IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK); -module_param(usercopy_fallback, bool, 0400); -MODULE_PARM_DESC(usercopy_fallback, - "WARN instead of reject usercopy whitelist violations"); -#endif - static LIST_HEAD(slab_caches_to_rcu_destroy); static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); static DECLARE_WORK(slab_caches_to_rcu_destroy_work, diff --git a/mm/slub.c b/mm/slub.c index e87fd492a65b..f7368bfffb7a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -354,7 +354,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + prefetchw(object + s->offset); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) @@ -414,6 +414,29 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ + unsigned int nr_pages; + + s->cpu_partial = nr_objects; + + /* + * We take the number of objects but actually limit the number of + * pages on the per cpu partial list, in order to limit excessive + * growth of the list. For simplicity we assume that the pages will + * be half-full. + */ + nr_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); + s->cpu_partial_pages = nr_pages; +} +#else +static inline void +slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ +} +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + /* * Per slab locking using the pagelock */ @@ -2052,7 +2075,7 @@ static inline void remove_partial(struct kmem_cache_node *n, */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, - int mode, int *objects) + int mode) { void *freelist; unsigned long counters; @@ -2068,7 +2091,6 @@ static inline void *acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; - *objects = new.objects - new.inuse; if (mode) { new.inuse = page->objects; new.freelist = NULL; @@ -2106,9 +2128,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, { struct page *page, *page2; void *object = NULL; - unsigned int available = 0; unsigned long flags; - int objects; + unsigned int partial_pages = 0; /* * Racy check. 
If we mistakenly see no partial slabs then we @@ -2126,11 +2147,10 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(page, gfpflags)) continue; - t = acquire_slab(s, n, page, object == NULL, &objects); + t = acquire_slab(s, n, page, object == NULL); if (!t) break; - available += objects; if (!object) { *ret_page = page; stat(s, ALLOC_FROM_PARTIAL); @@ -2138,10 +2158,15 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, } else { put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); + partial_pages++; } +#ifdef CONFIG_SLUB_CPU_PARTIAL if (!kmem_cache_has_cpu_partial(s) - || available > slub_cpu_partial(s) / 2) + || partial_pages > s->cpu_partial_pages / 2) break; +#else + break; +#endif } spin_unlock_irqrestore(&n->list_lock, flags); @@ -2546,14 +2571,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) struct page *page_to_unfreeze = NULL; unsigned long flags; int pages = 0; - int pobjects = 0; local_lock_irqsave(&s->cpu_slab->lock, flags); oldpage = this_cpu_read(s->cpu_slab->partial); if (oldpage) { - if (drain && oldpage->pobjects > slub_cpu_partial(s)) { + if (drain && oldpage->pages >= s->cpu_partial_pages) { /* * Partial array is full. Move the existing set to the * per node partial list. Postpone the actual unfreezing @@ -2562,16 +2586,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page_to_unfreeze = oldpage; oldpage = NULL; } else { - pobjects = oldpage->pobjects; pages = oldpage->pages; } } pages++; - pobjects += page->objects - page->inuse; page->pages = pages; - page->pobjects = pobjects; page->next = oldpage; this_cpu_write(s->cpu_slab->partial, page); @@ -3522,7 +3543,9 @@ static inline void free_nonslab_page(struct page *page, void *object) { unsigned int order = compound_order(page); - VM_BUG_ON_PAGE(!PageCompound(page), page); + if (WARN_ON_ONCE(!PageCompound(page))) + pr_warn_once("object pointer: 0x%p\n", object); + kfree_hook(object); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); __free_pages(page, order); @@ -3989,6 +4012,8 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static void set_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL + unsigned int nr_objects; + /* * cpu_partial determined the maximum number of objects kept in the * per cpu partial lists of a processor. @@ -3998,24 +4023,22 @@ static void set_cpu_partial(struct kmem_cache *s) * filled up again with minimal effort. The slab will never hit the * per node partial lists and therefore no locking will be required. * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. 
+ * For backwards compatibility reasons, this is determined as a number + * of objects, even though we now limit the maximum number of pages; see + * slub_set_cpu_partial() */ if (!kmem_cache_has_cpu_partial(s)) - slub_set_cpu_partial(s, 0); + nr_objects = 0; else if (s->size >= PAGE_SIZE) - slub_set_cpu_partial(s, 2); + nr_objects = 6; else if (s->size >= 1024) - slub_set_cpu_partial(s, 6); + nr_objects = 24; else if (s->size >= 256) - slub_set_cpu_partial(s, 13); + nr_objects = 52; else - slub_set_cpu_partial(s, 30); + nr_objects = 120; + + slub_set_cpu_partial(s, nr_objects); #endif } @@ -4466,7 +4489,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, { struct kmem_cache *s; unsigned int offset; - size_t object_size; bool is_kfence = is_kfence_address(ptr); ptr = kasan_reset_tag(ptr); @@ -4499,19 +4521,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, n <= s->useroffset - offset + s->usersize) return; - /* - * If the copy is still within the allocated object, produce - * a warning instead of rejecting the copy. This is intended - * to be a temporary method to find any missing usercopy - * whitelists. - */ - object_size = slab_ksize(s); - if (usercopy_fallback && - offset <= object_size && n <= object_size - offset) { - usercopy_warn("SLUB object", s->name, to_user, offset, n); - return; - } - usercopy_abort("SLUB object", s->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ @@ -5390,7 +5399,12 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sysfs_emit(buf, "%u\n", slub_cpu_partial(s)); + unsigned int nr_partial = 0; +#ifdef CONFIG_SLUB_CPU_PARTIAL + nr_partial = s->cpu_partial; +#endif + + return sysfs_emit(buf, "%u\n", nr_partial); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -5461,12 +5475,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (page) { + if (page) pages += page->pages; - objects += page->pobjects; - } } + /* Approximate half-full pages, see slub_set_cpu_partial() */ objects = (pages * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages); #ifdef CONFIG_SMP @@ -5474,9 +5488,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) struct page *page; page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (page) + if (page) { + pages = READ_ONCE(page->pages); + objects = (pages * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, page->pobjects, page->pages); + cpu, objects, pages); + } } #endif len += sysfs_emit_at(buf, len, "\n"); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bdce883f9286..db6df27c852a 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ + /* Make pte visible before pmd. See comment in pmd_install(). 
*/ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); diff --git a/mm/sparse.c b/mm/sparse.c index 120bc8ea5293..e5c84b0cf0c9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata; static inline void __meminit sparse_buffer_free(unsigned long size) { WARN_ON(!sparsemap_buf || size == 0); - memblock_free_early(__pa(sparsemap_buf), size); + memblock_free(sparsemap_buf, size); } static void __init sparse_buffer_init(unsigned long size, int nid) diff --git a/mm/swap.c b/mm/swap.c index 8ff9ba7cf2de..1841c24682f8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -135,18 +135,27 @@ EXPORT_SYMBOL(__put_page); * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * - * Release a list of pages which are strung together on page.lru. Currently - * used by read_cache_pages() and related error recovery code. + * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { - while (!list_empty(pages)) { - struct page *victim; + struct page *page, *next; - victim = lru_to_page(pages); - list_del(&victim->lru); - put_page(victim); + list_for_each_entry_safe(page, next, pages, lru) { + if (!put_page_testzero(page)) { + list_del(&page->lru); + continue; + } + if (PageHead(page)) { + list_del(&page->lru); + __put_compound_page(page); + continue; + } + /* Cannot be PageLRU because it's passed to us using the lru */ + __ClearPageWaiters(page); } + + free_unref_page_list(pages); } EXPORT_SYMBOL(put_pages_list); diff --git a/mm/swapfile.c b/mm/swapfile.c index 41c9e92f1f00..e59e08ef46e1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2763,7 +2763,7 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; - unsigned int bytes, inuse; + unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); @@ -2775,7 +2775,7 @@ static int swap_show(struct seq_file *swap, void *v) file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", + seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? 
"partition" : "file\t", @@ -3118,7 +3118,7 @@ static bool swap_discardable(struct swap_info_struct *si) { struct request_queue *q = bdev_get_queue(si->bdev); - if (!q || !blk_queue_discard(q)) + if (!blk_queue_discard(q)) return false; return true; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 36e5f6ab976f..0780c2a57ff1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, pgoff_t offset, max_off; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable || !page_in_cache) - _dst_pte = pte_mkdirty(_dst_pte); if (writable) { if (wp_copy) _dst_pte = pte_mkuffd_wp(_dst_pte); @@ -233,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8a807c78110..d2a00ad4e1dd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1195,18 +1195,14 @@ find_vmap_lowest_match(unsigned long size, { struct vmap_area *va; struct rb_node *node; - unsigned long length; /* Start from the root. */ node = free_vmap_area_root.rb_node; - /* Adjust the search size for alignment overhead. */ - length = size + align - 1; - while (node) { va = rb_entry(node, struct vmap_area, rb_node); - if (get_subtree_max_size(node->rb_left) >= length && + if (get_subtree_max_size(node->rb_left) >= size && vstart < va->va_start) { node = node->rb_left; } else { @@ -1216,9 +1212,9 @@ find_vmap_lowest_match(unsigned long size, /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is - * equal or bigger to the requested search length. + * equal or bigger to the requested search size. */ - if (get_subtree_max_size(node->rb_right) >= length) { + if (get_subtree_max_size(node->rb_right) >= size) { node = node->rb_right; continue; } @@ -1226,15 +1222,23 @@ find_vmap_lowest_match(unsigned long size, /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen - * only once due to "vstart" restriction. + * due to "vstart" restriction or an alignment overhead + * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; - if (get_subtree_max_size(node->rb_right) >= length && + if (get_subtree_max_size(node->rb_right) >= size && vstart <= va->va_start) { + /* + * Shift the vstart forward. Please note, we update it with + * parent's start address adding "1" because we do not want + * to enter same sub-tree after it has already been checked + * and no suitable free block found there. 
+ */ + vstart = va->va_start + 1; node = node->rb_right; break; } @@ -1265,7 +1269,7 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size) +find_vmap_lowest_match_check(unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1274,8 +1278,8 @@ find_vmap_lowest_match_check(unsigned long size) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, 1, vstart); - va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + va_1 = find_vmap_lowest_match(size, align, vstart); + va_2 = find_vmap_lowest_linear_match(size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1454,7 +1458,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size); + find_vmap_lowest_match_check(size, align); #endif return nva_start_addr; @@ -2272,15 +2276,22 @@ void __init vm_area_add_early(struct vm_struct *vm) */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { - static size_t vm_init_off __initdata; - unsigned long addr; + unsigned long addr = ALIGN(VMALLOC_START, align); + struct vm_struct *cur, **p; - addr = ALIGN(VMALLOC_START + vm_init_off, align); - vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + BUG_ON(vmap_initialized); + for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { + if ((unsigned long)cur->addr - addr >= vm->size) + break; + addr = ALIGN((unsigned long)cur->addr + cur->size, align); + } + + BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; - - vm_area_add_early(vm); + vm->next = *p; + *p = vm; + kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } static void vmap_init_free_space(void) @@ -2743,6 +2754,13 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + /* + * Your top guard is someone else's bottom guard. Not having a top + * guard compromises someone else's mappings too. + */ + if (WARN_ON_ONCE(flags & VM_NO_GUARD)) + flags &= ~VM_NO_GUARD; + if (count > totalram_pages()) return NULL; @@ -2825,7 +2843,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * to fails, fallback to a single page allocator that is * more permissive. */ - if (!order && nid != NUMA_NO_NODE) { + if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2837,8 +2855,20 @@ vm_area_alloc_pages(gfp_t gfp, int nid, */ nr_pages_request = min(100U, nr_pages - nr_allocated); - nr = alloc_pages_bulk_array_node(gfp, nid, - nr_pages_request, pages + nr_allocated); + /* Memory allocation should honour the mempolicy: we can't + * blindly use the nearest node when nid == NUMA_NO_NODE, + * otherwise memory may be allocated on only one node even + * though the mempolicy wants it interleaved. + */ + if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) + nr = alloc_pages_bulk_array_mempolicy(gfp, + nr_pages_request, + pages + nr_allocated); + else + nr = alloc_pages_bulk_array_node(gfp, nid, + nr_pages_request, + pages + nr_allocated); nr_allocated += nr; cond_resched(); @@ -2850,7 +2880,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else if (order) + } else /* * Compound pages required for remap_vmalloc_page if * high-order pages. */ @@ -2860,6 +2890,9 @@ /* High-order pages or fallback path if "bulk" fails. 
*/ while (nr_allocated < nr_pages) { + if (fatal_signal_pending(current)) + break; + if (nid == NUMA_NO_NODE) page = alloc_pages(gfp, order); else @@ -2887,6 +2920,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t orig_gfp_mask = gfp_mask; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; @@ -2907,7 +2941,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); @@ -2927,7 +2961,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via __vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; @@ -2935,7 +2969,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (vmap_pages_range(addr, addr + size, prot, area->pages, page_shift) < 0) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; @@ -2961,8 +2995,16 @@ fail: * @caller: caller's return address * * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * allocator with @gfp_mask flags. Please note that the full set of gfp + * flags is not supported. GFP_KERNEL is the preferred allocation mode, + * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not + * supported. From the reclaim modifiers, __GFP_DIRECT_RECLAIM is required (aka + * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka + * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). + * __GFP_NOWARN can be used to suppress error messages about failures. + * + * Map them into contiguous kernel virtual space, using a pagetable + * protection of @prot. * * Return: the address of the area or %NULL on failure */ @@ -3856,6 +3898,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; + unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; @@ -3867,9 +3910,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) memset(counters, 0, nr_node_ids * sizeof(unsigned int)); - for (nr = 0; nr < v->nr_pages; nr++) - counters[page_to_nid(v->pages[nr])]++; - + for (nr = 0; nr < v->nr_pages; nr += step) + counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); @@ -3905,7 +3947,7 @@ static int s_show(struct seq_file *m, void *p) (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); - return 0; + goto final; } v = va->vm; @@ -3946,6 +3988,7 @@ static int s_show(struct seq_file *m, void *p) /* * As a final step, dump "unpurged" areas. 
*/ +final: if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 76518e4166dc..b52644771cc4 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -308,7 +308,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, * asserted for a second in which subsequent * pressure events can occur. */ - memcg->socket_pressure = jiffies + HZ; + WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); } } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 71f178f85f5b..ef4a6dc7f000 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1021,6 +1021,91 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) +{ + wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; + long timeout, ret; + DEFINE_WAIT(wait); + + /* + * Do not throttle IO workers, kthreads other than kswapd or + * workqueues. They may be required for reclaim to make + * forward progress (e.g. journalling workqueues or kthreads). + */ + if (!current_is_kswapd() && + current->flags & (PF_IO_WORKER|PF_KTHREAD)) + return; + + /* + * These figures are pulled out of thin air. + * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many + * parallel reclaimers which is a short-lived event so the timeout is + * short. Failing to make progress or waiting on writeback are + * potentially long-lived events so use a longer timeout. This is shaky + * logic as a failure to make progress could be due to anything from + * writeback to a slow device to excessively referenced pages at the tail + * of the inactive LRU. + */ + switch (reason) { + case VMSCAN_THROTTLE_WRITEBACK: + timeout = HZ/10; + + if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { + WRITE_ONCE(pgdat->nr_reclaim_start, + node_page_state(pgdat, NR_THROTTLED_WRITTEN)); + } + + break; + case VMSCAN_THROTTLE_NOPROGRESS: + timeout = HZ/2; + break; + case VMSCAN_THROTTLE_ISOLATED: + timeout = HZ/50; + break; + default: + WARN_ON_ONCE(1); + timeout = HZ; + break; + } + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = schedule_timeout(timeout); + finish_wait(wqh, &wait); + + if (reason == VMSCAN_THROTTLE_WRITEBACK) + atomic_dec(&pgdat->nr_writeback_throttled); + + trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), + jiffies_to_usecs(timeout - ret), + reason); +} + +/* + * Account for pages written if tasks are throttled waiting for dirty + * pages to be cleaned. If enough pages have been cleaned since throttling + * started then wake up the throttled tasks. + */ +void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, + int nr_throttled) +{ + unsigned long nr_written; + + node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); + + /* + * This is an inaccurate read as the per-cpu deltas may not + * be synchronised. However, given that the system is + * writeback throttled, it is not worth taking the penalty + * of getting an accurate count. At worst, the throttle + * timeout guarantees forward progress. 
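+ * + * The check below scales the required progress with the number of + * throttled tasks: at least SWAP_CLUSTER_MAX pages must have been + * cleaned for each waiter before the wait queue is woken.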
+ */ + nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - + READ_ONCE(pgdat->nr_reclaim_start); + + if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) + wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); +} + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -1352,7 +1437,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages, { int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; - int err; if (list_empty(demote_pages)) return 0; @@ -1361,7 +1445,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return 0; /* Demotion ignores all cpuset and mempolicy settings */ - err = migrate_pages(demote_pages, alloc_demote_page, NULL, + migrate_pages(demote_pages, alloc_demote_page, NULL, target_nid, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1427,9 +1511,8 @@ retry: /* * The number of dirty pages determines if a node is marked - * reclaim_congested which affects wait_iff_congested. kswapd - * will stall and start writing pages if the tail of the LRU - * is all dirty unqueued pages. + * reclaim_congested. kswapd will stall and start writing + * pages if the tail of the LRU is all dirty unqueued pages. */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) @@ -2135,6 +2218,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; + bool too_many; if (current_is_kswapd()) return 0; @@ -2158,7 +2242,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; - return isolated > inactive; + too_many = isolated > inactive; + + /* Wake up tasks throttled due to too_many_isolated. */ + if (!too_many) + wake_throttle_isolated(pgdat); + + return too_many; } /* @@ -2267,8 +2357,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; /* wait a bit for the reclaimer. */ - msleep(100); stalled = true; + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) @@ -3196,19 +3286,19 @@ again: * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU - * faster than they are written so also forcibly stall. + * faster than they are written so forcibly stall + * until some pages complete writeback. */ if (sc->nr.immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } /* - * Tag a node/memcg as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. + * Tag a node/memcg as congested if all the dirty pages were marked + * for writeback and immediate reclaim (counted in nr.congested). * * Legacy memcg will stall in page writeback so avoid forcibly - * stalling in wait_iff_congested(). + * stalling in reclaim_throttle(). */ if ((current_is_kswapd() || (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && @@ -3216,15 +3306,15 @@ again: set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); /* - * Stall direct reclaim for IO completions if underlying BDIs - * and node is congested. Allow kswapd to continue until it + * Stall direct reclaim for IO completions if the target lruvec + * is marked congested. 
Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) - wait_iff_congested(BLK_RW_ASYNC, HZ/10); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc)) @@ -3272,6 +3362,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } +static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) +{ + /* + * If reclaim is making progress greater than 12% efficiency then + * wake all the NOPROGRESS throttled tasks. + */ + if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { + wait_queue_head_t *wqh; + + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; + if (waitqueue_active(wqh)) + wake_up(wqh); + + return; + } + + /* + * Do not throttle kswapd on NOPROGRESS as it will throttle on + * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under + * writeback and marked for immediate reclaim at the tail of + * the LRU. + */ + if (current_is_kswapd()) + return; + + /* Throttle if making no progress at high priorities. */ + if (sc->priority < DEF_PRIORITY - 2) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -3356,6 +3476,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); + consider_reclaim_throttle(zone->zone_pgdat, sc); } /* @@ -4302,6 +4423,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { bool ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 8ce2620344b2..d701c335628c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); EXPORT_SYMBOL(vm_node_stat); +#ifdef CONFIG_NUMA +static void fold_vm_zone_numa_events(struct zone *zone) +{ + unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; + int cpu; + enum numa_stat_item item; + + for_each_online_cpu(cpu) { + struct per_cpu_zonestat *pzstats; + + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); + } + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_event_add(zone_numa_events[item], zone, item); +} + +void fold_vm_numa_events(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + fold_vm_zone_numa_events(zone); +} +#endif + #ifdef CONFIG_SMP int calculate_pressure_threshold(struct zone *zone) @@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff) return changes; } -#ifdef CONFIG_NUMA -static void fold_vm_zone_numa_events(struct zone *zone) -{ - unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; - int cpu; - enum numa_stat_item item; - - for_each_online_cpu(cpu) { - struct per_cpu_zonestat *pzstats; - - pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); - for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) - zone_numa_events[item] += 
xchg(&pzstats->vm_numa_event[item], 0); - } - - for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) - zone_numa_event_add(zone_numa_events[item], zone, item); -} - -void fold_vm_numa_events(void) -{ - struct zone *zone; - - for_each_populated_zone(zone) - fold_vm_zone_numa_events(zone); -} -#endif - /* * Update the zone counters for the current cpu. * @@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone, for (order = 0; order < MAX_ORDER; order++) { unsigned long blocks; - /* Count number of free blocks */ - blocks = zone->free_area[order].nr_free; + /* + * Count number of free blocks. + * + * Access to nr_free is lockless as nr_free is used only for + * diagnostic purposes. Use data_race to avoid KCSAN warning. + */ + blocks = data_race(zone->free_area[order].nr_free); info->free_blocks_total += blocks; /* Count free base pages */ @@ -1225,6 +1230,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_throttled_written", "nr_kernel_misc_reclaimable", "nr_foll_pin_acquired", "nr_foll_pin_released", @@ -1445,7 +1451,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + /* + * Access to nr_free is lockless as nr_free is used only for + * printing purposes. Use data_race to avoid KCSAN warning. + */ + seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free)); seq_putc(m, '\n'); } @@ -1656,6 +1666,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n pages free %lu" + "\n boost %lu" "\n min %lu" "\n low %lu" "\n high %lu" @@ -1664,6 +1675,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n managed %lu" "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), + zone->watermark_boost, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -2179,7 +2191,7 @@ static void extfrag_show_print(struct seq_file *m, for (order = 0; order < MAX_ORDER; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); - seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); } seq_putc(m, '\n'); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 68e8831068f4..b897ce3b399a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1830,10 +1830,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool) VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); atomic_long_dec(&pool->isolated_pages); /* - * There's no possibility of racing, since wait_for_isolated_drain() - * checks the isolated count under &class->lock after enqueuing - * on migration_wait. + * Checking pool->destroying must happen after atomic_long_dec() + * for pool->isolated_pages above. Paired with the smp_mb() in + * zs_unregister_migration(). 
*/ + smp_mb__after_atomic(); if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) wake_up_all(&pool->migration_wait); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc7f419184aa..3c6498dab6bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -260,7 +260,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2fffcf2b54f3..319dd7bbfe33 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -78,7 +78,6 @@ #include #include #include -#include #include #include #include diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e62b40bd349e..816f74dadfa3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 896b8f5bc885..04a060ac7fdf 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ec0f52567c16..35928fefae33 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/samples/Kconfig b/samples/Kconfig index b0503ef058d3..bec3528aa2de 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -26,7 +26,7 @@ config SAMPLE_TRACE_PRINTK config SAMPLE_FTRACE_DIRECT tristate "Build register_ftrace_direct() example" depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m - depends on X86_64 # has x86_64 inlined asm + depends on HAVE_SAMPLE_FTRACE_DIRECT help This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. @@ -120,6 +120,15 @@ config SAMPLE_CONNECTOR with it. See also Documentation/driver-api/connector.rst +config SAMPLE_FANOTIFY_ERROR + bool "Build fanotify error monitoring sample" + depends on FANOTIFY && CC_CAN_LINK && HEADERS_INSTALL + help + When enabled, this builds example code that uses the + FAN_FS_ERROR fanotify mechanism to monitor filesystem + errors. + See also Documentation/admin-guide/filesystem-monitoring.rst. + config SAMPLE_HIDRAW bool "hidraw sample" depends on CC_CAN_LINK && HEADERS_INSTALL @@ -224,3 +233,9 @@ config SAMPLE_WATCH_QUEUE sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function. 
endif # SAMPLES + +config HAVE_SAMPLE_FTRACE_DIRECT + bool + +config HAVE_SAMPLE_FTRACE_MULTI_DIRECT + bool diff --git a/samples/Makefile b/samples/Makefile index 087e0988ccc5..b7b98307c2b4 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -5,6 +5,7 @@ subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ +obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/ subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/ obj-$(CONFIG_SAMPLE_KDB) += kdb/ @@ -21,6 +22,7 @@ subdir-$(CONFIG_SAMPLE_TIMER) += timers obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_MULTI_DIRECT) += ftrace/ obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ subdir-$(CONFIG_SAMPLE_UHID) += uhid obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ diff --git a/samples/fanotify/Makefile b/samples/fanotify/Makefile new file mode 100644 index 000000000000..e20db1bdde3b --- /dev/null +++ b/samples/fanotify/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +userprogs-always-y += fs-monitor + +userccflags += -I usr/include -Wall + diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c new file mode 100644 index 000000000000..608db24c471e --- /dev/null +++ b/samples/fanotify/fs-monitor.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021, Collabora Ltd. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef FAN_FS_ERROR +#define FAN_FS_ERROR 0x00008000 +#define FAN_EVENT_INFO_TYPE_ERROR 5 + +struct fanotify_event_info_error { + struct fanotify_event_info_header hdr; + __s32 error; + __u32 error_count; +}; +#endif + +#ifndef FILEID_INO32_GEN +#define FILEID_INO32_GEN 1 +#endif + +#ifndef FILEID_INVALID +#define FILEID_INVALID 0xff +#endif + +static void print_fh(struct file_handle *fh) +{ + int i; + uint32_t *h = (uint32_t *) fh->f_handle; + + printf("\tfh: "); + for (i = 0; i < fh->handle_bytes; i++) + printf("%hhx", fh->f_handle[i]); + printf("\n"); + + printf("\tdecoded fh: "); + if (fh->handle_type == FILEID_INO32_GEN) + printf("inode=%u gen=%u\n", h[0], h[1]); + else if (fh->handle_type == FILEID_INVALID && !fh->handle_bytes) + printf("Type %d (Superblock error)\n", fh->handle_type); + else + printf("Type %d (Unknown)\n", fh->handle_type); + +} + +static void handle_notifications(char *buffer, int len) +{ + struct fanotify_event_metadata *event = + (struct fanotify_event_metadata *) buffer; + struct fanotify_event_info_header *info; + struct fanotify_event_info_error *err; + struct fanotify_event_info_fid *fid; + int off; + + for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) { + + if (event->mask != FAN_FS_ERROR) { + printf("unexpected FAN MARK: %llx\n", + (unsigned long long)event->mask); + goto next_event; + } + + if (event->fd != FAN_NOFD) { + printf("Unexpected fd (!= FAN_NOFD)\n"); + goto next_event; + } + + printf("FAN_FS_ERROR (len=%d)\n", event->event_len); + + for (off = sizeof(*event) ; off < event->event_len; + off += info->len) { + info = (struct fanotify_event_info_header *) + ((char *) event + off); + + switch (info->info_type) { + case FAN_EVENT_INFO_TYPE_ERROR: + err = (struct fanotify_event_info_error *) info; + + printf("\tGeneric Error Record: len=%d\n", + err->hdr.len); + 
printf("\terror: %d\n", err->error); + printf("\terror_count: %d\n", err->error_count); + break; + + case FAN_EVENT_INFO_TYPE_FID: + fid = (struct fanotify_event_info_fid *) info; + + printf("\tfsid: %x%x\n", + fid->fsid.val[0], fid->fsid.val[1]); + print_fh((struct file_handle *) &fid->handle); + break; + + default: + printf("\tUnknown info type=%d len=%d:\n", + info->info_type, info->len); + } + } +next_event: + printf("---\n\n"); + } +} + +int main(int argc, char **argv) +{ + int fd; + + char buffer[BUFSIZ]; + + if (argc < 2) { + printf("Missing path argument\n"); + return 1; + } + + fd = fanotify_init(FAN_CLASS_NOTIF|FAN_REPORT_FID, O_RDONLY); + if (fd < 0) + errx(1, "fanotify_init"); + + if (fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, + FAN_FS_ERROR, AT_FDCWD, argv[1])) { + errx(1, "fanotify_mark"); + } + + while (1) { + int n = read(fd, buffer, BUFSIZ); + + if (n < 0) + errx(1, "read"); + + handle_notifications(buffer, n); + } + + return 0; +} diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index ab1d1c05c288..e8a3f8520a44 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o -obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-multi.o +obj-$(CONFIG_SAMPLE_FTRACE_MULTI_DIRECT) += ftrace-direct-multi.o CFLAGS_sample-trace-array.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index 5b9a09957c6e..690e4a9ff333 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -2,6 +2,7 @@ #include #include #include +#include void my_direct_func1(void) { @@ -18,6 +19,8 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp1, @function\n" @@ -41,6 +44,47 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func1\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func2\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { (unsigned long)my_tramp1, diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index 
3f0079c9bd6f..6e0de725bf22 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -3,6 +3,7 @@ #include /* for handle_mm_fault() */ #include +#include void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -13,6 +14,8 @@ void my_direct_func(struct vm_area_struct *vma, extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -33,6 +36,31 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ static int __init ftrace_direct_init(void) { diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index a2729d1ef17f..a30aa42ec76a 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -3,6 +3,7 @@ #include /* for wake_up_process() */ #include +#include void my_direct_func(struct task_struct *p) { @@ -11,6 +12,8 @@ void my_direct_func(struct task_struct *p) extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -27,6 +30,31 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ static int __init ftrace_direct_init(void) { diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c27d2312cfc3..88cb294dc447 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -489,7 +489,8 @@ our $Attribute = qr{ ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| - __weak + __weak| + __alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) }x; our $Modifier; our $Inline = qr{inline|__always_inline|noinline|__inline|__inline__}; diff --git a/scripts/decodecode b/scripts/decodecode index 31d884e35f2f..c711a196511c 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -126,7 +126,7 @@ if [ $marker -ne 0 ]; then fi echo Code starting with the faulting instruction > $T.aa echo =========================================== >> $T.aa -code=`echo $code | sed -e 's/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` +code=`echo $code | sed -e 's/\r//;s/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'` echo -n " .$type 0x" > $T.s echo $code >> $T.s disas $T 0 diff --git a/scripts/spelling.txt 
b/scripts/spelling.txt index 17fdc620d548..acf6ea711299 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -178,6 +178,7 @@ assum||assume assumtpion||assumption asuming||assuming asycronous||asynchronous +asychronous||asynchronous asynchnous||asynchronous asynchromous||asynchronous asymetric||asymmetric @@ -241,6 +242,7 @@ beter||better betweeen||between bianries||binaries bitmast||bitmask +bitwiedh||bitwidth boardcast||broadcast borad||board boundry||boundary @@ -265,7 +267,10 @@ calucate||calculate calulate||calculate cancelation||cancellation cancle||cancel +cant||can't +cant'||can't canot||cannot +cann't||can't capabilites||capabilities capabilties||capabilities capabilty||capability @@ -501,6 +506,7 @@ disble||disable disgest||digest disired||desired dispalying||displaying +dissable||disable diplay||display directon||direction direcly||directly @@ -595,6 +601,7 @@ exceded||exceeded exceds||exceeds exceeed||exceed excellant||excellent +exchnage||exchange execeeded||exceeded execeeds||exceeds exeed||exceed @@ -938,6 +945,7 @@ migrateable||migratable milliseonds||milliseconds minium||minimum minimam||minimum +minimun||minimum miniumum||minimum minumum||minimum misalinged||misaligned @@ -956,6 +964,7 @@ mmnemonic||mnemonic mnay||many modfiy||modify modifer||modifier +modul||module modulues||modules momery||memory memomry||memory @@ -1154,6 +1163,7 @@ programable||programmable programers||programmers programm||program programms||programs +progres||progress progresss||progress prohibitted||prohibited prohibitting||prohibiting @@ -1328,6 +1338,7 @@ servive||service setts||sets settting||setting shapshot||snapshot +shoft||shift shotdown||shutdown shoud||should shouldnt||shouldn't @@ -1439,6 +1450,7 @@ syfs||sysfs symetric||symmetric synax||syntax synchonized||synchronized +sychronization||synchronization synchronuously||synchronously syncronize||synchronize syncronized||synchronized @@ -1521,6 +1533,7 @@ unexpexted||unexpected unfortunatelly||unfortunately unifiy||unify uniterrupted||uninterrupted +uninterruptable||uninterruptible unintialized||uninitialized unitialized||uninitialized unkmown||unknown @@ -1553,6 +1566,7 @@ unuseful||useless unvalid||invalid upate||update upsupported||unsupported +useable||usable usefule||useful usefull||useful usege||usage @@ -1574,6 +1588,7 @@ varient||variant vaule||value verbse||verbose veify||verify +verfication||verification veriosn||version verisons||versions verison||version @@ -1586,6 +1601,7 @@ visiters||visitors vitual||virtual vunerable||vulnerable wakeus||wakeups +was't||wasn't wathdog||watchdog wating||waiting wiat||wait diff --git a/security/Kconfig b/security/Kconfig index fe6c0395fa02..0b847f435beb 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -163,20 +163,6 @@ config HARDENED_USERCOPY or are part of the kernel text. This kills entire classes of heap overflow exploits and similar kernel memory exposures. -config HARDENED_USERCOPY_FALLBACK - bool "Allow usercopy whitelist violations to fallback to object size" - depends on HARDENED_USERCOPY - default y - help - This is a temporary option that allows missing usercopy whitelists - to be discovered via a WARN() to the kernel log, instead of - rejecting the copy, falling back to non-whitelisted hardened - usercopy that checks the slab allocation size instead of the - whitelist size. This option will be removed once it seems like - all missing usercopy whitelists have been identified and fixed. - Booting with "slab_common.usercopy_fallback=Y/N" can change - this setting. 
- config HARDENED_USERCOPY_PAGESPAN bool "Refuse to copy allocations that span multiple pages" depends on HARDENED_USERCOPY diff --git a/tools/testing/selftests/damon/debugfs_attrs.sh b/tools/testing/selftests/damon/debugfs_attrs.sh index bfabb19dc0d3..196b6640bf37 100644 --- a/tools/testing/selftests/damon/debugfs_attrs.sh +++ b/tools/testing/selftests/damon/debugfs_attrs.sh @@ -57,6 +57,19 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \ test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written" echo "$orig_content" > "$file" +# Test schemes file +# ================= + +file="$DBGFS/schemes" +orig_content=$(cat "$file") + +test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \ + "$orig_content" "valid input" +test_write_fail "$file" "1 2 +3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines" +test_write_succ "$file" "" "$orig_content" "disabling" +echo "$orig_content" > "$file" + # Test target_ids file # ==================== diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc index 84285a6f60b0..dc7ade196798 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc @@ -22,6 +22,9 @@ ppc64*) ppc*) ARG1=%r3 ;; +s390*) + ARG1=%r2 +;; *) echo "Please implement other architecture here" exit_untested diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc index 474ca1a9a088..47d84b5cb6ca 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc @@ -32,6 +32,10 @@ ppc*) GOODREG=%r3 BADREG=%msr ;; +s390*) + GOODREG=%r2 + BADREG=%s2 +;; *) echo "Please implement other architecture here" exit_untested diff --git a/tools/testing/selftests/memory-hotplug/config b/tools/testing/selftests/memory-hotplug/config index a7e8cd5bb265..1eef042a31e1 100644 --- a/tools/testing/selftests/memory-hotplug/config +++ b/tools/testing/selftests/memory-hotplug/config @@ -1,5 +1,4 @@ CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y CONFIG_NOTIFIER_ERROR_INJECTION=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_MEMORY_HOTREMOVE=y diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index b02eac613fdd..2e7e86e85282 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap +hugepage-mremap hugepage-shm khugepaged map_hugetlb diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d9605bd10f2d..1607322a112c 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -29,6 +29,7 @@ TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += khugepaged TEST_GEN_FILES += madv_populate diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c new file mode 100644 index 000000000000..257df94697a5 --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mremap: + * + * Example of remapping huge page memory in a user application 
using the + * mremap system call. Code assumes a hugetlbfs filesystem is mounted + * at './huge'. The code will use 10MB worth of huge pages. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include /* Definition of O_* constants */ +#include /* Definition of SYS_* constants */ +#include +#include + +#define LENGTH (1UL * 1024 * 1024 * 1024) + +#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) +#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < LENGTH; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +static void register_region_with_uffd(char *addr, size_t len) +{ + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + + /* Create and enable userfaultfd object. */ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("userfaultfd"); + exit(1); + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("ioctl-UFFDIO_API"); + exit(1); + } + + /* Create a private anonymous mapping. The memory will be + * demand-zero paged--that is, not yet allocated. When we + * actually touch the memory, it will be allocated via + * the userfaultfd. + */ + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Address returned by mmap() = %p\n", addr); + + /* Register the memory range of the mapping we just created for + * handling by the userfaultfd object. In mode, we request to track + * missing pages (i.e., pages that have not yet been faulted in). + */ + + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("ioctl-UFFDIO_REGISTER"); + exit(1); + } +} + +int main(void) +{ + int ret = 0; + + int fd = open("/huge/test", O_CREAT | O_RDWR, 0755); + + if (fd < 0) { + perror("Open failed"); + exit(1); + } + + /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ + unsigned long suggested_addr = 0x7eaa40000000; + void *haddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map haddr: Returned address is %p\n", haddr); + if (haddr == MAP_FAILED) { + perror("mmap1"); + exit(1); + } + + /* mmap again to a dummy address to hopefully trigger pmd sharing. 
*/ + suggested_addr = 0x7daa40000000; + void *daddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + printf("Map daddr: Returned address is %p\n", daddr); + if (daddr == MAP_FAILED) { + perror("mmap3"); + exit(1); + } + + suggested_addr = 0x7faa40000000; + void *vaddr = + mmap((void *)suggested_addr, LENGTH, PROTECTION, FLAGS, -1, 0); + printf("Map vaddr: Returned address is %p\n", vaddr); + if (vaddr == MAP_FAILED) { + perror("mmap2"); + exit(1); + } + + register_region_with_uffd(haddr, LENGTH); + + void *addr = mremap(haddr, LENGTH, LENGTH, + MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); + if (addr == MAP_FAILED) { + perror("mremap"); + exit(1); + } + + printf("Mremap: Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr); + ret = read_bytes(addr); + + munmap(addr, LENGTH); + + return ret; +} diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index b61dcdb44c5b..1436e1a9a3d3 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include #include "../kselftest.h" #include "../../../../include/vdso/time64.h" @@ -18,6 +22,15 @@ #define KSM_MERGE_ACROSS_NODES_DEFAULT true #define MB (1ul << 20) +#define PAGE_SHIFT 12 +#define HPAGE_SHIFT 21 + +#define PAGE_SIZE (1 << PAGE_SHIFT) +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + struct ksm_sysfs { unsigned long max_page_sharing; unsigned long merge_across_nodes; @@ -34,6 +47,7 @@ enum ksm_test_name { CHECK_KSM_ZERO_PAGE_MERGE, CHECK_KSM_NUMA_MERGE, KSM_MERGE_TIME, + KSM_MERGE_TIME_HUGE_PAGES, KSM_COW_TIME }; @@ -99,6 +113,9 @@ static void print_help(void) " -U (page unmerging)\n" " -P evaluate merging time and speed.\n" " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" " must be provided using -s option\n" " -C evaluate the time required to break COW of merged pages.\n\n"); @@ -354,12 +371,34 @@ err_out: return KSFT_FAIL; } +static int get_next_mem_node(int node) +{ + + long node_size; + int mem_node = 0; + int i, max_node = numa_max_node(); + + for (i = node + 1; i <= max_node + node; i++) { + mem_node = i % (max_node + 1); + node_size = numa_node_size(mem_node, NULL); + if (node_size > 0) + break; + } + return mem_node; +} + +static int get_first_mem_node(void) +{ + return get_next_mem_node(numa_max_node()); +} + static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, size_t page_size) { void *numa1_map_ptr, *numa2_map_ptr; struct timespec start_time; int page_count = 2; + int first_node; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { perror("clock_gettime"); @@ -370,7 +409,7 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a perror("NUMA support not enabled"); return KSFT_SKIP; } - if (numa_max_node() < 1) { + if (numa_num_configured_nodes() <= 1) { printf("At least 2 NUMA nodes must be available\n"); return KSFT_SKIP; } @@ -378,8 +417,9 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a return KSFT_FAIL; /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ - numa1_map_ptr 
= numa_alloc_onnode(page_size, 0); - numa2_map_ptr = numa_alloc_onnode(page_size, 1); + first_node = get_first_mem_node(); + numa1_map_ptr = numa_alloc_onnode(page_size, first_node); + numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); if (!numa1_map_ptr || !numa2_map_ptr) { perror("numa_alloc_onnode"); return KSFT_FAIL; @@ -416,6 +456,101 @@ err_out: return KSFT_FAIL; } +int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr, *map_ptr_orig; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + int pagemap_fd, n_normal_pages, n_huge_pages; + + map_size *= MB; + size_t len = map_size; + + len -= len % HPAGE_SIZE; + map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; + + if (map_ptr_orig == MAP_FAILED) + err(2, "initial mmap"); + + if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + n_normal_pages = 0; + n_huge_pages = 0; + for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { + if (allocate_transhuge(p, pagemap_fd) < 0) + n_normal_pages++; + else + n_huge_pages++; + } + printf("Number of normal pages: %d\n", n_normal_pages); + printf("Number of huge pages: %d\n", n_huge_pages); + + memset(map_ptr, '*', len); + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_FAIL; +} + static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) { void *map_ptr; @@ -541,7 +676,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -595,6 +730,9 @@ int main(int argc, char 
*argv[]) case 'P': test_name = KSM_MERGE_TIME; break; + case 'H': + test_name = KSM_MERGE_TIME_HUGE_PAGES; + break; case 'C': test_name = KSM_COW_TIME; break; @@ -647,6 +785,14 @@ int main(int argc, char *argv[]) ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; + case KSM_MERGE_TIME_HUGE_PAGES: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; case KSM_COW_TIME: ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c index b959e4ebdad4..3ee0e8275600 100644 --- a/tools/testing/selftests/vm/madv_populate.c +++ b/tools/testing/selftests/vm/madv_populate.c @@ -14,12 +14,11 @@ #include #include #include +#include #include #include "../kselftest.h" -#if defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) - /* * For now, we're using 2 MiB of private anonymous memory for all tests. */ @@ -328,15 +327,3 @@ int main(int argc, char **argv) err, ksft_test_num()); return ksft_exit_pass(); } - -#else /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */ - -#warning "missing MADV_POPULATE_READ or MADV_POPULATE_WRITE definition" - -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_exit_skip("MADV_POPULATE_READ or MADV_POPULATE_WRITE not defined\n"); -} - -#endif /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */ diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 45e803af7c77..a24d30af3094 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -108,6 +108,17 @@ else echo "[PASS]" fi +echo "-----------------------" +echo "running hugepage-mremap" +echo "-----------------------" +./hugepage-mremap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." 
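The KSM huge page timing test added above decides whether each 2MB chunk actually became a transparent huge page by reading /proc/self/pagemap: allocate_transhuge() fetches the two pagemap entries covering the start of the chunk and counts a THP only when both pages are present, their PFNs are consecutive, and the first PFN is aligned to the huge page size. A minimal standalone sketch of that pagemap lookup for a single address follows; it is not part of the patch, assumes a 4 KiB base page size and the documented pagemap entry layout (bit 63 = present, bits 0-54 = PFN), and needs CAP_SYS_ADMIN to observe non-zero PFNs::

  /*
   * Standalone sketch: look up one /proc/self/pagemap entry the way
   * allocate_transhuge() above does. Assumes 4 KiB base pages.
   */
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  #define PAGE_SHIFT 12
  #define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
  #define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))

  int main(void)
  {
  	char *page = malloc(1 << PAGE_SHIFT);
  	uint64_t ent;
  	int fd;

  	if (!page)
  		return 1;
  	page[0] = 1;	/* touch the page so it is faulted in */

  	fd = open("/proc/self/pagemap", O_RDONLY);
  	if (fd < 0) {
  		perror("open pagemap");
  		return 1;
  	}

  	/* one 8-byte entry per page, indexed by virtual page number */
  	if (pread(fd, &ent, sizeof(ent),
  		  ((uintptr_t)page >> PAGE_SHIFT) * sizeof(ent)) != sizeof(ent)) {
  		perror("pread pagemap");
  		return 1;
  	}

  	printf("present=%d pfn=0x%llx\n", PAGEMAP_PRESENT(ent),
  	       (unsigned long long)PAGEMAP_PFN(ent));
  	close(fd);
  	free(page);
  	return 0;
  }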
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c index fd7f1b4a96f9..5e4c036f6ad3 100644 --- a/tools/testing/selftests/vm/transhuge-stress.c +++ b/tools/testing/selftests/vm/transhuge-stress.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, - len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); + ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 60aa1a4fc69b..8a09057d2f22 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -307,37 +308,24 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) } struct uffd_test_ops { - unsigned long expected_ioctls; void (*allocate_area)(void **alloc_area); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); }; -#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE)) - -#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE) | \ - (1 << _UFFDIO_WRITEPROTECT)) - static struct uffd_test_ops anon_uffd_test_ops = { - .expected_ioctls = ANON_EXPECTED_IOCTLS, .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, }; static struct uffd_test_ops shmem_uffd_test_ops = { - .expected_ioctls = SHMEM_EXPECTED_IOCTLS, .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = shmem_alias_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { - .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE), .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, @@ -345,6 +333,43 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = { static struct uffd_test_ops *uffd_test_ops; +static inline uint64_t uffd_minor_feature(void) +{ + if (test_type == TEST_HUGETLB && map_shared) + return UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + return UFFD_FEATURE_MINOR_SHMEM; + else + return 0; +} + +static uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if (test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + static void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; @@ -405,7 +430,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_dst_alias); } -static void uffd_test_ctx_init_ext(uint64_t *features) +static void uffd_test_ctx_init(uint64_t features) { unsigned long nr, cpu; @@ -414,7 +439,7 @@ static void 
uffd_test_ctx_init_ext(uint64_t *features) uffd_test_ops->allocate_area((void **)&area_src); uffd_test_ops->allocate_area((void **)&area_dst); - userfaultfd_open(features); + userfaultfd_open(&features); count_verify = malloc(nr_pages * sizeof(unsigned long long)); if (!count_verify) @@ -462,11 +487,6 @@ static void uffd_test_ctx_init_ext(uint64_t *features) err("pipe"); } -static inline void uffd_test_ctx_init(uint64_t features) -{ - uffd_test_ctx_init_ext(&features); -} - static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -518,22 +538,10 @@ static void continue_range(int ufd, __u64 start, __u64 len) static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; - struct random_data rand; unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ - int32_t rand_nr; unsigned long long count; - char randstate[64]; - unsigned int seed; - if (bounces & BOUNCE_RANDOM) { - seed = (unsigned int) time(NULL) - bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - seed += cpu; - bzero(&rand, sizeof(rand)); - bzero(&randstate, sizeof(randstate)); - if (initstate_r(seed, randstate, sizeof(randstate), &rand)) - err("initstate_r failed"); - } else { + if (!(bounces & BOUNCE_RANDOM)) { page_nr = -bounces; if (!(bounces & BOUNCE_RACINGFAULTS)) page_nr += cpu * nr_pages_per_cpu; @@ -541,15 +549,8 @@ static void *locking_thread(void *arg) while (!finished) { if (bounces & BOUNCE_RANDOM) { - if (random_r(&rand, &rand_nr)) - err("random_r failed"); - page_nr = rand_nr; - if (sizeof(page_nr) > sizeof(rand_nr)) { - if (random_r(&rand, &rand_nr)) - err("random_r failed"); - page_nr |= (((unsigned long) rand_nr) << 16) << - 16; - } + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); } else page_nr += 1; page_nr %= nr_pages; @@ -1030,11 +1031,9 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) { struct uffdio_zeropage uffdio_zeropage; int ret; - unsigned long has_zeropage; + bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); __s64 res; - has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); - if (offset >= nr_pages * page_size) err("unexpected offset %lu", offset); uffdio_zeropage.range.start = (unsigned long) area_dst + offset; @@ -1074,7 +1073,6 @@ static int uffdio_zeropage(int ufd, unsigned long offset) static int userfaultfd_zeropage_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); @@ -1089,9 +1087,8 @@ static int userfaultfd_zeropage_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (uffdio_zeropage(uffd, 0)) if (my_bcmp(area_dst, zeropage, page_size)) @@ -1104,7 +1101,6 @@ static int userfaultfd_zeropage_test(void) static int userfaultfd_events_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; pthread_t uffd_mon; int err, features; pid_t pid; @@ -1128,9 +1124,8 @@ static int userfaultfd_events_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl for anon 
memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) err("uffd_poll_thread create"); @@ -1158,7 +1153,6 @@ static int userfaultfd_events_test(void) static int userfaultfd_sig_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; unsigned long userfaults; pthread_t uffd_mon; int err, features; @@ -1182,9 +1176,8 @@ static int userfaultfd_sig_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (faulting_process(1)) err("faulting process failed"); @@ -1219,14 +1212,12 @@ static int userfaultfd_sig_test(void) static int userfaultfd_minor_test(void) { struct uffdio_register uffdio_register; - unsigned long expected_ioctls; unsigned long p; pthread_t uffd_mon; uint8_t expected_byte; void *expected_page; char c; struct uffd_stats stats = { 0 }; - uint64_t req_features, features_out; if (!test_uffdio_minor) return 0; @@ -1234,21 +1225,7 @@ static int userfaultfd_minor_test(void) printf("testing minor faults: "); fflush(stdout); - if (test_type == TEST_HUGETLB) - req_features = UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - req_features = UFFD_FEATURE_MINOR_SHMEM; - else - return 1; - - features_out = req_features; - uffd_test_ctx_init_ext(&features_out); - /* If kernel reports required features aren't supported, skip test. */ - if ((features_out & req_features) != req_features) { - printf("skipping test due to lack of feature support\n"); - fflush(stdout); - return 0; - } + uffd_test_ctx_init(uffd_minor_feature()); uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; @@ -1256,10 +1233,8 @@ static int userfaultfd_minor_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - expected_ioctls |= 1 << _UFFDIO_CONTINUE; - if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) - err("unexpected missing ioctl(s)"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); /* * After registering with UFFD, populate the non-UFFD-registered side of @@ -1456,8 +1431,6 @@ static int userfaultfd_stress(void) pthread_attr_setstacksize(&attr, 16*1024*1024); while (bounces--) { - unsigned long expected_ioctls; - printf("bounces: %d, mode:", bounces); if (bounces & BOUNCE_RANDOM) printf(" rnd"); @@ -1485,10 +1458,8 @@ static int userfaultfd_stress(void) uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure"); - expected_ioctls = uffd_test_ops->expected_ioctls; - if ((uffdio_register.ioctls & expected_ioctls) != - expected_ioctls) - err("unexpected missing ioctl for anon memory"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); if (area_dst_alias) { uffdio_register.range.start = (unsigned long) @@ -1609,6 +1580,8 @@ unsigned long default_huge_page_size(void) static void set_test_type(const char *type) { + uint64_t features = UFFD_API_FEATURES; + if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; @@ -1642,6 +1615,22 @@ static void set_test_type(const char 
*type) if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 > page_size) err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. + */ + + userfaultfd_open(&features); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + test_uffdio_minor = test_uffdio_minor && + (features & uffd_minor_feature()); + + close(uffd); + uffd = -1; } static void sigalrm(int sig) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index f62f10c988db..b1ed76d9a979 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -390,7 +390,7 @@ static void show_page_range(unsigned long voffset, unsigned long offset, if (opt_pid) printf("%lx\t", voff); if (opt_file) - printf("%lu\t", voff); + printf("%lx\t", voff); if (opt_list_cgroup) printf("@%llu\t", (unsigned long long)cgroup0); if (opt_list_mapcnt) @@ -418,7 +418,7 @@ static void show_page(unsigned long voffset, unsigned long offset, if (opt_pid) printf("%lx\t", voffset); if (opt_file) - printf("%lu\t", voffset); + printf("%lx\t", voffset); if (opt_list_cgroup) printf("@%llu\t", (unsigned long long)cgroup); if (opt_list_mapcnt) @@ -967,22 +967,19 @@ static struct sigaction sigbus_action = { .sa_flags = SA_SIGINFO, }; -static void walk_file(const char *name, const struct stat *st) +static void walk_file_range(const char *name, int fd, + unsigned long off, unsigned long end) { uint8_t vec[PAGEMAP_BATCH]; uint64_t buf[PAGEMAP_BATCH], flags; uint64_t cgroup = 0; uint64_t mapcnt = 0; unsigned long nr_pages, pfn, i; - off_t off, end = st->st_size; - int fd; ssize_t len; void *ptr; int first = 1; - fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); - - for (off = 0; off < end; off += len) { + for (; off < end; off += len) { nr_pages = (end - off + page_size - 1) / page_size; if (nr_pages > PAGEMAP_BATCH) nr_pages = PAGEMAP_BATCH; @@ -1037,12 +1034,26 @@ got_sigbus: if (first && opt_list) { first = 0; flush_page_range(); - show_file(name, st); } add_page(off / page_size + i, pfn, flags, cgroup, mapcnt, buf[i]); } } +} + +static void walk_file(const char *name, const struct stat *st) +{ + int i; + int fd; + + fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); + + if (!nr_addr_ranges) + add_addr_range(0, st->st_size / page_size); + + for (i = 0; i < nr_addr_ranges; i++) + walk_file_range(name, fd, opt_offset[i] * page_size, + (opt_offset[i] + opt_size[i]) * page_size); close(fd); } @@ -1062,10 +1073,10 @@ int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f) return 0; } +struct stat st; + static void walk_page_cache(void) { - struct stat st; - kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); sigaction(SIGBUS, &sigbus_action, NULL); @@ -1362,6 +1373,11 @@ int main(int argc, char *argv[]) if (opt_list) printf("\n\n"); + if (opt_file) { + show_file(opt_file, &st); + printf("\n"); + } + show_summary(); if (opt_list_mapcnt) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 0e75f22c9475..9ebb84a9c731 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -5,6 +5,8 @@ * Example use: * cat /sys/kernel/debug/page_owner > page_owner_full.txt * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + * Or sort by total memory: + * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * * See Documentation/vm/page_owner.rst 
*/ @@ -16,14 +18,18 @@ #include #include #include +#include <regex.h> +#include <errno.h> struct block_list { char *txt; int len; int num; + int page_num; }; - +static int sort_by_memory; +static regex_t order_pattern; static struct block_list *list; static int list_size; static int max_size; @@ -59,12 +65,50 @@ static int compare_num(const void *p1, const void *p2) return l2->num - l1->num; } +static int compare_page_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l2->page_num - l1->page_num; +} + +static int get_page_num(char *buf) +{ + int err, val_len, order_val; + char order_str[4] = {0}; + char *endptr; + regmatch_t pmatch[2]; + + err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL); + if (err != 0 || pmatch[1].rm_so == -1) { + printf("no order pattern in %s\n", buf); + return 0; + } + val_len = pmatch[1].rm_eo - pmatch[1].rm_so; + if (val_len > 2) /* max_order should not exceed 2 digits */ + goto wrong_order; + + memcpy(order_str, buf + pmatch[1].rm_so, val_len); + + errno = 0; + order_val = strtol(order_str, &endptr, 10); + if (errno != 0 || endptr == order_str || *endptr != '\0') + goto wrong_order; + + return 1 << order_val; + +wrong_order: + printf("wrong order in the following buf:\n%s\n", buf); + return 0; +} + static void add_list(char *buf, int len) { if (list_size != 0 && len == list[list_size-1].len && memcmp(buf, list[list_size-1].txt, len) == 0) { list[list_size-1].num++; + list[list_size-1].page_num += get_page_num(buf); return; } if (list_size == max_size) { @@ -74,6 +118,7 @@ static void add_list(char *buf, int len) list[list_size].txt = malloc(len+1); list[list_size].len = len; list[list_size].num = 1; + list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; list_size++; @@ -85,6 +130,13 @@ static void add_list(char *buf, int len) #define BUF_SIZE (128 * 1024) +static void usage(void) +{ + printf("Usage: ./page_owner_sort [-m] <input> <output>\n" + "-m Sort by total memory. 
If this option is unset, sort by times\n" + ); +} + int main(int argc, char **argv) { FILE *fin, *fout; @@ -92,18 +144,36 @@ int main(int argc, char **argv) int ret, i, count; struct block_list *list2; struct stat st; + int err; + int opt; - if (argc < 3) { - printf("Usage: ./program <input> <output>\n"); + while ((opt = getopt(argc, argv, "m")) != -1) + switch (opt) { + case 'm': + sort_by_memory = 1; + break; + default: + usage(); + exit(1); + } + + if (optind >= (argc - 1)) { + usage(); + exit(1); + } + + fin = fopen(argv[optind], "r"); + fout = fopen(argv[optind + 1], "w"); + if (!fin || !fout) { + usage(); perror("open: "); exit(1); } - fin = fopen(argv[1], "r"); - fout = fopen(argv[2], "w"); - if (!fin || !fout) { - printf("Usage: ./program <input> <output>\n"); - perror("open: "); + err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE); + if (err != 0 || order_pattern.re_nsub != 1) { + printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n", + argv[0], err); exit(1); } @@ -145,13 +215,19 @@ int main(int argc, char **argv) list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; + list2[count-1].page_num += list[i].page_num; } } - qsort(list2, count, sizeof(list[0]), compare_num); + if (sort_by_memory) + qsort(list2, count, sizeof(list[0]), compare_page_num); + else + qsort(list2, count, sizeof(list[0]), compare_num); for (i = 0; i < count; i++) - fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt); + fprintf(fout, "%d times, %d pages:\n%s\n", + list2[i].num, list2[i].page_num, list2[i].txt); + regfree(&order_pattern); return 0; }
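For reference, the -m mode added to page_owner_sort stands or falls with get_page_num() recovering the allocation order from each block's header line. The standalone sketch below is not part of the patch; it compiles the same REG_EXTENDED pattern the tool uses, matches it against a sample line in the kernel's "Page allocated via order %u, mask ..." format, and converts the captured order into a page count the same way get_page_num() does::

  /*
   * Standalone sketch: exercise the order-extraction regex that
   * page_owner_sort compiles, against a sample page_owner line.
   * Expected output: "order 3 -> 8 pages".
   */
  #include <regex.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  int main(void)
  {
  	const char buf[] = "Page allocated via order 3, mask 0x1052c0(GFP_KERNEL)";
  	char order_str[4] = {0};
  	regmatch_t pmatch[2];
  	regex_t pattern;
  	int len;

  	if (regcomp(&pattern, "order\\s*([0-9]*),", REG_EXTENDED | REG_NEWLINE))
  		return 1;

  	if (regexec(&pattern, buf, 2, pmatch, 0) == 0 && pmatch[1].rm_so != -1) {
  		len = pmatch[1].rm_eo - pmatch[1].rm_so;
  		/* same bound as the tool: at most two digits of order */
  		if (len > 0 && len < (int)sizeof(order_str)) {
  			memcpy(order_str, buf + pmatch[1].rm_so, len);
  			printf("order %s -> %ld pages\n", order_str,
  			       1L << strtol(order_str, NULL, 10));
  		}
  	}

  	regfree(&pattern);
  	return 0;
  }

With the patch applied, sorting by that page count is requested as shown in the file's updated header comment: ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt.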