Merge branch 'akpm' (patches from Andrew)

Merge misc updates from Andrew Morton:
 "257 patches.

  Subsystems affected by this patch series: scripts, ocfs2, vfs, and mm
  (slab-generic, slab, slub, kconfig, dax, kasan, debug, pagecache, gup,
  swap, memcg, pagemap, mprotect, mremap, iomap, tracing, vmalloc,
  pagealloc, memory-failure, hugetlb, userfaultfd, vmscan, tools,
  memblock, oom-kill, hugetlbfs, migration, thp, readahead, nommu, ksm,
  vmstat, madvise, memory-hotplug, rmap, zsmalloc, highmem, zram,
  cleanups, kfence, and damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (257 commits)
  mm/damon: remove return value from before_terminate callback
  mm/damon: fix a few spelling mistakes in comments and a pr_debug message
  mm/damon: simplify stop mechanism
  Docs/admin-guide/mm/pagemap: wordsmith page flags descriptions
  Docs/admin-guide/mm/damon/start: simplify the content
  Docs/admin-guide/mm/damon/start: fix a wrong link
  Docs/admin-guide/mm/damon/start: fix wrong example commands
  mm/damon/dbgfs: add adaptive_targets list check before enable monitor_on
  mm/damon: remove unnecessary variable initialization
  Documentation/admin-guide/mm/damon: add a document for DAMON_RECLAIM
  mm/damon: introduce DAMON-based Reclamation (DAMON_RECLAIM)
  selftests/damon: support watermarks
  mm/damon/dbgfs: support watermarks
  mm/damon/schemes: activate schemes based on a watermarks mechanism
  tools/selftests/damon: update for regions prioritization of schemes
  mm/damon/dbgfs: support prioritization weights
  mm/damon/vaddr,paddr: support pageout prioritization
  mm/damon/schemes: prioritize regions within the quotas
  mm/damon/selftests: support schemes quotas
  mm/damon/dbgfs: support quotas of schemes
  ...

commit 512b7931ad
@@ -328,6 +328,14 @@ as idle::

    From now on, any pages on zram are idle pages.  The idle mark
    will not be removed until someone requests access to the block.
    In other words, unless there is an access request, those pages remain idle.
    Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled, pages can be
    marked as idle based on how long (in seconds) it has been since they were
    last accessed::

        echo 86400 > /sys/block/zramX/idle

    In this example all pages which haven't been accessed in more than 86400
    seconds (one day) will be marked idle.

    Admins can request writeback of those idle pages at the right time via::
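A minimal sketch of the writeback flow referenced above, assuming a zram device
``zram0`` that already has a backing device configured (the ``all`` and ``idle``
keywords are the documented zram interface, but treat the exact sequence as
illustrative only)::

    echo all > /sys/block/zram0/idle        # mark every current page idle
    # ... let the workload run for the desired access window ...
    echo idle > /sys/block/zram0/writeback  # write still-idle pages to the backing device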
@@ -87,10 +87,8 @@ Brief summary of control files.

 memory.oom_control                  set/show oom controls.
 memory.numa_stat                    show the number of memory usage per numa
                                     node
 memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
                                     This knob is deprecated and shouldn't be
                                     used. It is planned that this be removed in
                                     the foreseeable future.
 memory.kmem.limit_in_bytes          This knob is deprecated and writing to
                                     it will return -ENOTSUPP.
 memory.kmem.usage_in_bytes          show current kernel memory allocation
 memory.kmem.failcnt                 show the number of kernel memory usage
                                     hits limits

@@ -518,11 +516,6 @@ will be charged as a new owner of it.

 charged file caches. Some out-of-use page caches may keep charged until
 memory pressure happens. If you want to avoid that, force_empty will be useful.

 Also, note that when memory.kmem.limit_in_bytes is set the charges due to
 kernel pages will still be seen. This is not considered a failure and the
 write will still return success. In this case, it is expected that
 memory.kmem.usage_in_bytes == memory.usage_in_bytes.

 5.2 stat file
 -------------
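The deprecation shown in the updated table entry can be checked directly; a
sketch assuming a cgroup-v1 memory hierarchy mounted at
``/sys/fs/cgroup/memory`` and a test group named ``demo`` (both are assumptions
for illustration)::

    # cd /sys/fs/cgroup/memory/demo
    # echo 1M > memory.kmem.limit_in_bytes    # after this series the write fails with -ENOTSUPP
    # cat memory.kmem.usage_in_bytes          # usage reporting still works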
@@ -1582,8 +1582,10 @@
			registers.  Default set by CONFIG_HPET_MMAP_DEFAULT.

	hugetlb_cma=	[HW,CMA] The size of a CMA area used for allocation
			of gigantic hugepages.
			Format: nn[KMGTPE]
			of gigantic hugepages.  Or using node format, the size
			of a CMA area per node can be specified.
			Format: nn[KMGTPE] or (node format)
				<node>:nn[KMGTPE][,<node>:nn[KMGTPE]]

			Reserve a CMA area of given size and allocate gigantic
			hugepages using the CMA allocator. If enabled, the

@@ -1594,9 +1596,11 @@
			the number of pages of hugepagesz to be allocated.
			If this is the first HugeTLB parameter on the command
			line, it specifies the number of pages to allocate for
			the default huge page size.  See also
			Documentation/admin-guide/mm/hugetlbpage.rst.
			Format: <integer>
			the default huge page size.  If using node format, the
			number of pages to allocate per-node can be specified.
			See also Documentation/admin-guide/mm/hugetlbpage.rst.
			Format: <integer> or (node format)
				<node>:<integer>[,<node>:<integer>]

	hugepagesz=
			[HW] The size of the HugeTLB pages.  This is used in
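For illustration, hedged examples of the node formats described above (node
numbers and sizes are chosen arbitrarily)::

	hugetlb_cma=0:2G,1:2G
	hugepages=0:64,1:128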
@@ -13,3 +13,4 @@ optimize those.

   start
   usage
   reclaim

Documentation/admin-guide/mm/damon/reclaim.rst (new file, 235 lines)
@@ -0,0 +1,235 @@
.. SPDX-License-Identifier: GPL-2.0

=======================
DAMON-based Reclamation
=======================

DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module that is aimed
at proactive and lightweight reclamation under light memory pressure.  It
doesn't aim to replace the LRU-list based page-granularity reclamation, but to
be selectively used for different levels of memory pressure and requirements.

Where Proactive Reclamation is Required?
========================================

On general memory over-committed systems, proactively reclaiming cold pages
helps saving memory and reducing latency spikes that are incurred by the direct
reclaim of the process or the CPU consumption of kswapd, while incurring only
minimal performance degradation [1]_ [2]_ .

Free Pages Reporting [3]_ based memory over-commit virtualization systems are
good examples of such cases.  In those systems, the guest VMs report their free
memory to the host, and the host reallocates the reported memory to other
guests.  As a result, the memory of the systems is fully utilized.  However,
the guests could be not so memory-frugal, mainly because some kernel subsystems
and user-space applications are designed to use as much memory as available.
Then, guests could report only a small amount of memory as free to the host,
which results in a memory utilization drop of the systems.  Running the
proactive reclamation in guests could mitigate this problem.

How It Works?
=============

DAMON_RECLAIM finds memory regions that have not been accessed for a specific
time duration and pages them out.  To avoid consuming too much CPU for the
paging out operation, a speed limit can be configured.  Under the speed limit,
it pages out the memory regions that have not been accessed for the longest
time first.  System administrators can also configure under what situation this
scheme should be automatically activated and deactivated with three memory
pressure watermarks.

Interface: Module Parameters
============================

To use this feature, you should first ensure your system is running on a kernel
that is built with ``CONFIG_DAMON_RECLAIM=y``.

To let sysadmins enable or disable it and tune it for the given system,
DAMON_RECLAIM utilizes module parameters.  That is, you can put
``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
proper values to ``/sys/module/damon_reclaim/parameters/<parameter>`` files.

Note that the parameter values except ``enabled`` are applied only when
DAMON_RECLAIM starts.  Therefore, if you want to apply new parameter values at
runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable
it via the ``enabled`` parameter file.  Writing the new values to the proper
parameter files should be done before the re-enablement.
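A sketch of that restart-to-apply workflow, using parameters documented below
(the value written is only an example)::

    # cd /sys/module/damon_reclaim/parameters
    # echo N > enabled                 # stop DAMON_RECLAIM
    # echo 60000000 > min_age          # write the new values first
    # echo Y > enabled                 # restart so the new values take effect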
Below are the descriptions of each parameter.

enabled
-------

Enable or disable DAMON_RECLAIM.

You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``.
Setting it as ``N`` disables DAMON_RECLAIM.  Note that DAMON_RECLAIM could do
no real monitoring and reclamation due to the watermarks-based activation
condition.  Refer to the descriptions of the watermark parameters below.

min_age
-------

Time threshold for cold memory regions identification in microseconds.

If a memory region is not accessed for this or a longer time, DAMON_RECLAIM
identifies the region as cold, and reclaims it.

120 seconds by default.

quota_ms
--------

Limit of time for the reclamation in milliseconds.

DAMON_RECLAIM tries to use only up to this time within a time window
(quota_reset_interval_ms) for trying reclamation of cold pages.  This can be
used for limiting CPU consumption of DAMON_RECLAIM.  If the value is zero, the
limit is disabled.

10 ms by default.

quota_sz
--------

Limit of size of memory for the reclamation in bytes.

DAMON_RECLAIM charges the amount of memory which it tried to reclaim within a
time window (quota_reset_interval_ms) and makes sure that no more than this
limit is tried.  This can be used for limiting consumption of CPU and IO.  If
this value is zero, the limit is disabled.

128 MiB by default.

quota_reset_interval_ms
-----------------------

The time/size quota charge reset interval in milliseconds.

The charge reset interval for the quota of time (quota_ms) and size
(quota_sz).  That is, DAMON_RECLAIM does not try reclamation for more than
quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
milliseconds.

1 second by default.

wmarks_interval
---------------

Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
enabled but inactive due to its watermarks rule.

wmarks_high
-----------

Free memory rate (per thousand) for the high watermark.

If the free memory of the system in bytes per thousand bytes is higher than
this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
check the watermarks.

wmarks_mid
----------

Free memory rate (per thousand) for the middle watermark.

If the free memory of the system in bytes per thousand bytes is between this
and the low watermark, DAMON_RECLAIM becomes active, so it starts the
monitoring and the reclaiming.

wmarks_low
----------

Free memory rate (per thousand) for the low watermark.

If the free memory of the system in bytes per thousand bytes is lower than
this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
check the watermarks.  In that case, the system falls back to the LRU-list
based page granularity reclamation logic.

sample_interval
---------------

Sampling interval for the monitoring in microseconds.

The sampling interval of DAMON for the cold memory monitoring.  Please refer to
the DAMON documentation (:doc:`usage`) for more detail.

aggr_interval
-------------

Aggregation interval for the monitoring in microseconds.

The aggregation interval of DAMON for the cold memory monitoring.  Please
refer to the DAMON documentation (:doc:`usage`) for more detail.

min_nr_regions
--------------

Minimum number of monitoring regions.

The minimal number of monitoring regions of DAMON for the cold memory
monitoring.  This can be used to set a lower bound on the monitoring quality.
But, setting this too high could result in increased monitoring overhead.
Please refer to the DAMON documentation (:doc:`usage`) for more detail.

max_nr_regions
--------------

Maximum number of monitoring regions.

The maximum number of monitoring regions of DAMON for the cold memory
monitoring.  This can be used to set an upper bound on the monitoring overhead.
However, setting this too low could result in bad monitoring quality.  Please
refer to the DAMON documentation (:doc:`usage`) for more detail.

monitor_region_start
--------------------

Start of the target memory region in physical address.

The start physical address of the memory region that DAMON_RECLAIM will do its
work against.  That is, DAMON_RECLAIM will find cold memory regions in this
region and reclaim them.  By default, the biggest System RAM region is used.

monitor_region_end
------------------

End of the target memory region in physical address.

The end physical address of the memory region that DAMON_RECLAIM will do its
work against.  That is, DAMON_RECLAIM will find cold memory regions in this
region and reclaim them.  By default, the biggest System RAM region is used.

kdamond_pid
-----------

PID of the DAMON thread.

If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.  Else,
-1.
Example
=======

The runtime example commands below make DAMON_RECLAIM find memory regions that
have not been accessed for 30 seconds or more and page them out.  The
reclamation is limited to be done only up to 1 GiB per second to avoid
DAMON_RECLAIM consuming too much CPU time for the paging out operation.  It
also asks DAMON_RECLAIM to do nothing if the system's free memory rate is more
than 50%, but to start the real work if it becomes lower than 40%.  If
DAMON_RECLAIM doesn't make progress and therefore the free memory rate becomes
lower than 20%, it asks DAMON_RECLAIM to do nothing again, so that we can fall
back to the LRU-list based page granularity reclamation. ::

    # cd /sys/module/damon_reclaim/parameters
    # echo 30000000 > min_age
    # echo $((1 * 1024 * 1024 * 1024)) > quota_sz
    # echo 1000 > quota_reset_interval_ms
    # echo 500 > wmarks_high
    # echo 400 > wmarks_mid
    # echo 200 > wmarks_low
    # echo Y > enabled

.. [1] https://research.google/pubs/pub48551/
.. [2] https://lwn.net/Articles/787611/
.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
@@ -6,39 +6,9 @@ Getting Started

This document briefly describes how you can use DAMON by demonstrating its
default user space tool.  Please note that this document describes only a part
of its features for brevity.  Please refer to :doc:`usage` for more details.


TL; DR
======

Follow the commands below to monitor and visualize the memory access pattern of
your workload. ::

    # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot
    # mount -t debugfs none /sys/kernel/debug/
    # git clone https://github.com/awslabs/damo
    # ./damo/damo record $(pidof <your workload>)
    # ./damo/damo report heat --plot_ascii

The final command draws the access heatmap of ``<your workload>``.  The heatmap
shows which memory region (x-axis) is accessed when (y-axis) and how frequently
(number; the higher the more accesses have been observed). ::

    111111111111111111111111111111111111111111111111111111110000
    111121111111111111111111111111211111111111111111111111110000
    000000000000000000000000000000000000000000000000001555552000
    000000000000000000000000000000000000000000000222223555552000
    000000000000000000000000000000000000000011111677775000000000
    000000000000000000000000000000000000000488888000000000000000
    000000000000000000000000000000000177888400000000000000000000
    000000000000000000000000000046666522222100000000000000000000
    000000000000000000000014444344444300000000000000000000000000
    000000000000000002222245555510000000000000000000000000000000
    # access_frequency:  0  1  2  3  4  5  6  7  8  9
    # x-axis: space (140286319947776-140286426374096: 101.496 MiB)
    # y-axis: time (605442256436361-605479951866441: 37.695430s)
    # resolution: 60x10 (1.692 MiB and 3.770s for each character)

of its features for brevity.  Please refer to the usage `doc
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_ of the tool for more
details.


Prerequisites
@@ -91,24 +61,74 @@ pattern in the ``damon.data`` file.

Visualizing Recorded Patterns
=============================

The following three commands visualize the recorded access patterns and save
the results as separate image files. ::
You can visualize the pattern in a heatmap, showing which memory region
(x-axis) got accessed when (y-axis) and how frequently (number).::

    $ damo report heats --heatmap access_pattern_heatmap.png
    $ damo report wss --range 0 101 1 --plot wss_dist.png
    $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png
    $ sudo damo report heats --heatmap stdout
    22222222222222222222222222222222222222211111111111111111111111111111111111111100
    44444444444444444444444444444444444444434444444444444444444444444444444444443200
    44444444444444444444444444444444444444433444444444444444444444444444444444444200
    33333333333333333333333333333333333333344555555555555555555555555555555555555200
    33333333333333333333333333333333333344444444444444444444444444444444444444444200
    22222222222222222222222222222222222223355555555555555555555555555555555555555200
    00000000000000000000000000000000000000288888888888888888888888888888888888888400
    00000000000000000000000000000000000000288888888888888888888888888888888888888400
    33333333333333333333333333333333333333355555555555555555555555555555555555555200
    88888888888888888888888888888888888888600000000000000000000000000000000000000000
    88888888888888888888888888888888888888600000000000000000000000000000000000000000
    33333333333333333333333333333333333333444444444444444444444444444444444444443200
    00000000000000000000000000000000000000288888888888888888888888888888888888888400
    [...]
    # access_frequency:  0  1  2  3  4  5  6  7  8  9
    # x-axis: space (139728247021568-139728453431248: 196.848 MiB)
    # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s)
    # resolution: 80x40 (2.461 MiB and 1.758 s for each character)

- ``access_pattern_heatmap.png`` will visualize the data access pattern in a
  heatmap, showing which memory region (y-axis) got accessed when (x-axis)
  and how frequently (color).
- ``wss_dist.png`` will show the distribution of the working set size.
- ``wss_chron_change.png`` will show how the working set size has
  chronologically changed.
You can also visualize the distribution of the working set size, sorted by the
size.::

You can view the visualizations of this example workload at [1]_.
Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_.
    $ sudo damo report wss --range 0 101 10
    # <percentile> <wss>
    # target_id     18446632103789443072
    # avr:  107.708 MiB
      0             0 B |                                                           |
     10      95.328 MiB |****************************                               |
     20      95.332 MiB |****************************                               |
     30      95.340 MiB |****************************                               |
     40      95.387 MiB |****************************                               |
     50      95.387 MiB |****************************                               |
     60      95.398 MiB |****************************                               |
     70      95.398 MiB |****************************                               |
     80      95.504 MiB |****************************                               |
     90     190.703 MiB |*********************************************************  |
    100     196.875 MiB |***********************************************************|

.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns
.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html
.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html
.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html
Using the ``--sortby`` option with the above command, you can show how the
working set size has chronologically changed.::

    $ sudo damo report wss --range 0 101 10 --sortby time
    # <percentile> <wss>
    # target_id     18446632103789443072
    # avr:  107.708 MiB
      0       3.051 MiB |                                                           |
     10     190.703 MiB |***********************************************************|
     20      95.336 MiB |*****************************                              |
     30      95.328 MiB |*****************************                              |
     40      95.387 MiB |*****************************                              |
     50      95.332 MiB |*****************************                              |
     60      95.320 MiB |*****************************                              |
     70      95.398 MiB |*****************************                              |
     80      95.398 MiB |*****************************                              |
     90      95.340 MiB |*****************************                              |
    100      95.398 MiB |*****************************                              |


Data Access Pattern Aware Memory Management
===========================================

The below three commands make every memory region of size >=4K that hasn't been
accessed for >=60 seconds in your workload be swapped out. ::

    $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme
    $ echo "4K        max      0       0       60s     max     pageout" >> test_scheme
    $ damo schemes -c test_scheme <pid of your workload>
@@ -10,15 +10,16 @@ DAMON provides below three interfaces for different users.
  This is for privileged people such as system administrators who want a
  just-working human-friendly interface.  Using this, users can use DAMON's
  major features in a human-friendly way.  It may not be highly tuned for
  special cases, though.  It supports only virtual address spaces monitoring.
  special cases, though.  It supports both virtual and physical address spaces
  monitoring.
- *debugfs interface.*
  This is for privileged user space programmers who want more optimized use of
  DAMON.  Using this, users can use DAMON's major features by reading
  from and writing to special debugfs files.  Therefore, you can write and use
  your personalized DAMON debugfs wrapper programs that read/write the
  debugfs files instead of you.  The DAMON user space tool is also a reference
  implementation of such programs.  It supports only virtual address spaces
  monitoring.
  implementation of such programs.  It supports both virtual and physical
  address spaces monitoring.
- *Kernel Space Programming Interface.*
  This is for kernel space programmers.  Using this, users can utilize every
  feature of DAMON most flexibly and efficiently by writing kernel space

@@ -34,8 +35,9 @@ the reason, this document describes only the debugfs interface

debugfs Interface
=================

DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under
its debugfs directory, ``<debugfs>/damon/``.
DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``,
``schemes`` and ``monitor_on`` under its debugfs directory,
``<debugfs>/damon/``.


Attributes
@@ -71,9 +73,106 @@ check it again::

    # cat target_ids
    42 4242

Users can also monitor the physical memory address space of the system by
writing a special keyword, "``paddr\n``", to the file.  Because physical address
space monitoring doesn't support multiple targets, reading the file will show a
fake value, ``42``, as below::

    # cd <debugfs>/damon
    # echo paddr > target_ids
    # cat target_ids
    42

Note that setting the target ids doesn't start the monitoring.


Initial Monitoring Target Regions
---------------------------------

In case of the virtual address space monitoring, DAMON automatically sets and
updates the monitoring target regions so that entire memory mappings of the
target processes can be covered.  However, users might want to limit the
monitoring region to specific address ranges, such as the heap, the stack, or
a specific file-mapped area.  Or, some users might know the initial access
pattern of their workloads and therefore want to set optimal initial regions
for the 'adaptive regions adjustment'.

In contrast, DAMON does not automatically set and update the monitoring target
regions in case of physical memory monitoring.  Therefore, users should set the
monitoring target regions by themselves.

In such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the ``init_regions`` file.  Each line
of the input should represent one region in the below form.::

    <target id> <start address> <end address>

The ``target id`` should already be in the ``target_ids`` file, and the regions
should be passed in address order.  For example, the below commands will set a
couple of address ranges, ``1-100`` and ``100-200``, as the initial monitoring
target regions of process 42, and another couple of address ranges, ``20-40``
and ``50-100``, as those of process 4242.::

    # cd <debugfs>/damon
    # echo "42   1       100
            42   100     200
            4242 20      40
            4242 50      100" > init_regions

Note that this sets the initial monitoring target regions only.  In case of
virtual memory monitoring, DAMON will automatically update the boundaries of
the regions after one ``regions update interval``.  Therefore, users should set
the ``regions update interval`` large enough in this case, if they don't want
the update.


Schemes
-------

For usual DAMON-based data access aware memory management optimizations, users
would simply want the system to apply a memory management action to a memory
region of a specific size having a specific access frequency for a specific
time.  DAMON receives such formalized operation schemes from the user and
applies those to the target processes.  It also counts the total number and
size of regions that each scheme is applied to.  These statistics can be used
for online analysis or tuning of the schemes.

Users can get and set the schemes by reading from and writing to the
``schemes`` debugfs file.  Reading the file also shows the statistics of each
scheme.  To the file, each of the schemes should be represented in each line in
the below form:

    min-size max-size min-acc max-acc min-age max-age action

Note that the ranges are closed intervals.  Bytes for the size of regions
(``min-size`` and ``max-size``), number of monitored accesses per aggregate
interval for access frequency (``min-acc`` and ``max-acc``), number of
aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a
predefined integer for memory management actions should be used.  The supported
numbers and their meanings are as below.

- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
- 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
- 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
- 5: Do nothing but count the statistics

You can disable schemes by simply writing an empty string to the file.  For
example, the below commands apply a scheme saying "If a memory region of size
in [4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for
aggregate interval in [10, 20], page out the region", check the entered scheme
again, and finally remove the scheme. ::

    # cd <debugfs>/damon
    # echo "4096 8192    0    5    10    20    2" > schemes
    # cat schemes
    4096 8192 0 5 10 20 2 0 0
    # echo > schemes

The last two integers in the fourth line of the above example are the total
number and the total size of the regions that the scheme has been applied to.


Turning On/Off
--------------
@@ -128,7 +128,9 @@ hugepages
	implicitly specifies the number of huge pages of default size to
	allocate.  If the number of huge pages of default size is implicitly
	specified, it can not be overwritten by a hugepagesz,hugepages
	parameter pair for the default size.
	parameter pair for the default size.  This parameter also has a
	node format.  The node format specifies the number of huge pages
	to allocate on specific nodes.

	For example, on an architecture with 2M default huge page size::

@@ -138,6 +140,14 @@ hugepages
	indicating that the hugepages=512 parameter is ignored.  If a hugepages
	parameter is preceded by an invalid hugepagesz parameter, it will
	be ignored.

	Node format example::

		hugepagesz=2M hugepages=0:1,1:2

	It will allocate 1 2M hugepage on node0 and 2 2M hugepages on node1.
	If the node number is invalid, the parameter will be ignored.

default_hugepagesz
	Specify the default huge page size.  This parameter can
	only be specified once on the command line.  default_hugepagesz can

@@ -234,8 +244,12 @@ will exist, of the form::

	hugepages-${size}kB

Inside each of these directories, the same set of files will exist::
Inside each of these directories, the set of files contained in ``/proc``
will exist.  In addition, two additional interfaces for demoting huge
pages may exist::

        demote
        demote_size
        nr_hugepages
        nr_hugepages_mempolicy
        nr_overcommit_hugepages

@@ -243,7 +257,29 @@ Inside each of these directories, the same set of files will exist::
        resv_hugepages
        surplus_hugepages

which function as described above for the default huge page-sized case.
The demote interfaces provide the ability to split a huge page into
smaller huge pages.  For example, the x86 architecture supports both
1GB and 2MB huge page sizes.  A 1GB huge page can be split into 512
2MB huge pages.  Demote interfaces are not available for the smallest
huge page size.  The demote interfaces are:

demote_size
        is the size of demoted pages.  When a page is demoted, a corresponding
        number of huge pages of demote_size will be created.  By default,
        demote_size is set to the next smaller huge page size.  If there are
        multiple smaller huge page sizes, demote_size can be set to any of
        these smaller sizes.  Only huge page sizes less than the current huge
        page size are allowed.

demote
        is used to demote a number of huge pages.  A user with root privileges
        can write to this file.  It may not be possible to demote the
        requested number of huge pages.  To determine how many pages were
        actually demoted, compare the value of nr_hugepages before and after
        writing to the demote interface.  demote is a write only interface.
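A sketch of the compare-before-and-after flow just described, assuming an x86
system that already has 1 GiB huge pages allocated (the paths follow the sysfs
layout above; the counts and the ``demote_size`` output are illustrative)::

        # cd /sys/kernel/mm/hugepages/hugepages-1048576kB
        # cat nr_hugepages
        2
        # cat demote_size               # defaults to the next smaller huge page size
        2048kB
        # echo 1 > demote               # request demotion of one 1 GiB page
        # cat nr_hugepages              # compare with the value read before
        1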
The interfaces which are the same as in ``/proc`` (all except demote and
demote_size) function as described above for the default huge page-sized case.

.. _mem_policy_and_hp_alloc:
@@ -37,5 +37,7 @@ the Linux memory management.

   numaperf
   pagemap
   soft-dirty
   swap_numa
   transhuge
   userfaultfd
   zswap
@@ -165,9 +165,8 @@ Or alternatively::

	% echo 1 > /sys/devices/system/memory/memoryXXX/online

The kernel will select the target zone automatically, usually defaulting to
``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel
command line or if the memory block would intersect the ZONE_MOVABLE already.
The kernel will select the target zone automatically, depending on the
configured ``online_policy``.

One can explicitly request to associate an offline memory block with
ZONE_MOVABLE by::

@@ -198,6 +197,9 @@ Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or

	% echo online > /sys/devices/system/memory/auto_online_blocks

Similarly to manual onlining, with ``online`` the kernel will select the
target zone automatically, depending on the configured ``online_policy``.

Modifying the auto-online behavior will only affect subsequently added
memory blocks.

@@ -393,11 +395,16 @@ command line parameters are relevant:

======================== =======================================================
``memhp_default_state``	 configure auto-onlining by essentially setting
                         ``/sys/devices/system/memory/auto_online_blocks``.
``movablecore``		 configure automatic zone selection of the kernel. When
                         set, the kernel will default to ZONE_MOVABLE, unless
                         other zones can be kept contiguous.
``movable_node``	 configure automatic zone selection in the kernel when
                         using the ``contig-zones`` online policy. When
                         set, the kernel will default to ZONE_MOVABLE when
                         onlining a memory block, unless other zones can be kept
                         contiguous.
======================== =======================================================

See Documentation/admin-guide/kernel-parameters.txt for a more generic
description of these command line parameters.

Module Parameters
------------------
@@ -410,24 +417,118 @@ them with ``memory_hotplug.`` such as::

and they can be observed (and some even modified at runtime) via::

	/sys/modules/memory_hotplug/parameters/
	/sys/module/memory_hotplug/parameters/

The following module parameters are currently defined:

======================== =======================================================
``memmap_on_memory``	 read-write: Allocate memory for the memmap from the
                         added memory block itself. Even if enabled, actual
                         support depends on various other system properties and
                         should only be regarded as a hint whether the behavior
                         would be desired.
================================ ===============================================
``memmap_on_memory``		 read-write: Allocate memory for the memmap from
				 the added memory block itself. Even if enabled,
				 actual support depends on various other system
				 properties and should only be regarded as a
				 hint whether the behavior would be desired.

                         While allocating the memmap from the memory block
                         itself makes memory hotplug less likely to fail and
                         keeps the memmap on the same NUMA node in any case, it
                         can fragment physical memory in a way that huge pages
                         in bigger granularity cannot be formed on hotplugged
                         memory.
======================== =======================================================
				 While allocating the memmap from the memory
				 block itself makes memory hotplug less likely
				 to fail and keeps the memmap on the same NUMA
				 node in any case, it can fragment physical
				 memory in a way that huge pages in bigger
				 granularity cannot be formed on hotplugged
				 memory.
``online_policy``		 read-write: Set the basic policy used for
				 automatic zone selection when onlining memory
				 blocks without specifying a target zone.
				 ``contig-zones`` has been the kernel default
				 before this parameter was added. After an
				 online policy was configured and memory was
				 online, the policy should not be changed
				 anymore.

				 When set to ``contig-zones``, the kernel will
				 try keeping zones contiguous. If a memory block
				 intersects multiple zones or no zone, the
				 behavior depends on the ``movable_node`` kernel
				 command line parameter: default to ZONE_MOVABLE
				 if set, default to the applicable kernel zone
				 (usually ZONE_NORMAL) if not set.

				 When set to ``auto-movable``, the kernel will
				 try onlining memory blocks to ZONE_MOVABLE if
				 possible according to the configuration and
				 memory device details. With this policy, one
				 can avoid zone imbalances when eventually
				 hotplugging a lot of memory later and still
				 wanting to be able to hotunplug as much as
				 possible reliably, very desirable in
				 virtualized environments. This policy ignores
				 the ``movable_node`` kernel command line
				 parameter and isn't really applicable in
				 environments that require it (e.g., bare metal
				 with hotunpluggable nodes) where hotplugged
				 memory might be exposed via the
				 firmware-provided memory map early during boot
				 to the system instead of getting detected,
				 added and onlined later during boot (such as
				 done by virtio-mem or by some hypervisors
				 implementing emulated DIMMs). As one example, a
				 hotplugged DIMM will be onlined either
				 completely to ZONE_MOVABLE or completely to
				 ZONE_NORMAL, not a mixture.
				 As another example, as many memory blocks
				 belonging to a virtio-mem device will be
				 onlined to ZONE_MOVABLE as possible,
				 special-casing units of memory blocks that can
				 only get hotunplugged together. *This policy
				 does not protect from setups that are
				 problematic with ZONE_MOVABLE and does not
				 change the zone of memory blocks dynamically
				 after they were onlined.*
``auto_movable_ratio``		 read-write: Set the maximum MOVABLE:KERNEL
				 memory ratio in % for the ``auto-movable``
				 online policy. Whether the ratio applies only
				 for the system across all NUMA nodes or also
				 per NUMA nodes depends on the
				 ``auto_movable_numa_aware`` configuration.

				 All accounting is based on present memory pages
				 in the zones combined with accounting per
				 memory device. Memory dedicated to the CMA
				 allocator is accounted as MOVABLE, although
				 residing on one of the kernel zones. The
				 possible ratio depends on the actual workload.
				 The kernel default is "301" %, for example,
				 allowing for hotplugging 24 GiB to a 8 GiB VM
				 and automatically onlining all hotplugged
				 memory to ZONE_MOVABLE in many setups. The
				 additional 1% deals with some pages being not
				 present, for example, because of some firmware
				 allocations.

				 Note that ZONE_NORMAL memory provided by one
				 memory device does not allow for more
				 ZONE_MOVABLE memory for a different memory
				 device. As one example, onlining memory of a
				 hotplugged DIMM to ZONE_NORMAL will not allow
				 for another hotplugged DIMM to get onlined to
				 ZONE_MOVABLE automatically. In contrast, memory
				 hotplugged by a virtio-mem device that got
				 onlined to ZONE_NORMAL will allow for more
				 ZONE_MOVABLE memory within *the same*
				 virtio-mem device.
``auto_movable_numa_aware``	 read-write: Configure whether the
				 ``auto_movable_ratio`` in the ``auto-movable``
				 online policy also applies per NUMA
				 node in addition to the whole system across all
				 NUMA nodes. The kernel default is "Y".

				 Disabling NUMA awareness can be helpful when
				 dealing with NUMA nodes that should be
				 completely hotunpluggable, onlining the memory
				 completely to ZONE_MOVABLE automatically if
				 possible.

				 Parameter availability depends on CONFIG_NUMA.
================================ ===============================================
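A short sketch of driving these module parameters at runtime, using the sysfs
path documented above (the policy and ratio values are examples only, not
recommendations)::

	# echo auto-movable > /sys/module/memory_hotplug/parameters/online_policy
	# echo 301 > /sys/module/memory_hotplug/parameters/auto_movable_ratio
	# echo online > /sys/devices/system/memory/auto_online_blocks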
ZONE_MOVABLE
============
@@ -90,13 +90,14 @@ Short descriptions to the page flags
====================================

0 - LOCKED
   page is being locked for exclusive access, e.g. by undergoing read/write IO
   The page is being locked for exclusive access, e.g. by undergoing read/write
   IO.
7 - SLAB
   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
   The page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator.
   When compound page is used, SLUB/SLQB will only set this flag on the head
   page; SLOB will not flag it at all.
10 - BUDDY
   a free memory block managed by the buddy system allocator
   A free memory block managed by the buddy system allocator.
   The buddy system organizes free memory in blocks of various orders.
   An order N block has 2^N physically contiguous pages, with the BUDDY flag
   set for and _only_ for the first page.

@@ -112,65 +113,65 @@ Short descriptions to the page flags
16 - COMPOUND_TAIL
   A compound page tail (see description above).
17 - HUGE
   this is an integral part of a HugeTLB page
   This is an integral part of a HugeTLB page.
19 - HWPOISON
   hardware detected memory corruption on this page: don't touch the data!
   Hardware detected memory corruption on this page: don't touch the data!
20 - NOPAGE
   no page frame exists at the requested address
   No page frame exists at the requested address.
21 - KSM
   identical memory pages dynamically shared between one or more processes
   Identical memory pages dynamically shared between one or more processes.
22 - THP
   contiguous pages which construct transparent hugepages
   Contiguous pages which construct transparent hugepages.
23 - OFFLINE
   page is logically offline
   The page is logically offline.
24 - ZERO_PAGE
   zero page for pfn_zero or huge_zero page
   Zero page for pfn_zero or huge_zero page.
25 - IDLE
   page has not been accessed since it was marked idle (see
   The page has not been accessed since it was marked idle (see
   :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
   Note that this flag may be stale in case the page was accessed via
   a PTE. To make sure the flag is up-to-date one has to read
   ``/sys/kernel/mm/page_idle/bitmap`` first.
26 - PGTABLE
   page is in use as a page table
   The page is in use as a page table.

IO related page flags
---------------------

1 - ERROR
   IO error occurred
   IO error occurred.
3 - UPTODATE
   page has up-to-date data
   The page has up-to-date data.
   ie. for file backed page: (in-memory data revision >= on-disk one)
4 - DIRTY
   page has been written to, hence contains new data
   The page has been written to, hence contains new data.
   i.e. for file backed page: (in-memory data revision > on-disk one)
8 - WRITEBACK
   page is being synced to disk
   The page is being synced to disk.

LRU related page flags
----------------------

5 - LRU
   page is in one of the LRU lists
   The page is in one of the LRU lists.
6 - ACTIVE
   page is in the active LRU list
   The page is in the active LRU list.
18 - UNEVICTABLE
   page is in the unevictable (non-)LRU list It is somehow pinned and
   The page is in the unevictable (non-)LRU list It is somehow pinned and
   not a candidate for LRU page reclaims, e.g. ramfs pages,
   shmctl(SHM_LOCK) and mlock() memory segments
   shmctl(SHM_LOCK) and mlock() memory segments.
2 - REFERENCED
   page has been referenced since last LRU list enqueue/requeue
   The page has been referenced since last LRU list enqueue/requeue.
9 - RECLAIM
   page will be reclaimed soon after its pageout IO completed
   The page will be reclaimed soon after its pageout IO completed.
11 - MMAP
   a memory mapped page
   A memory mapped page.
12 - ANON
   a memory mapped page that is not part of a file
   A memory mapped page that is not part of a file.
13 - SWAPCACHE
   page is mapped to swap space, i.e. has an associated swap entry
   The page is mapped to swap space, i.e. has an associated swap entry.
14 - SWAPBACKED
   page is backed by swap/RAM
   The page is backed by swap/RAM.

The page-types tool in the tools/vm directory can be used to query the
above flags.
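These flags can also be read programmatically.  A minimal C sketch, assuming
root privileges and a page frame number passed on the command line, reads the
64-bit flag word for that PFN from ``/proc/kpageflags`` (the bit numbers are
the ones listed above; everything else here is illustrative)::

    /* Read the kpageflags word of one PFN and test two of the bits above. */
    #include <fcntl.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
            uint64_t flags;
            int fd = open("/proc/kpageflags", O_RDONLY);

            /* each PFN has one little-endian 64-bit entry at offset pfn * 8 */
            if (fd < 0 || pread(fd, &flags, sizeof(flags),
                                pfn * sizeof(flags)) != sizeof(flags)) {
                    perror("kpageflags");
                    return 1;
            }
            printf("pfn %" PRIu64 ": flags %#" PRIx64 " (LRU=%d IDLE=%d)\n",
                   pfn, flags,
                   !!(flags & (1ULL << 5)), !!(flags & (1ULL << 25)));
            close(fd);
            return 0;
    }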
@@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify::

	unsigned long start_pfn;
	unsigned long nr_pages;
	int status_change_nid_normal;
	int status_change_nid_high;
	int status_change_nid;
   }

@@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify::

- nr_pages is # of pages of online/offline memory.
- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
  is (will be) set/clear, if this is -1, then nodemask status is not changed.
- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
  is (will be) set/clear, if this is -1, then nodemask status is not changed.
- status_change_nid is set node id when N_MEMORY of nodemask is (will be)
  set/clear. It means a new(memoryless) node gets new memory by online and a
  node loses all memory. If this is -1, then nodemask status is not changed.
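Putting the fields above together, a hedged sketch of a notifier that consumes
``struct memory_notify`` — the callback name and messages are made up for
illustration, while ``register_memory_notifier()`` and the ``MEM_*`` actions
are the existing kernel interface::

    #include <linux/memory.h>
    #include <linux/notifier.h>
    #include <linux/printk.h>

    static int example_mem_callback(struct notifier_block *nb,
                                    unsigned long action, void *data)
    {
            struct memory_notify *mn = data;

            switch (action) {
            case MEM_GOING_ONLINE:
                    pr_info("onlining %lu pages from PFN %lu (nid change: %d)\n",
                            mn->nr_pages, mn->start_pfn, mn->status_change_nid);
                    break;
            case MEM_OFFLINE:
                    pr_info("offlined %lu pages from PFN %lu\n",
                            mn->nr_pages, mn->start_pfn);
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block example_mem_nb = {
            .notifier_call = example_mem_callback,
    };

    /* somewhere in module init: register_memory_notifier(&example_mem_nb); */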
|
@ -231,10 +231,14 @@ Guarded allocations are set up based on the sample interval. After expiration
|
||||
of the sample interval, the next allocation through the main allocator (SLAB or
|
||||
SLUB) returns a guarded allocation from the KFENCE object pool (allocation
|
||||
sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and
|
||||
the next allocation is set up after the expiration of the interval. To "gate" a
|
||||
KFENCE allocation through the main allocator's fast-path without overhead,
|
||||
KFENCE relies on static branches via the static keys infrastructure. The static
|
||||
branch is toggled to redirect the allocation to KFENCE.
|
||||
the next allocation is set up after the expiration of the interval.
|
||||
|
||||
When using ``CONFIG_KFENCE_STATIC_KEYS=y``, KFENCE allocations are "gated"
|
||||
through the main allocator's fast-path by relying on static branches via the
|
||||
static keys infrastructure. The static branch is toggled to redirect the
|
||||
allocation to KFENCE. Depending on sample interval, target workloads, and
|
||||
system architecture, this may perform better than the simple dynamic branch.
|
||||
Careful benchmarking is recommended.
|
||||
|
||||
KFENCE objects each reside on a dedicated page, at either the left or right
|
||||
page boundaries selected at random. The pages to the left and right of the
|
||||
@ -269,6 +273,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
|
||||
first, and the chances of detecting use-after-frees of recently freed objects
|
||||
is increased.
|
||||
|
||||
If pool utilization reaches 75% (default) or above, to reduce the risk of the
|
||||
pool eventually being fully occupied by allocated objects yet ensure diverse
|
||||
coverage of allocations, KFENCE limits currently covered allocations of the
|
||||
same source from further filling up the pool. The "source" of an allocation is
|
||||
based on its partial allocation stack trace. A side-effect is that this also
|
||||
limits frequent long-lived allocations (e.g. pagecache) of the same source
|
||||
filling up the pool permanently, which is the most common risk for the pool
|
||||
becoming full and the sampled allocation rate dropping to zero. The threshold
|
||||
at which to start limiting currently covered allocations can be configured via
|
||||
the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
|
||||
|
||||
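For instance, a possible boot command line combining the sampling and
covered-allocations knobs discussed above (both values are illustrative, not
recommendations)::

	kfence.sample_interval=100 kfence.skip_covered_thresh=75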
Interface
---------
@@ -63,7 +63,6 @@ a pointer to the memory_notify structure::

	unsigned long start_pfn;
	unsigned long nr_pages;
	int status_change_nid_normal;
	int status_change_nid_high;
	int status_change_nid;
   }

@@ -74,9 +73,6 @@ a pointer to the memory_notify structure::

- status_change_nid_normal is the node id set when N_NORMAL_MEMORY of the
  nodemask is set/cleared; if this is -1, the nodemask status is not changed.

- status_change_nid_high is the node id set when N_HIGH_MEMORY of the nodemask
  is set/cleared; if this value is -1, the nodemask status is not changed.

- status_change_nid is the node id set when N_MEMORY of the nodemask is (will
  be) set/cleared.  It means a new (memoryless) node gains new memory by being
  onlined, or a node loses all of its memory.  If this value is -1, the
  nodemask status is not changed.
@@ -35,13 +35,17 @@ two parts:

1. Identification of the monitoring target address range for the address space.
2. Access check of specific address range in the target space.

DAMON currently provides the implementation of the primitives for only the
virtual address spaces. Below two subsections describe how it works.
DAMON currently provides the implementations of the primitives for the physical
and virtual address spaces.  The below two subsections describe how those work.


VMA-based Target Address Range Construction
-------------------------------------------

This is only for the virtual address space primitives implementation.  The one
for the physical address space simply asks users to manually set the monitoring
target address ranges.

Only small parts in the super-huge virtual address space of the processes are
mapped to the physical memory and accessed. Thus, tracking the unmapped
address regions is just wasteful. However, because DAMON can deal with some

@@ -71,15 +75,18 @@ to make a reasonable trade-off. Below shows this in detail::

PTE Accessed-bit Based Access Check
-----------------------------------

The implementation for the virtual address space uses PTE Accessed-bit for
basic access checks. It finds the relevant PTE Accessed bit from the address
by walking the page table for the target task of the address. In this way, the
implementation finds and clears the bit for next sampling target address and
checks whether the bit set again after one sampling period. This could disturb
other kernel subsystems using the Accessed bits, namely Idle page tracking and
the reclaim logic. To avoid such disturbances, DAMON makes it mutually
exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page
flags to solve the conflict with the reclaim logic, as Idle page tracking does.
Both of the implementations for the physical and virtual address spaces use the
PTE Accessed-bit for basic access checks.  The only difference is the way of
finding the relevant PTE Accessed bit(s) from the address.  While the
implementation for the virtual address space walks the page table for the
target task of the address, the implementation for the physical address space
walks every page table having a mapping to the address.  In this way, the
implementations find and clear the bit(s) for the next sampling target address
and check whether the bit(s) are set again after one sampling period.  This
could disturb other kernel subsystems using the Accessed bits, namely Idle page
tracking and the reclaim logic.  To avoid such disturbances, DAMON makes it
mutually exclusive with Idle page tracking and uses the ``PG_idle`` and
``PG_young`` page flags to solve the conflict with the reclaim logic, as Idle
page tracking does.


Address Space Independent Core Mechanisms
@@ -36,10 +36,9 @@ constructions and actual access checks can be implemented and configured on the
DAMON core by the users. In this way, DAMON users can monitor any address
space with any access check technique.

Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based
Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
implementations of the address space dependent functions for the virtual memory
by default, for a reference and convenient use. In near future, we will
provide those for physical memory address space.
and the physical memory by default, for a reference and convenient use.


Can I simply monitor page granularity?
@@ -27,4 +27,3 @@ workloads and systems.

   faq
   design
   api
   plans
@@ -3,27 +3,11 @@ Linux Memory Management Documentation
=====================================

This is a collection of documents about the Linux memory management (mm)
subsystem. If you are looking for advice on simply allocating memory,
see the :ref:`memory_allocation`.

User guides for MM features
===========================

The following documents provide guides for controlling and tuning
various features of the Linux memory management

.. toctree::
   :maxdepth: 1

   swap_numa
   zswap

Kernel developers MM documentation
==================================

The below documents describe MM internals with different level of
details ranging from notes and mailing list responses to elaborate
descriptions of data structures and algorithms.
subsystem internals with different level of details ranging from notes and
mailing list responses for elaborating descriptions of data structures and
algorithms. If you are looking for advice on simply allocating memory, see the
:ref:`memory_allocation`. For controlling and tuning guides, see the
:doc:`admin guide <../admin-guide/mm/index>`.

.. toctree::
   :maxdepth: 1
@@ -85,5 +85,26 @@ Usage

	cat /sys/kernel/debug/page_owner > page_owner_full.txt
	./page_owner_sort page_owner_full.txt sorted_page_owner.txt

   The general output of ``page_owner_full.txt`` is as follows::

	Page allocated via order XXX, ...
	PFN XXX ...
	// Detailed stack

	Page allocated via order XXX, ...
	PFN XXX ...
	// Detailed stack

   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
   in buf, uses regexp to extract the page order value, counts the times
   and pages of buf, and finally sorts them according to the times.

   See the result about who allocated each page
   in the ``sorted_page_owner.txt``.
   in the ``sorted_page_owner.txt``. General output::

	XXX times, XXX pages:
	Page allocated via order XXX, ...
	// Detailed stack

   By default, ``page_owner_sort`` is sorted according to the times of buf.
   If you want to sort by the pages nums of buf, use the ``-m`` parameter.
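For example, a sketch of sorting by page count instead of by number of
occurrences, using the ``-m`` parameter and the file names from the commands
above::

	./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt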
@ -5220,7 +5220,7 @@ F:	net/ax25/ax25_timer.c
F:	net/ax25/sysctl_net_ax25.c

DATA ACCESS MONITOR
M:	SeongJae Park <sjpark@amazon.de>
M:	SeongJae Park <sj@kernel.org>
L:	linux-mm@kvack.org
S:	Maintained
F:	Documentation/admin-guide/mm/damon/
Makefile
@ -1011,6 +1011,21 @@ ifdef CONFIG_CC_IS_GCC
KBUILD_CFLAGS += -Wno-maybe-uninitialized
endif

ifdef CONFIG_CC_IS_GCC
# The allocators already balk at large sizes, so silence the compiler
# warnings for bounds checks involving those possible values. While
# -Wno-alloc-size-larger-than would normally be used here, earlier versions
# of gcc (<9.1) weirdly don't handle the option correctly when _other_
# warnings are produced (?!). Using -Walloc-size-larger-than=SIZE_MAX
# doesn't work (as it is documented to), silently resolving to "0" prior to
# version 9.1 (and producing an error more recently). Numeric values larger
# than PTRDIFF_MAX also don't work prior to version 9.1, which are silently
# ignored, continuing to default to PTRDIFF_MAX. So, left with no other
# choice, we must perform a versioned check to disable this warning.
# https://lore.kernel.org/lkml/20210824115859.187f272f@canb.auug.org.au
KBUILD_CFLAGS += $(call cc-ifversion, -ge, 0901, -Wno-alloc-size-larger-than)
endif

# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += -fno-strict-overflow

@ -233,7 +233,7 @@ albacore_init_arch(void)
		unsigned long size;

		size = initrd_end - initrd_start;
		memblock_free(__pa(initrd_start), PAGE_ALIGN(size));
		memblock_free((void *)initrd_start, PAGE_ALIGN(size));
		if (!move_initrd(pci_mem))
			printk("irongate_init_arch: initrd too big "
			       "(%ldK)\ndisabling initrd\n",
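The change above is the substitution repeated throughout the rest of this series: memblock_free() now takes a virtual address, while the old free-by-physical-address behaviour is kept under the new name memblock_phys_free(). A minimal sketch of the intended pairing (illustrative only; the helper function is made up):

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mm.h>

/*
 * Sketch of the renamed API: memblock_free() pairs with memblock_alloc()
 * (virtual addresses), memblock_phys_free() pairs with memblock_phys_alloc()
 * (physical addresses).
 */
static void __init example_boot_scratch(void)
{
	void *buf = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	phys_addr_t phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

	if (buf)
		memblock_free(buf, PAGE_SIZE);		/* virtual address */
	if (phys)
		memblock_phys_free(phys, PAGE_SIZE);	/* physical address */
}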
@ -59,13 +59,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)

		low_mem_sz = size;
		in_use = 1;
		memblock_add_node(base, size, 0);
		memblock_add_node(base, size, 0, MEMBLOCK_NONE);
	} else {
#ifdef CONFIG_HIGHMEM
		high_mem_start = base;
		high_mem_sz = size;
		in_use = 1;
		memblock_add_node(base, size, 1);
		memblock_add_node(base, size, 1, MEMBLOCK_NONE);
		memblock_reserve(base, size);
#endif
	}
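Likewise, memblock_add_node() gains a flags argument here; passing MEMBLOCK_NONE keeps the previous behaviour. A small sketch under that assumption (the wrapper below is made up for illustration):

#include <linux/init.h>
#include <linux/memblock.h>

/*
 * Sketch only: registering an early-detected RAM bank on a NUMA node with
 * the extended memblock_add_node() signature.  MEMBLOCK_NONE requests no
 * special region flags, i.e. the pre-series behaviour.
 */
static void __init example_register_bank(phys_addr_t base, phys_addr_t size,
					 int nid)
{
	memblock_add_node(base, size, nid, MEMBLOCK_NONE);
}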
@ -173,7 +173,7 @@ static void __init highmem_init(void)
#ifdef CONFIG_HIGHMEM
	unsigned long tmp;

	memblock_free(high_mem_start, high_mem_sz);
	memblock_phys_free(high_mem_start, high_mem_sz);
	for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++)
		free_highmem_page(pfn_to_page(tmp));
#endif
|
@ -339,7 +339,7 @@ err_fabric:
|
||||
err_sysctrl:
|
||||
iounmap(relocation);
|
||||
err_reloc:
|
||||
memblock_free(hip04_boot_method[0], hip04_boot_method[1]);
|
||||
memblock_phys_free(hip04_boot_method[0], hip04_boot_method[1]);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
@ -158,7 +158,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
|
||||
panic("Failed to steal %pa bytes at %pS\n",
|
||||
&size, (void *)_RET_IP_);
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
|
||||
return phys;
|
||||
|
@ -1163,6 +1163,10 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
|
||||
def_bool y
|
||||
depends on NUMA
|
||||
|
||||
config NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||
def_bool y
|
||||
depends on NUMA
|
||||
|
||||
source "kernel/Kconfig.hz"
|
||||
|
||||
config ARCH_SPARSEMEM_ENABLE
|
||||
|
@ -287,6 +287,22 @@ static void __init kasan_init_depth(void)
|
||||
init_task.kasan_depth = 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_KASAN_VMALLOC
|
||||
void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size)
|
||||
{
|
||||
unsigned long shadow_start, shadow_end;
|
||||
|
||||
if (!is_vmalloc_or_module_addr(start))
|
||||
return;
|
||||
|
||||
shadow_start = (unsigned long)kasan_mem_to_shadow(start);
|
||||
shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
|
||||
shadow_end = (unsigned long)kasan_mem_to_shadow(start + size);
|
||||
shadow_end = ALIGN(shadow_end, PAGE_SIZE);
|
||||
kasan_map_populate(shadow_start, shadow_end, NUMA_NO_NODE);
|
||||
}
|
||||
#endif
|
||||
|
||||
void __init kasan_init(void)
|
||||
{
|
||||
kasan_init_shadow();
|
||||
|
@ -738,8 +738,8 @@ void __init paging_init(void)
|
||||
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
|
||||
init_mm.pgd = swapper_pg_dir;
|
||||
|
||||
memblock_free(__pa_symbol(init_pg_dir),
|
||||
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
|
||||
memblock_phys_free(__pa_symbol(init_pg_dir),
|
||||
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
|
||||
|
||||
memblock_allow_resize();
|
||||
}
|
||||
|
@ -153,7 +153,7 @@ find_memory (void)
|
||||
efi_memmap_walk(find_max_min_low_pfn, NULL);
|
||||
max_pfn = max_low_pfn;
|
||||
|
||||
memblock_add_node(0, PFN_PHYS(max_low_pfn), 0);
|
||||
memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE);
|
||||
|
||||
find_initrd();
|
||||
|
||||
|
@ -378,7 +378,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid)
|
||||
#endif
|
||||
|
||||
if (start < end)
|
||||
memblock_add_node(__pa(start), end - start, nid);
|
||||
memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -174,7 +174,8 @@ void __init cf_bootmem_alloc(void)
|
||||
m68k_memory[0].addr = _rambase;
|
||||
m68k_memory[0].size = _ramend - _rambase;
|
||||
|
||||
memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
|
||||
memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0,
|
||||
MEMBLOCK_NONE);
|
||||
|
||||
/* compute total pages in system */
|
||||
num_pages = PFN_DOWN(_ramend - _rambase);
|
||||
|
@ -410,7 +410,8 @@ void __init paging_init(void)
|
||||
|
||||
min_addr = m68k_memory[0].addr;
|
||||
max_addr = min_addr + m68k_memory[0].size;
|
||||
memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
|
||||
memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0,
|
||||
MEMBLOCK_NONE);
|
||||
for (i = 1; i < m68k_num_memory;) {
|
||||
if (m68k_memory[i].addr < min_addr) {
|
||||
printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n",
|
||||
@ -421,7 +422,8 @@ void __init paging_init(void)
|
||||
(m68k_num_memory - i) * sizeof(struct m68k_mem_info));
|
||||
continue;
|
||||
}
|
||||
memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i);
|
||||
memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i,
|
||||
MEMBLOCK_NONE);
|
||||
addr = m68k_memory[i].addr + m68k_memory[i].size;
|
||||
if (addr > max_addr)
|
||||
max_addr = addr;
|
||||
|
@ -77,7 +77,9 @@ void __init szmem(unsigned int node)
|
||||
(u32)node_id, mem_type, mem_start, mem_size);
|
||||
pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
|
||||
start_pfn, end_pfn, num_physpages);
|
||||
memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(node_psize), node);
|
||||
memblock_add_node(PFN_PHYS(start_pfn),
|
||||
PFN_PHYS(node_psize), node,
|
||||
MEMBLOCK_NONE);
|
||||
break;
|
||||
case SYSTEM_RAM_RESERVED:
|
||||
pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n",
|
||||
|
@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
|
||||
|
||||
static void __init pcpu_fc_free(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free_early(__pa(ptr), size);
|
||||
memblock_free(ptr, size);
|
||||
}
|
||||
|
||||
void __init setup_per_cpu_areas(void)
|
||||
|
@ -341,7 +341,8 @@ static void __init szmem(void)
|
||||
continue;
|
||||
}
|
||||
memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)),
|
||||
PFN_PHYS(slot_psize), node);
|
||||
PFN_PHYS(slot_psize), node,
|
||||
MEMBLOCK_NONE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -69,10 +69,10 @@ static void __init ip30_mem_init(void)
|
||||
total_mem += size;
|
||||
|
||||
if (addr >= IP30_REAL_MEMORY_START)
|
||||
memblock_free(addr, size);
|
||||
memblock_phys_free(addr, size);
|
||||
else if ((addr + size) > IP30_REAL_MEMORY_START)
|
||||
memblock_free(IP30_REAL_MEMORY_START,
|
||||
size - IP30_MAX_PROM_MEMORY);
|
||||
memblock_phys_free(IP30_REAL_MEMORY_START,
|
||||
size - IP30_MAX_PROM_MEMORY);
|
||||
}
|
||||
pr_info("Detected %luMB of physical memory.\n", MEM_SHIFT(total_mem));
|
||||
}
|
||||
|
@ -274,7 +274,6 @@ CONFIG_NLS_UTF8=y
|
||||
CONFIG_ENCRYPTED_KEYS=y
|
||||
CONFIG_SECURITY=y
|
||||
CONFIG_HARDENED_USERCOPY=y
|
||||
# CONFIG_HARDENED_USERCOPY_FALLBACK is not set
|
||||
CONFIG_HARDENED_USERCOPY_PAGESPAN=y
|
||||
CONFIG_FORTIFY_SOURCE=y
|
||||
CONFIG_SECURITY_LOCKDOWN_LSM=y
|
||||
|
@ -31,7 +31,7 @@ struct machdep_calls {
|
||||
#ifdef CONFIG_PM
|
||||
void (*iommu_restore)(void);
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
unsigned long (*memory_block_size)(void);
|
||||
#endif
|
||||
#endif /* CONFIG_PPC64 */
|
||||
|
@ -6,21 +6,8 @@
|
||||
#include <linux/elf.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
|
||||
|
||||
#include <asm-generic/sections.h>
|
||||
|
||||
extern bool init_mem_is_free;
|
||||
|
||||
static inline int arch_is_kernel_initmem_freed(unsigned long addr)
|
||||
{
|
||||
if (!init_mem_is_free)
|
||||
return 0;
|
||||
|
||||
return addr >= (unsigned long)__init_begin &&
|
||||
addr < (unsigned long)__init_end;
|
||||
}
|
||||
|
||||
extern char __head_end[];
|
||||
|
||||
#ifdef __powerpc64__
|
||||
|
@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
|
||||
|
||||
cpufeatures_setup_finished();
|
||||
|
||||
memblock_free(__pa(dt_cpu_features),
|
||||
sizeof(struct dt_cpu_feature)*nr_dt_cpu_features);
|
||||
memblock_free(dt_cpu_features,
|
||||
sizeof(struct dt_cpu_feature) * nr_dt_cpu_features);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -322,8 +322,8 @@ void __init free_unused_pacas(void)
|
||||
|
||||
new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
|
||||
if (new_ptrs_size < paca_ptrs_size)
|
||||
memblock_free(__pa(paca_ptrs) + new_ptrs_size,
|
||||
paca_ptrs_size - new_ptrs_size);
|
||||
memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size,
|
||||
paca_ptrs_size - new_ptrs_size);
|
||||
|
||||
paca_nr_cpu_ids = nr_cpu_ids;
|
||||
paca_ptrs_size = new_ptrs_size;
|
||||
@ -331,8 +331,8 @@ void __init free_unused_pacas(void)
|
||||
#ifdef CONFIG_PPC_BOOK3S_64
|
||||
if (early_radix_enabled()) {
|
||||
/* Ugly fixup, see new_slb_shadow() */
|
||||
memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
|
||||
sizeof(struct slb_shadow));
|
||||
memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
|
||||
sizeof(struct slb_shadow));
|
||||
paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL;
|
||||
}
|
||||
#endif
|
||||
|
@ -822,7 +822,7 @@ static void __init smp_setup_pacas(void)
|
||||
set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]);
|
||||
}
|
||||
|
||||
memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32));
|
||||
memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32));
|
||||
cpu_to_phys_id = NULL;
|
||||
}
|
||||
#endif
|
||||
|
@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
|
||||
|
||||
static void __init pcpu_free_bootmem(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free(__pa(ptr), size);
|
||||
memblock_free(ptr, size);
|
||||
}
|
||||
|
||||
static int pcpu_cpu_distance(unsigned int from, unsigned int to)
|
||||
@ -912,7 +912,7 @@ void __init setup_per_cpu_areas(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
unsigned long memory_block_size_bytes(void)
|
||||
{
|
||||
if (ppc_md.memory_block_size)
|
||||
|
@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
|
||||
m->hstate = hstate;
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool __init hugetlb_node_alloc_supported(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
int __init alloc_bootmem_huge_page(struct hstate *h)
|
||||
int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
|
||||
{
|
||||
|
||||
#ifdef CONFIG_PPC_BOOK3S_64
|
||||
if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
|
||||
return pseries_alloc_bootmem_huge_page(h);
|
||||
#endif
|
||||
return __alloc_bootmem_huge_page(h);
|
||||
return __alloc_bootmem_huge_page(h, nid);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_PPC_BOOK3S_64
|
||||
|
@ -2981,7 +2981,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
|
||||
if (!phb->hose) {
|
||||
pr_err(" Can't allocate PCI controller for %pOF\n",
|
||||
np);
|
||||
memblock_free(__pa(phb), sizeof(struct pnv_phb));
|
||||
memblock_free(phb, sizeof(struct pnv_phb));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -440,7 +440,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
|
||||
}
|
||||
#endif /* CONFIG_KEXEC_CORE */
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static unsigned long pnv_memory_block_size(void)
|
||||
{
|
||||
/*
|
||||
@ -553,7 +553,7 @@ define_machine(powernv) {
|
||||
#ifdef CONFIG_KEXEC_CORE
|
||||
.kexec_cpu_down = pnv_kexec_cpu_down,
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
.memory_block_size = pnv_memory_block_size,
|
||||
#endif
|
||||
};
|
||||
|
@ -1088,7 +1088,7 @@ define_machine(pseries) {
|
||||
.machine_kexec = pSeries_machine_kexec,
|
||||
.kexec_cpu_down = pseries_kexec_cpu_down,
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
.memory_block_size = pseries_memory_block_size,
|
||||
#endif
|
||||
};
|
||||
|
@ -57,8 +57,7 @@ void __init svm_swiotlb_init(void)
|
||||
return;
|
||||
|
||||
|
||||
memblock_free_early(__pa(vstart),
|
||||
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
|
||||
memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
|
||||
panic("SVM: Cannot allocate SWIOTLB buffer");
|
||||
}
|
||||
|
||||
|
@ -230,13 +230,13 @@ static void __init init_resources(void)
|
||||
|
||||
/* Clean-up any unused pre-allocated resources */
|
||||
if (res_idx >= 0)
|
||||
memblock_free(__pa(mem_res), (res_idx + 1) * sizeof(*mem_res));
|
||||
memblock_free(mem_res, (res_idx + 1) * sizeof(*mem_res));
|
||||
return;
|
||||
|
||||
error:
|
||||
/* Better an empty resource tree than an inconsistent one */
|
||||
release_child_resources(&iomem_resource);
|
||||
memblock_free(__pa(mem_res), mem_res_sz);
|
||||
memblock_free(mem_res, mem_res_sz);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2,20 +2,8 @@
|
||||
#ifndef _S390_SECTIONS_H
|
||||
#define _S390_SECTIONS_H
|
||||
|
||||
#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
|
||||
|
||||
#include <asm-generic/sections.h>
|
||||
|
||||
extern bool initmem_freed;
|
||||
|
||||
static inline int arch_is_kernel_initmem_freed(unsigned long addr)
|
||||
{
|
||||
if (!initmem_freed)
|
||||
return 0;
|
||||
return addr >= (unsigned long)__init_begin &&
|
||||
addr < (unsigned long)__init_end;
|
||||
}
|
||||
|
||||
/*
|
||||
* .boot.data section contains variables "shared" between the decompressor and
|
||||
* the decompressed kernel. The decompressor will store values in them, and
|
||||
|
@ -593,7 +593,8 @@ static void __init setup_resources(void)
|
||||
* part of the System RAM resource.
|
||||
*/
|
||||
if (crashk_res.end) {
|
||||
memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
|
||||
memblock_add_node(crashk_res.start, resource_size(&crashk_res),
|
||||
0, MEMBLOCK_NONE);
|
||||
memblock_reserve(crashk_res.start, resource_size(&crashk_res));
|
||||
insert_resource(&iomem_resource, &crashk_res);
|
||||
}
|
||||
@ -693,7 +694,7 @@ static void __init reserve_crashkernel(void)
|
||||
}
|
||||
|
||||
if (register_memory_notifier(&kdump_mem_nb)) {
|
||||
memblock_free(crash_base, crash_size);
|
||||
memblock_phys_free(crash_base, crash_size);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -748,7 +749,7 @@ static void __init free_mem_detect_info(void)
|
||||
|
||||
get_mem_detect_reserved(&start, &size);
|
||||
if (size)
|
||||
memblock_free(start, size);
|
||||
memblock_phys_free(start, size);
|
||||
}
|
||||
|
||||
static const char * __init get_mem_info_source(void)
|
||||
@ -793,7 +794,7 @@ static void __init check_initrd(void)
|
||||
if (initrd_data.start && initrd_data.size &&
|
||||
!memblock_is_region_memory(initrd_data.start, initrd_data.size)) {
|
||||
pr_err("The initial RAM disk does not fit into the memory\n");
|
||||
memblock_free(initrd_data.start, initrd_data.size);
|
||||
memblock_phys_free(initrd_data.start, initrd_data.size);
|
||||
initrd_start = initrd_end = 0;
|
||||
}
|
||||
#endif
|
||||
@ -890,7 +891,7 @@ static void __init setup_randomness(void)
|
||||
|
||||
if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
|
||||
add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
|
||||
memblock_free((unsigned long) vmms, PAGE_SIZE);
|
||||
memblock_phys_free((unsigned long)vmms, PAGE_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void)
|
||||
/* Get the CPU registers */
|
||||
smp_save_cpu_regs(sa, addr, is_boot_cpu, page);
|
||||
}
|
||||
memblock_free(page, PAGE_SIZE);
|
||||
memblock_phys_free(page, PAGE_SIZE);
|
||||
diag_amode31_ops.diag308_reset();
|
||||
pcpu_set_smt(0);
|
||||
}
|
||||
@ -880,7 +880,7 @@ void __init smp_detect_cpus(void)
|
||||
|
||||
/* Add CPUs present at boot */
|
||||
__smp_rescan_cpus(info, true);
|
||||
memblock_free_early((unsigned long)info, sizeof(*info));
|
||||
memblock_phys_free((unsigned long)info, sizeof(*info));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -64,7 +64,7 @@ void __init setup_uv(void)
|
||||
}
|
||||
|
||||
if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) {
|
||||
memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
|
||||
memblock_phys_free(uv_stor_base, uv_info.uv_base_stor_len);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
|
@ -58,8 +58,6 @@ unsigned long empty_zero_page, zero_page_mask;
|
||||
EXPORT_SYMBOL(empty_zero_page);
|
||||
EXPORT_SYMBOL(zero_page_mask);
|
||||
|
||||
bool initmem_freed;
|
||||
|
||||
static void __init setup_zero_pages(void)
|
||||
{
|
||||
unsigned int order;
|
||||
@ -214,7 +212,6 @@ void __init mem_init(void)
|
||||
|
||||
void free_initmem(void)
|
||||
{
|
||||
initmem_freed = true;
|
||||
__set_memory((unsigned long)_sinittext,
|
||||
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
|
||||
SET_MEMORY_RW | SET_MEMORY_NX);
|
||||
|
@ -399,5 +399,5 @@ void __init kasan_copy_shadow_mapping(void)
|
||||
|
||||
void __init kasan_free_early_identity(void)
|
||||
{
|
||||
memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
|
||||
memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
|
||||
}
|
||||
|
@ -560,7 +560,7 @@ static void __init ap325rxa_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
|
||||
ceu_dma_membase = phys;
|
||||
|
@ -1502,7 +1502,7 @@ static void __init ecovec_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU0 memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
ceu0_dma_membase = phys;
|
||||
|
||||
@ -1510,7 +1510,7 @@ static void __init ecovec_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU1 memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
ceu1_dma_membase = phys;
|
||||
}
|
||||
|
@ -633,7 +633,7 @@ static void __init kfr2r09_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
|
||||
ceu_dma_membase = phys;
|
||||
|
@ -633,7 +633,7 @@ static void __init migor_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
|
||||
ceu_dma_membase = phys;
|
||||
|
@ -966,7 +966,7 @@ static void __init ms7724se_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU0 memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
ceu0_dma_membase = phys;
|
||||
|
||||
@ -974,7 +974,7 @@ static void __init ms7724se_mv_mem_reserve(void)
|
||||
if (!phys)
|
||||
panic("Failed to allocate CEU1 memory\n");
|
||||
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
memblock_remove(phys, size);
|
||||
ceu1_dma_membase = phys;
|
||||
}
|
||||
|
@ -1567,7 +1567,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
|
||||
|
||||
static void __init pcpu_free_bootmem(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free(__pa(ptr), size);
|
||||
memblock_free(ptr, size);
|
||||
}
|
||||
|
||||
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
|
||||
|
@ -47,7 +47,7 @@ void __init mem_init(void)
|
||||
*/
|
||||
brk_end = (unsigned long) UML_ROUND_UP(sbrk(0));
|
||||
map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
|
||||
memblock_free(__pa(brk_end), uml_reserved - brk_end);
|
||||
memblock_free((void *)brk_end, uml_reserved - brk_end);
|
||||
uml_reserved = brk_end;
|
||||
|
||||
/* this will put all low memory onto the freelists */
|
||||
|
@ -63,7 +63,7 @@ config X86
|
||||
select ARCH_CLOCKSOURCE_INIT
|
||||
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
|
||||
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
|
||||
select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
|
||||
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
|
||||
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
|
||||
@ -1627,7 +1627,7 @@ config ARCH_SELECT_MEMORY_MODEL
|
||||
|
||||
config ARCH_MEMORY_PROBE
|
||||
bool "Enable sysfs memory/probe interface"
|
||||
depends on X86_64 && MEMORY_HOTPLUG
|
||||
depends on MEMORY_HOTPLUG
|
||||
help
|
||||
This option enables a sysfs memory/probe interface for testing.
|
||||
See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
|
||||
@ -2423,7 +2423,7 @@ endmenu
|
||||
|
||||
config ARCH_HAS_ADD_PAGES
|
||||
def_bool y
|
||||
depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
depends on ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
|
||||
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
|
||||
def_bool y
|
||||
|
@ -322,7 +322,7 @@ static void __init reserve_initrd(void)
|
||||
|
||||
relocate_initrd();
|
||||
|
||||
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
|
||||
memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
|
||||
}
|
||||
|
||||
#else
|
||||
@ -521,7 +521,7 @@ static void __init reserve_crashkernel(void)
|
||||
}
|
||||
|
||||
if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
|
||||
memblock_free(crash_base, crash_size);
|
||||
memblock_phys_free(crash_base, crash_size);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
|
||||
|
||||
static void __init pcpu_fc_free(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free_ptr(ptr, size);
|
||||
memblock_free(ptr, size);
|
||||
}
|
||||
|
||||
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
|
||||
|
@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start,
|
||||
*/
|
||||
addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
|
||||
map_end);
|
||||
memblock_free(addr, PMD_SIZE);
|
||||
memblock_phys_free(addr, PMD_SIZE);
|
||||
real_end = addr + PMD_SIZE;
|
||||
|
||||
/* step_size need to be small so pgt_buf from BRK could cover it */
|
||||
|
@ -779,37 +779,6 @@ void __init mem_init(void)
|
||||
test_wp_bit();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
int arch_add_memory(int nid, u64 start, u64 size,
|
||||
struct mhp_params *params)
|
||||
{
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* The page tables were already mapped at boot so if the caller
|
||||
* requests a different mapping type then we must change all the
|
||||
* pages with __set_memory_prot().
|
||||
*/
|
||||
if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
|
||||
ret = __set_memory_prot(start, nr_pages, params->pgprot);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return __add_pages(nid, start_pfn, nr_pages, params);
|
||||
}
|
||||
|
||||
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
|
||||
{
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
|
||||
__remove_pages(start_pfn, nr_pages, altmap);
|
||||
}
|
||||
#endif
|
||||
|
||||
int kernel_set_to_readonly __read_mostly;
|
||||
|
||||
static void mark_nxdata_nx(void)
|
||||
|
@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
|
||||
p = early_alloc(PMD_SIZE, nid, false);
|
||||
if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
|
||||
return;
|
||||
memblock_free_ptr(p, PMD_SIZE);
|
||||
memblock_free(p, PMD_SIZE);
|
||||
}
|
||||
|
||||
p = early_alloc(PAGE_SIZE, nid, true);
|
||||
@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
|
||||
p = early_alloc(PUD_SIZE, nid, false);
|
||||
if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
|
||||
return;
|
||||
memblock_free_ptr(p, PUD_SIZE);
|
||||
memblock_free(p, PUD_SIZE);
|
||||
}
|
||||
|
||||
p = early_alloc(PAGE_SIZE, nid, true);
|
||||
|
@ -355,7 +355,7 @@ void __init numa_reset_distance(void)
|
||||
|
||||
/* numa_distance could be 1LU marking allocation failure, test cnt */
|
||||
if (numa_distance_cnt)
|
||||
memblock_free_ptr(numa_distance, size);
|
||||
memblock_free(numa_distance, size);
|
||||
numa_distance_cnt = 0;
|
||||
numa_distance = NULL; /* enable table creation */
|
||||
}
|
||||
|
@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
|
||||
}
|
||||
|
||||
/* free the copied physical distance table */
|
||||
memblock_free_ptr(phys_dist, phys_size);
|
||||
memblock_free(phys_dist, phys_size);
|
||||
return;
|
||||
|
||||
no_emu:
|
||||
|
@ -1025,7 +1025,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
|
||||
for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
|
||||
make_lowmem_page_readwrite(vaddr);
|
||||
|
||||
memblock_free(paddr, size);
|
||||
memblock_phys_free(paddr, size);
|
||||
}
|
||||
|
||||
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
|
||||
@ -1151,7 +1151,7 @@ static void __init xen_pagetable_p2m_free(void)
|
||||
xen_cleanhighmap(addr, addr + size);
|
||||
size = PAGE_ALIGN(xen_start_info->nr_pages *
|
||||
sizeof(unsigned long));
|
||||
memblock_free(__pa(addr), size);
|
||||
memblock_free((void *)addr, size);
|
||||
} else {
|
||||
xen_cleanmfnmap(addr);
|
||||
}
|
||||
@ -1956,7 +1956,7 @@ void __init xen_relocate_p2m(void)
|
||||
pfn_end = p2m_pfn_end;
|
||||
}
|
||||
|
||||
memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
|
||||
memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
|
||||
while (pfn < pfn_end) {
|
||||
if (pfn == p2m_pfn) {
|
||||
pfn = p2m_pfn_end;
|
||||
|
@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void)
|
||||
static void __ref free_p2m_page(void *p)
|
||||
{
|
||||
if (unlikely(!slab_is_available())) {
|
||||
memblock_free((unsigned long)p, PAGE_SIZE);
|
||||
memblock_free(p, PAGE_SIZE);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn,
|
||||
break;
|
||||
}
|
||||
}
|
||||
memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
|
||||
memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -719,7 +719,7 @@ static void __init xen_reserve_xen_mfnlist(void)
|
||||
return;
|
||||
|
||||
xen_relocate_p2m();
|
||||
memblock_free(start, size);
|
||||
memblock_phys_free(start, size);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -885,7 +885,7 @@ char * __init xen_memory_setup(void)
|
||||
xen_phys_memcpy(new_area, start, size);
|
||||
pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
|
||||
start, start + size, new_area, new_area + size);
|
||||
memblock_free(start, size);
|
||||
memblock_phys_free(start, size);
|
||||
boot_params.hdr.ramdisk_image = new_area;
|
||||
boot_params.ext_ramdisk_image = new_area >> 32;
|
||||
}
|
||||
|
@ -13,7 +13,7 @@ obj-y += power/
|
||||
obj-$(CONFIG_ISA_BUS_API) += isa.o
|
||||
obj-y += firmware_loader/
|
||||
obj-$(CONFIG_NUMA) += node.o
|
||||
obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
|
||||
obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
|
||||
ifeq ($(CONFIG_SYSFS),y)
|
||||
obj-$(CONFIG_MODULES) += module.o
|
||||
endif
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <linux/of.h>
|
||||
|
||||
#include <asm/sections.h>
|
||||
#include <asm/pgalloc.h>
|
||||
|
||||
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
||||
EXPORT_SYMBOL(node_data);
|
||||
@ -165,25 +166,86 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
|
||||
|
||||
static void __init pcpu_fc_free(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free_early(__pa(ptr), size);
|
||||
memblock_free(ptr, size);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||
static void __init pcpu_populate_pte(unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd = pgd_offset_k(addr);
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
p4d = p4d_offset(pgd, addr);
|
||||
if (p4d_none(*p4d)) {
|
||||
pud_t *new;
|
||||
|
||||
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
p4d_populate(&init_mm, p4d, new);
|
||||
}
|
||||
|
||||
pud = pud_offset(p4d, addr);
|
||||
if (pud_none(*pud)) {
|
||||
pmd_t *new;
|
||||
|
||||
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
pud_populate(&init_mm, pud, new);
|
||||
}
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (!pmd_present(*pmd)) {
|
||||
pte_t *new;
|
||||
|
||||
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
pmd_populate_kernel(&init_mm, pmd, new);
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
err_alloc:
|
||||
panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
|
||||
__func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
|
||||
}
|
||||
#endif
|
||||
|
||||
void __init setup_per_cpu_areas(void)
|
||||
{
|
||||
unsigned long delta;
|
||||
unsigned int cpu;
|
||||
int rc;
|
||||
int rc = -EINVAL;
|
||||
|
||||
/*
|
||||
* Always reserve area for module percpu variables. That's
|
||||
* what the legacy allocator did.
|
||||
*/
|
||||
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
|
||||
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
|
||||
pcpu_cpu_distance,
|
||||
pcpu_fc_alloc, pcpu_fc_free);
|
||||
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
|
||||
/*
|
||||
* Always reserve area for module percpu variables. That's
|
||||
* what the legacy allocator did.
|
||||
*/
|
||||
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
|
||||
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
|
||||
pcpu_cpu_distance,
|
||||
pcpu_fc_alloc, pcpu_fc_free);
|
||||
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||
if (rc < 0)
|
||||
pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
|
||||
pcpu_fc_names[pcpu_chosen_fc], rc);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||
if (rc < 0)
|
||||
panic("Failed to initialize percpu areas.");
|
||||
rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
|
||||
pcpu_fc_alloc,
|
||||
pcpu_fc_free,
|
||||
pcpu_populate_pte);
|
||||
#endif
|
||||
if (rc < 0)
|
||||
panic("Failed to initialize percpu areas (err=%d).", rc);
|
||||
|
||||
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
|
||||
for_each_possible_cpu(cpu)
|
||||
@ -264,7 +326,7 @@ void __init numa_free_distance(void)
|
||||
size = numa_distance_cnt * numa_distance_cnt *
|
||||
sizeof(numa_distance[0]);
|
||||
|
||||
memblock_free_ptr(numa_distance, size);
|
||||
memblock_free(numa_distance, size);
|
||||
numa_distance_cnt = 0;
|
||||
numa_distance = NULL;
|
||||
}
|
||||
@ -275,15 +337,13 @@ void __init numa_free_distance(void)
|
||||
static int __init numa_alloc_distance(void)
|
||||
{
|
||||
size_t size;
|
||||
u64 phys;
|
||||
int i, j;
|
||||
|
||||
size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
|
||||
phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn));
|
||||
if (WARN_ON(!phys))
|
||||
numa_distance = memblock_alloc(size, PAGE_SIZE);
|
||||
if (WARN_ON(!numa_distance))
|
||||
return -ENOMEM;
|
||||
|
||||
numa_distance = __va(phys);
|
||||
numa_distance_cnt = nr_node_ids;
|
||||
|
||||
/* fill with the default distances */
|
||||
|
@ -629,7 +629,7 @@ static void node_device_release(struct device *dev)
|
||||
{
|
||||
struct node *node = to_node(dev);
|
||||
|
||||
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
|
||||
#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
|
||||
/*
|
||||
* We schedule the work only when a memory section is
|
||||
* onlined/offlined on this node. When we come here,
|
||||
@ -782,7 +782,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static int __ref get_nid_for_pfn(unsigned long pfn)
|
||||
{
|
||||
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
|
||||
@ -958,10 +958,9 @@ static int node_memory_callback(struct notifier_block *self,
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
#endif /* CONFIG_HUGETLBFS */
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||
|
||||
#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
|
||||
!defined(CONFIG_HUGETLBFS)
|
||||
#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS)
|
||||
static inline int node_memory_callback(struct notifier_block *self,
|
||||
unsigned long action, void *arg)
|
||||
{
|
||||
|
@ -291,22 +291,16 @@ static ssize_t mem_used_max_store(struct device *dev,
|
||||
return len;
|
||||
}
|
||||
|
||||
static ssize_t idle_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
/*
|
||||
* Mark all pages which are older than or equal to cutoff as IDLE.
|
||||
* Callers should hold the zram init lock in read mode
|
||||
*/
|
||||
static void mark_idle(struct zram *zram, ktime_t cutoff)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
int is_idle = 1;
|
||||
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
|
||||
int index;
|
||||
|
||||
if (!sysfs_streq(buf, "all"))
|
||||
return -EINVAL;
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
if (!init_done(zram)) {
|
||||
up_read(&zram->init_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
for (index = 0; index < nr_pages; index++) {
|
||||
/*
|
||||
* Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
|
||||
@ -314,14 +308,50 @@ static ssize_t idle_store(struct device *dev,
|
||||
*/
|
||||
zram_slot_lock(zram, index);
|
||||
if (zram_allocated(zram, index) &&
|
||||
!zram_test_flag(zram, index, ZRAM_UNDER_WB))
|
||||
zram_set_flag(zram, index, ZRAM_IDLE);
|
||||
!zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
|
||||
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
|
||||
is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
|
||||
#endif
|
||||
if (is_idle)
|
||||
zram_set_flag(zram, index, ZRAM_IDLE);
|
||||
}
|
||||
zram_slot_unlock(zram, index);
|
||||
}
|
||||
}
|
||||
|
||||
static ssize_t idle_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t len)
|
||||
{
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
ktime_t cutoff_time = 0;
|
||||
ssize_t rv = -EINVAL;
|
||||
|
||||
if (!sysfs_streq(buf, "all")) {
|
||||
/*
|
||||
* If it did not parse as 'all' try to treat it as an integer when
|
||||
* we have memory tracking enabled.
|
||||
*/
|
||||
u64 age_sec;
|
||||
|
||||
if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
|
||||
cutoff_time = ktime_sub(ktime_get_boottime(),
|
||||
ns_to_ktime(age_sec * NSEC_PER_SEC));
|
||||
else
|
||||
goto out;
|
||||
}
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
if (!init_done(zram))
|
||||
goto out_unlock;
|
||||
|
||||
/* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */
|
||||
mark_idle(zram, cutoff_time);
|
||||
rv = len;
|
||||
|
||||
out_unlock:
|
||||
up_read(&zram->init_lock);
|
||||
|
||||
return len;
|
||||
out:
|
||||
return rv;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZRAM_WRITEBACK
|
||||
@ -587,7 +617,7 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
|
||||
{
|
||||
struct bio *bio;
|
||||
|
||||
bio = bio_alloc(GFP_ATOMIC, 1);
|
||||
bio = bio_alloc(GFP_NOIO, 1);
|
||||
if (!bio)
|
||||
return -ENOMEM;
|
||||
|
||||
@ -910,7 +940,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
|
||||
zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
|
||||
zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
|
||||
|
||||
if (count < copied) {
|
||||
if (count <= copied) {
|
||||
zram_slot_unlock(zram, index);
|
||||
break;
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
|
||||
if (slab_is_available())
|
||||
memblock_free_late(phys, size);
|
||||
else
|
||||
memblock_free(phys, size);
|
||||
memblock_phys_free(phys, size);
|
||||
} else if (flags & EFI_MEMMAP_SLAB) {
|
||||
struct page *p = pfn_to_page(PHYS_PFN(phys));
|
||||
unsigned int order = get_order(size);
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/fsi-occ.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
|
@ -570,7 +570,7 @@ fail_msg_node:
|
||||
fail_db_node:
|
||||
of_node_put(smu->db_node);
|
||||
fail_bootmem:
|
||||
memblock_free_ptr(smu, sizeof(struct smu_device));
|
||||
memblock_free(smu, sizeof(struct smu_device));
|
||||
smu = NULL;
|
||||
fail_np:
|
||||
of_node_put(np);
|
||||
|
@ -10,7 +10,6 @@
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/swap.h> /* For nr_free_buffer_pages() */
|
||||
#include <linux/list.h>
|
||||
|
||||
#include <linux/debugfs.h>
|
||||
|
@ -2409,6 +2409,7 @@ static void __exit cleanup_mtd(void)
|
||||
if (proc_mtd)
|
||||
remove_proc_entry("mtd", NULL);
|
||||
class_unregister(&mtd_class);
|
||||
bdi_unregister(mtd_bdi);
|
||||
bdi_put(mtd_bdi);
|
||||
idr_destroy(&mtd_idr);
|
||||
}
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <linux/of.h>
|
||||
#include <linux/of_fdt.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#define RNG_SEED_SIZE 128
|
||||
@ -170,8 +171,7 @@ int ima_free_kexec_buffer(void)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return memblock_free(addr, size);
|
||||
|
||||
return memblock_phys_free(addr, size);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -46,7 +46,7 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
|
||||
if (nomap) {
|
||||
err = memblock_mark_nomap(base, size);
|
||||
if (err)
|
||||
memblock_free(base, size);
|
||||
memblock_phys_free(base, size);
|
||||
kmemleak_ignore_phys(base);
|
||||
}
|
||||
|
||||
@ -284,7 +284,8 @@ void __init fdt_init_reserved_mem(void)
|
||||
if (nomap)
|
||||
memblock_clear_nomap(rmem->base, rmem->size);
|
||||
else
|
||||
memblock_free(rmem->base, rmem->size);
|
||||
memblock_phys_free(rmem->base,
|
||||
rmem->size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -965,6 +965,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
|
||||
struct rio_transfer_io *transfer;
|
||||
enum dma_data_direction dir;
|
||||
int i, ret = 0;
|
||||
size_t size;
|
||||
|
||||
if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction))))
|
||||
return -EFAULT;
|
||||
@ -976,13 +977,14 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
|
||||
priv->md->properties.transfer_mode) == 0)
|
||||
return -ENODEV;
|
||||
|
||||
transfer = vmalloc(array_size(sizeof(*transfer), transaction.count));
|
||||
size = array_size(sizeof(*transfer), transaction.count);
|
||||
transfer = vmalloc(size);
|
||||
if (!transfer)
|
||||
return -ENOMEM;
|
||||
|
||||
if (unlikely(copy_from_user(transfer,
|
||||
(void __user *)(uintptr_t)transaction.block,
|
||||
array_size(sizeof(*transfer), transaction.count)))) {
|
||||
size))) {
|
||||
ret = -EFAULT;
|
||||
goto out_free;
|
||||
}
|
||||
@ -994,8 +996,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
|
||||
transaction.sync, dir, &transfer[i]);
|
||||
|
||||
if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block,
|
||||
transfer,
|
||||
array_size(sizeof(*transfer), transaction.count))))
|
||||
transfer, size)))
|
||||
ret = -EFAULT;
|
||||
|
||||
out_free:
|
||||
|
@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info)
|
||||
}
|
||||
sclp_fill_core_info(info, sccb);
|
||||
out:
|
||||
memblock_free_early((unsigned long)sccb, length);
|
||||
memblock_phys_free((unsigned long)sccb, length);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -185,7 +185,7 @@ static void __init xdbc_free_ring(struct xdbc_ring *ring)
|
||||
if (!seg)
|
||||
return;
|
||||
|
||||
memblock_free(seg->dma, PAGE_SIZE);
|
||||
memblock_phys_free(seg->dma, PAGE_SIZE);
|
||||
ring->segment = NULL;
|
||||
}
|
||||
|
||||
@ -665,10 +665,10 @@ int __init early_xdbc_setup_hardware(void)
|
||||
xdbc_free_ring(&xdbc.in_ring);
|
||||
|
||||
if (xdbc.table_dma)
|
||||
memblock_free(xdbc.table_dma, PAGE_SIZE);
|
||||
memblock_phys_free(xdbc.table_dma, PAGE_SIZE);
|
||||
|
||||
if (xdbc.out_dma)
|
||||
memblock_free(xdbc.out_dma, PAGE_SIZE);
|
||||
memblock_phys_free(xdbc.out_dma, PAGE_SIZE);
|
||||
|
||||
xdbc.table_base = NULL;
|
||||
xdbc.out_buf = NULL;
|
||||
@ -987,8 +987,8 @@ free_and_quit:
|
||||
xdbc_free_ring(&xdbc.evt_ring);
|
||||
xdbc_free_ring(&xdbc.out_ring);
|
||||
xdbc_free_ring(&xdbc.in_ring);
|
||||
memblock_free(xdbc.table_dma, PAGE_SIZE);
|
||||
memblock_free(xdbc.out_dma, PAGE_SIZE);
|
||||
memblock_phys_free(xdbc.table_dma, PAGE_SIZE);
|
||||
memblock_phys_free(xdbc.out_dma, PAGE_SIZE);
|
||||
writel(0, &xdbc.xdbc_reg->control);
|
||||
early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
|
||||
|
||||
|
@ -108,7 +108,7 @@ config VIRTIO_MEM
|
||||
default m
|
||||
depends on X86_64
|
||||
depends on VIRTIO
|
||||
depends on MEMORY_HOTPLUG_SPARSE
|
||||
depends on MEMORY_HOTPLUG
|
||||
depends on MEMORY_HOTREMOVE
|
||||
depends on CONTIG_ALLOC
|
||||
help
|
||||
|
@ -241,7 +241,7 @@ retry:
|
||||
*/
|
||||
rc = xen_swiotlb_fixup(start, nslabs);
|
||||
if (rc) {
|
||||
memblock_free(__pa(start), PAGE_ALIGN(bytes));
|
||||
memblock_free(start, PAGE_ALIGN(bytes));
|
||||
if (nslabs > 1024 && repeat--) {
|
||||
/* Min is 2MB */
|
||||
nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
|
||||
|
@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
|
||||
|
||||
/**
|
||||
* prepend_name - prepend a pathname in front of current buffer pointer
|
||||
* @buffer: buffer pointer
|
||||
* @buflen: allocated length of the buffer
|
||||
* @name: name string and length qstr structure
|
||||
* @p: prepend buffer which contains buffer pointer and allocated length
|
||||
* @name: name string and length qstr structure
|
||||
*
|
||||
* With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
|
||||
* make sure that either the old or the new name pointer and length are
|
||||
@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
|
||||
* prepend_path - Prepend path string to a buffer
|
||||
* @path: the dentry/vfsmount to report
|
||||
* @root: root vfsmnt/dentry
|
||||
* @buffer: pointer to the end of the buffer
|
||||
* @buflen: pointer to buffer length
|
||||
* @p: prepend buffer which contains buffer pointer and allocated length
|
||||
*
|
||||
* The function will first try to write out the pathname without taking any
|
||||
* lock other than the RCU read lock to make sure that dentries won't go away.
|
||||
|
@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
|
||||
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (status < 0) {
|
||||
ocfs2_commit_trans(osb, handle);
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
|
||||
data_alloc_bh, start_blk,
|
||||
num_clusters);
|
||||
if (status < 0) {
|
||||
ocfs2_commit_trans(osb, handle);
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
|
||||
}
|
||||
|
||||
/*
|
||||
* Zero the area past i_size but still within an allocated
|
||||
* cluster. This avoids exposing nonzero data on subsequent file
|
||||
* extends.
|
||||
* Zero partial cluster for a hole punch or truncate. This avoids exposing
|
||||
* nonzero data on subsequent file extends.
|
||||
*
|
||||
* We need to call this before i_size is updated on the inode because
|
||||
* otherwise block_write_full_page() will skip writeout of pages past
|
||||
* i_size. The new_i_size parameter is passed for this reason.
|
||||
* i_size.
|
||||
*/
|
||||
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
|
||||
u64 range_start, u64 range_end)
|
||||
@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
|
||||
if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Avoid zeroing pages fully beyond current i_size. It is pointless as
|
||||
* underlying blocks of those pages should be already zeroed out and
|
||||
* page writeback will skip them anyway.
|
||||
*/
|
||||
range_end = min_t(u64, range_end, i_size_read(inode));
|
||||
if (range_start >= range_end)
|
||||
return 0;
|
||||
|
||||
pages = kcalloc(ocfs2_pages_per_cluster(sb),
|
||||
sizeof(struct page *), GFP_NOFS);
|
||||
if (pages == NULL) {
|
||||
@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (range_start == range_end)
|
||||
goto out;
|
||||
|
||||
ret = ocfs2_extent_map_get_blocks(inode,
|
||||
range_start >> sb->s_blocksize_bits,
|
||||
&phys, NULL, &ext_flags);
|
||||
|
@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
|
||||
continue;
|
||||
}
|
||||
retry:
|
||||
ret = -EINVAL;
|
||||
mlog(0, "attempting to send begin reco msg to %d\n",
|
||||
nodenum);
|
||||
ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
|
||||
|
@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode,
|
||||
* greater than page size, so we have to truncate them
|
||||
* anyway.
|
||||
*/
|
||||
unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
|
||||
truncate_inode_pages(inode->i_mapping, new_i_size);
|
||||
|
||||
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
|
||||
unmap_mapping_range(inode->i_mapping,
|
||||
new_i_size + PAGE_SIZE - 1, 0, 1);
|
||||
truncate_inode_pages(inode->i_mapping, new_i_size);
|
||||
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
|
||||
i_size_read(inode), 1);
|
||||
if (status)
|
||||
@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode,
|
||||
goto bail_unlock_sem;
|
||||
}
|
||||
|
||||
unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
|
||||
truncate_inode_pages(inode->i_mapping, new_i_size);
|
||||
|
||||
status = ocfs2_commit_truncate(osb, inode, di_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
|
@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
|
||||
struct inode *inode = NULL;
|
||||
struct super_block *sb = osb->sb;
|
||||
struct ocfs2_find_inode_args args;
|
||||
journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
|
||||
|
||||
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
|
||||
sysfile_type);
|
||||
@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
|
||||
* part of the transaction - the inode could have been reclaimed and
|
||||
* now it is reread from disk.
|
||||
*/
|
||||
if (journal) {
|
||||
if (osb->journal) {
|
||||
transaction_t *transaction;
|
||||
tid_t tid;
|
||||
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
||||
journal_t *journal = osb->journal->j_journal;
|
||||
|
||||
read_lock(&journal->j_state_lock);
|
||||
if (journal->j_running_transaction)
|
||||
|
@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
|
||||
write_unlock(&journal->j_state_lock);
|
||||
}
|
||||
|
||||
int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
|
||||
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
|
||||
{
|
||||
int status = -1;
|
||||
struct inode *inode = NULL; /* the journal inode */
|
||||
journal_t *j_journal = NULL;
|
||||
struct ocfs2_journal *journal = NULL;
|
||||
struct ocfs2_dinode *di = NULL;
|
||||
struct buffer_head *bh = NULL;
|
||||
struct ocfs2_super *osb;
|
||||
int inode_lock = 0;
|
||||
|
||||
BUG_ON(!journal);
|
||||
/* initialize our journal structure */
|
||||
journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
|
||||
if (!journal) {
|
||||
mlog(ML_ERROR, "unable to alloc journal\n");
|
||||
status = -ENOMEM;
|
||||
goto done;
|
||||
}
|
||||
osb->journal = journal;
|
||||
journal->j_osb = osb;
|
||||
|
||||
osb = journal->j_osb;
|
||||
atomic_set(&journal->j_num_trans, 0);
|
||||
init_rwsem(&journal->j_trans_barrier);
|
||||
init_waitqueue_head(&journal->j_checkpointed);
|
||||
spin_lock_init(&journal->j_lock);
|
||||
journal->j_trans_id = 1UL;
|
||||
INIT_LIST_HEAD(&journal->j_la_cleanups);
|
||||
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
|
||||
journal->j_state = OCFS2_JOURNAL_FREE;
|
||||
|
||||
/* already have the inode for our journal */
|
||||
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
|
||||
@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
|
||||
|
||||
journal->j_state = OCFS2_JOURNAL_FREE;
|
||||
|
||||
// up_write(&journal->j_trans_barrier);
|
||||
done:
|
||||
iput(inode);
|
||||
kfree(journal);
|
||||
osb->journal = NULL;
|
||||
}
|
||||
|
||||
static void ocfs2_clear_journal_error(struct super_block *sb,
|
||||
|
@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
|
||||
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
|
||||
*/
|
||||
void ocfs2_set_journal_params(struct ocfs2_super *osb);
|
||||
int ocfs2_journal_init(struct ocfs2_journal *journal,
|
||||
int *dirty);
|
||||
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
|
||||
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
|
||||
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
|
||||
int full);
|
||||
|
@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
|
||||
/* This will disable recovery and flush any recovery work. */
|
||||
ocfs2_recovery_exit(osb);
|
||||
|
||||
ocfs2_journal_shutdown(osb);
|
||||
|
||||
ocfs2_sync_blockdev(sb);
|
||||
|
||||
ocfs2_purge_refcount_trees(osb);
|
||||
@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
|
||||
|
||||
ocfs2_release_system_inodes(osb);
|
||||
|
||||
ocfs2_journal_shutdown(osb);
|
||||
|
||||
/*
|
||||
* If we're dismounting due to mount error, mount.ocfs2 will clean
|
||||
* up heartbeat. If we're a local mount, there is no heartbeat.
|
||||
@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
|
||||
int i, cbits, bbits;
|
||||
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
|
||||
struct inode *inode = NULL;
|
||||
struct ocfs2_journal *journal;
|
||||
struct ocfs2_super *osb;
|
||||
u64 total_blocks;
|
||||
|
||||
@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
|
||||
|
||||
get_random_bytes(&osb->s_next_generation, sizeof(u32));
|
||||
|
||||
/* FIXME
|
||||
* This should be done in ocfs2_journal_init(), but unknown
|
||||
* ordering issues will cause the filesystem to crash.
|
||||
* If anyone wants to figure out what part of the code
|
||||
* refers to osb->journal before ocfs2_journal_init() is run,
|
||||
* be my guest.
|
||||
*/
|
||||
/* initialize our journal structure */
|
||||
|
||||
journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
|
||||
if (!journal) {
|
||||
mlog(ML_ERROR, "unable to alloc journal\n");
|
||||
status = -ENOMEM;
|
||||
goto bail;
|
||||
}
|
||||
osb->journal = journal;
|
||||
journal->j_osb = osb;
|
||||
|
||||
atomic_set(&journal->j_num_trans, 0);
|
||||
init_rwsem(&journal->j_trans_barrier);
|
||||
init_waitqueue_head(&journal->j_checkpointed);
|
||||
spin_lock_init(&journal->j_lock);
|
||||
journal->j_trans_id = (unsigned long) 1;
|
||||
INIT_LIST_HEAD(&journal->j_la_cleanups);
|
||||
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
|
||||
journal->j_state = OCFS2_JOURNAL_FREE;
|
||||
|
||||
INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
|
||||
init_llist_head(&osb->dquot_drop_list);
|
||||
|
||||
@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
|
||||
* ourselves. */
|
||||
|
||||
/* Init our journal object. */
|
||||
status = ocfs2_journal_init(osb->journal, &dirty);
|
||||
status = ocfs2_journal_init(osb, &dirty);
|
||||
if (status < 0) {
|
||||
mlog(ML_ERROR, "Could not initialize journal!\n");
|
||||
goto finally;
|
||||
@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
|
||||
|
||||
kfree(osb->osb_orphan_wipes);
|
||||
kfree(osb->slot_recovery_generations);
|
||||
/* FIXME
|
||||
* This belongs in journal shutdown, but because we have to
|
||||
* allocate osb->journal at the start of ocfs2_initialize_osb(),
|
||||
* we free it here.
|
||||
*/
|
||||
kfree(osb->journal);
|
||||
kfree(osb->local_alloc_copy);
|
||||
kfree(osb->uuid_str);
|
||||
kfree(osb->vol_label);
|
||||
|
fs/open.c
@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f,
		 * of THPs into the page cache will fail.
		 */
		smp_mb();
		if (filemap_nr_thps(inode->i_mapping))
			truncate_pagecache(inode, 0);
		if (filemap_nr_thps(inode->i_mapping)) {
			struct address_space *mapping = inode->i_mapping;

			filemap_invalidate_lock(inode->i_mapping);
			/*
			 * unmap_mapping_range just need to be called once
			 * here, because the private pages is not need to be
			 * unmapped mapping (e.g. data segment of dynamic
			 * shared libraries here).
			 */
			unmap_mapping_range(mapping, 0, 0, 0);
			truncate_inode_pages(mapping, 0);
			filemap_invalidate_unlock(inode->i_mapping);
		}
	}

	return 0;