How to keep executable code in memory even under memory pressure in Linux?

纵然是瞬间 提交于 2019-11-30 09:30:54

To answer the question, here's a simple, preliminary patch that does not evict Active(file) pages (as seen in /proc/meminfo) while they total less than 256 MiB. It seems to work OK (no disk thrashing) with linux-stable 5.2.4:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index dbdc46a84f63..7a0b7e32ff45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2445,6 +2445,13 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
            BUG();
        }

+    if (NR_ACTIVE_FILE == lru) {
+      long long kib_active_file_now=global_node_page_state(NR_ACTIVE_FILE) * MAX_NR_ZONES;
+      if (kib_active_file_now <= 256*1024) {
+        nr[lru] = 0; //don't reclaim any Active(file) (see /proc/meminfo) if they are under 256MiB
+        continue;
+      }
+    }
        *lru_pages += size;
        nr[lru] = scan;
    }

Note that some yet-to-be-found regression in kernel 5.3.0-rc4-gd45331b00ddb causes a system freeze (without disk thrashing, and with SysRq still working) even without this patch.

(any new developments related to this should be happening here.)

WARNING: Do not use this patch if you have swap enabled, because two users reported worse effects. I've only tested this patch with swap disabled in the kernel (i.e. CONFIG_SWAP is not set)!

Until further notice (or until someone comes up with something better), I am using the following patch, which works for me, to avoid any disk thrashing / OS freeze when the system is about to run out of memory, so that the OOM-killer triggers as soon as possible (within 1 second at most):

revision 3
preliminary patch to avoid disk thrashing (constant reading) under memory pressure before OOM-killer triggers
more info: https://gist.github.com/constantoverride/84eba764f487049ed642eb2111a20830

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..7636498 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -208,7 +208,7 @@ enum lru_list {

 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

-#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
+#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_INACTIVE_FILE; lru++)

 static inline int is_file_lru(enum lru_list lru)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..1f3ffb5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2086,9 +2086,9 @@ static unsigned long shrink_list(enum lr
                 struct scan_control *sc)
 {
    if (is_active_lru(lru)) {
-       if (inactive_list_is_low(lruvec, is_file_lru(lru),
-                    memcg, sc, true))
-           shrink_active_list(nr_to_scan, lruvec, sc, lru);
+       //if (inactive_list_is_low(lruvec, is_file_lru(lru),
+       //           memcg, sc, true))
+       //  shrink_active_list(nr_to_scan, lruvec, sc, lru);
        return 0;
    }

@@ -2234,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,

    anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
        lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
-   file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+   file  = //lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
        lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);

    spin_lock_irq(&pgdat->lru_lock);
@@ -2345,7 +2345,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
             sc->priority == DEF_PRIORITY);

    blk_start_plug(&plug);
-   while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+   while (nr[LRU_INACTIVE_ANON] || //nr[LRU_ACTIVE_FILE] ||
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;
@@ -2372,7 +2372,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * stop reclaiming one LRU and reduce the amount scanning
         * proportional to the original scan target.
         */
-       nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+       nr_file = nr[LRU_INACTIVE_FILE] //+ nr[LRU_ACTIVE_FILE]
+           ;
        nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

        /*
@@ -2391,7 +2392,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
            percentage = nr_anon * 100 / scan_target;
        } else {
            unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
-                       targets[LRU_ACTIVE_FILE] + 1;
+                       //targets[LRU_ACTIVE_FILE] + 
+                       1;
            lru = LRU_FILE;
            percentage = nr_file * 100 / scan_target;
        }
@@ -2409,10 +2411,12 @@ static void shrink_node_memcg(struct pgl
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);

+       if (LRU_FILE != lru) { //avoid this block for LRU_ACTIVE_FILE
        lru += LRU_ACTIVE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);
+       }

        scan_adjusted = true;
    }

Unfortunately the above converted tabs into spaces, so if you want the raw patch it's here.

What this patch does is not evict the Active(file) pages under memory pressure, and thus it does not cause kswapd0 (although iotop attributes the I/O to each program itself) to re-read every running process's executable pages each time there is a context switch just so that the program can continue to run. As a result, a ton of disk thrashing is avoided and the OS does not slow to a crawl.

The above was tested with kernel 4.18.5 (and I am now testing 4.18.7) inside Qubes OS 4.0's dom0 (Fedora 25) and all the VMs (Fedora 28) that I'm using.

For the first version of this patch, which apparently works just as well, see the EDIT in the question that this answers.

UPDATE: After using this patch for a while on an Arch Linux laptop with 16G of RAM (minus 512M reserved for the integrated graphics card) and no swap (disabled in the kernel too), I can say that the system can run out of memory sooner than it would without le9d.patch (rev. 3), so the OOM-killer triggers for Xorg, chromium or others when it wouldn't have without the patch. As a mitigation, which seems to work for me thus far, I've been running echo 1 > /proc/sys/vm/drop_caches whenever the Active(file) number in /proc/meminfo is over 2G, aka 2000000 KB (e.g. get the number of KB via this code: grep 'Active(file):' /proc/meminfo|tr -d ' '|cut -f2 -d:|sed 's/kB//'), and doing this check with a sleep 5 afterwards. Lately, however, in order to compile firefox-hg in /tmp (which is tmpfs and ultimately uses 12G) and ensure it doesn't get OOM-killed, I've been using 500000 KB instead of 2000000 KB. This is certainly better than freezing the entire system (i.e. what happens without le9d.patch), which is what would have happened in this firefox compilation case. Without this check, Active(file) goes no higher than 4G, but that's enough to OOM-kill Xorg if something wants more memory, such as in this firefox compilation case or even when just copying many gigabytes via Midnight Commander (if I remember this correctly).

The memory.min parameter in the cgroups-v2 memory controller should help.

Namely, let me quote:

"Hard memory protection. If the memory usage of a cgroup is within its effective min boundary, the cgroup’s memory won’t be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer is invoked."

https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!