790 lines
24 KiB
Diff
790 lines
24 KiB
Diff
From mboxrd@z Thu Jan 1 00:00:00 1970
|
|
Return-Path: <linux-kernel-owner@kernel.org>
|
|
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
|
|
aws-us-west-2-korg-lkml-1.web.codeaurora.org
|
|
X-Spam-Level:
|
|
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
|
|
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
|
|
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
|
|
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
|
|
version=3.4.0
|
|
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
|
|
by smtp.lore.kernel.org (Postfix) with ESMTP id 658D8C433B4
|
|
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:28 +0000 (UTC)
|
|
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
|
|
by mail.kernel.org (Postfix) with ESMTP id 477CA611BE
|
|
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:28 +0000 (UTC)
|
|
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
|
|
id S231165AbhETGzr (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
|
|
Thu, 20 May 2021 02:55:47 -0400
|
|
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37908 "EHLO
|
|
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
|
|
with ESMTP id S230424AbhETGzh (ORCPT
|
|
<rfc822;linux-kernel@vger.kernel.org>);
|
|
Thu, 20 May 2021 02:55:37 -0400
|
|
Received: from mail-qv1-xf4a.google.com (mail-qv1-xf4a.google.com [IPv6:2607:f8b0:4864:20::f4a])
|
|
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id EAF95C061574
|
|
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:15 -0700 (PDT)
|
|
Received: by mail-qv1-xf4a.google.com with SMTP id r11-20020a0cb28b0000b02901c87a178503so12393761qve.22
|
|
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:15 -0700 (PDT)
|
|
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
|
|
d=google.com; s=20161025;
|
|
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
|
|
:cc;
|
|
bh=P78haeNjzr5Qg1JjQymXtCqtqXQumRFjJWFx1f2kmKM=;
|
|
b=Tjsj7/GeS8mUtREXLxPPRM0sVotzXnOQ/Dq8MvDajXLm9nT1QjyleqN5ONXOxfHJSb
|
|
gOKQ1YJhBwyuC3HCKJXdOCqgqOmQbjJGjOkM9uXhZa9/9W+Bvnszx1RDX4YRwIqqWgFX
|
|
flJvQvCE2SODYJwvTs6wKWKKQlvvw9WY05ct8oakXuEPnAOblfqTR+pbk7GoCJo67kNf
|
|
enTegbyR2yRwGi9N5coUMJM8TYP+BoBWQaHNTVR3nL7a6nEjAg1IrL1w4WaZ+/fsdDdF
|
|
6FlorKJ31sPCd2wxkCOnn+o98vuymHUDmyr+h9KxZtecLKHCkTsolSRuLiyHQvlzqY3q
|
|
md3Q==
|
|
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
|
|
d=1e100.net; s=20161025;
|
|
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
|
|
:references:subject:from:to:cc;
|
|
bh=P78haeNjzr5Qg1JjQymXtCqtqXQumRFjJWFx1f2kmKM=;
|
|
b=oK3flk/MdWi/bqnKFxC7O7BqH1b1apkGTQgT4OLVuSurUs5o7HcTTMvjXuljN/KmMh
|
|
/OGEWIkS+BHD6OkEE9W7Q/5GoGXL7Np1sLByjbiNrfCNZHtmEvYLHtP9lYulkcWaLTgA
|
|
XEr3n9zWofP9Jw0bPM24RW8jqzAlzld2tkrpDSgnfmMEpyzmjuFEURnKsx/ubUbuQ8Vd
|
|
rkIngqIt1YDBI+x6EZEdq4OpP+8H9TDr8KZBjUVfzpvASnMYn2y9gZX4Obd5/t+wys2m
|
|
zn5+4aqeR8mtxQVzHwPM48LG5wPqbTtMF0+Mhoba0Enk55ZL29+xKT00ltswnvHNJDj9
|
|
UduQ==
|
|
X-Gm-Message-State: AOAM5324lhHETXZQ7vXVsQ3UhfF140iLgXV/soebRFc0ECp355pnwH5X
|
|
pEYaLnlH20Lc9hBvEeYp/HXipMEwsdE=
|
|
X-Google-Smtp-Source: ABdhPJxAkOjDRLTPPi669WBE6Bb6QiyW8Wr0JRRG09c2L2y7UvYt7Th6JQxML99ZXqbjrM7T5yJPx76NwGo=
|
|
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
|
|
(user=yuzhao job=sendgmr) by 2002:a05:6214:76b:: with SMTP id
|
|
f11mr3992753qvz.8.1621493655061; Wed, 19 May 2021 23:54:15 -0700 (PDT)
|
|
Date: Thu, 20 May 2021 00:53:50 -0600
|
|
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
|
|
Message-Id: <20210520065355.2736558-10-yuzhao@google.com>
|
|
Mime-Version: 1.0
|
|
References: <20210520065355.2736558-1-yuzhao@google.com>
|
|
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
|
|
Subject: [PATCH v3 09/14] mm: multigenerational lru: mm_struct list
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
To: linux-mm@kvack.org
|
|
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
|
|
Andrew Morton <akpm@linux-foundation.org>,
|
|
Dave Chinner <david@fromorbit.com>,
|
|
Dave Hansen <dave.hansen@linux.intel.com>,
|
|
Donald Carr <sirspudd@gmail.com>,
|
|
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
|
|
Johannes Weiner <hannes@cmpxchg.org>,
|
|
Jonathan Corbet <corbet@lwn.net>,
|
|
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
|
|
Konstantin Kharlamov <hi-angel@yandex.ru>,
|
|
Marcus Seyfarth <m.seyfarth@gmail.com>,
|
|
Matthew Wilcox <willy@infradead.org>,
|
|
Mel Gorman <mgorman@suse.de>,
|
|
Miaohe Lin <linmiaohe@huawei.com>,
|
|
Michael Larabel <michael@michaellarabel.com>,
|
|
Michal Hocko <mhocko@suse.com>,
|
|
Michel Lespinasse <michel@lespinasse.org>,
|
|
Rik van Riel <riel@surriel.com>,
|
|
Roman Gushchin <guro@fb.com>,
|
|
Tim Chen <tim.c.chen@linux.intel.com>,
|
|
Vlastimil Babka <vbabka@suse.cz>,
|
|
Yang Shi <shy828301@gmail.com>,
|
|
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
|
|
linux-kernel@vger.kernel.org, lkp@lists.01.org,
|
|
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
|
|
Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
Content-Type: text/plain; charset="UTF-8"
|
|
Precedence: bulk
|
|
List-ID: <linux-kernel.vger.kernel.org>
|
|
X-Mailing-List: linux-kernel@vger.kernel.org
|
|
List-Archive: <https://lore.kernel.org/lkml/>
|
|
|
|
In order to scan page tables, we add infrastructure to maintain
|
|
either a system-wide mm_struct list or per-memcg mm_struct lists, and
|
|
track whether an mm_struct is being used or has been used since the
|
|
last scan.
|
|
|
|
Multiple threads can concurrently work on the same mm_struct list, and
|
|
each of them will be given a different mm_struct belonging to a
|
|
process that has been scheduled since the last scan.
|
|
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
---
|
|
fs/exec.c | 2 +
|
|
include/linux/memcontrol.h | 6 +
|
|
include/linux/mm_types.h | 107 ++++++++++++
|
|
kernel/exit.c | 1 +
|
|
kernel/fork.c | 10 ++
|
|
kernel/kthread.c | 1 +
|
|
kernel/sched/core.c | 2 +
|
|
mm/memcontrol.c | 28 ++++
|
|
mm/vmscan.c | 324 +++++++++++++++++++++++++++++++++++++
|
|
9 files changed, 481 insertions(+)
|
|
|
|
diff --git a/fs/exec.c b/fs/exec.c
|
|
index 18594f11c31f..c691d4d7720c 100644
|
|
--- a/fs/exec.c
|
|
+++ b/fs/exec.c
|
|
@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm)
|
|
active_mm = tsk->active_mm;
|
|
tsk->active_mm = mm;
|
|
tsk->mm = mm;
|
|
+ lru_gen_add_mm(mm);
|
|
/*
|
|
* This prevents preemption while active_mm is being loaded and
|
|
* it and mm are being updated, which could cause problems for
|
|
@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
|
|
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
|
|
local_irq_enable();
|
|
activate_mm(active_mm, mm);
|
|
+ lru_gen_switch_mm(active_mm, mm);
|
|
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
|
|
local_irq_enable();
|
|
tsk->mm->vmacache_seqnum = 0;
|
|
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
|
|
index 6bcac3d91dd1..60601a997433 100644
|
|
--- a/include/linux/memcontrol.h
|
|
+++ b/include/linux/memcontrol.h
|
|
@@ -230,6 +230,8 @@ struct obj_cgroup {
|
|
};
|
|
};
|
|
|
|
+struct lru_gen_mm_list;
|
|
+
|
|
/*
|
|
* The memory controller data structure. The memory controller controls both
|
|
* page cache and RSS per cgroup. We would eventually like to provide
|
|
@@ -349,6 +351,10 @@ struct mem_cgroup {
|
|
struct deferred_split deferred_split_queue;
|
|
#endif
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+ struct lru_gen_mm_list *mm_list;
|
|
+#endif
|
|
+
|
|
struct mem_cgroup_per_node *nodeinfo[0];
|
|
/* WARNING: nodeinfo must be the last member here */
|
|
};
|
|
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
|
|
index 5aacc1c10a45..b0f662555eae 100644
|
|
--- a/include/linux/mm_types.h
|
|
+++ b/include/linux/mm_types.h
|
|
@@ -15,6 +15,8 @@
|
|
#include <linux/page-flags-layout.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/seqlock.h>
|
|
+#include <linux/nodemask.h>
|
|
+#include <linux/mmdebug.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
@@ -561,6 +563,22 @@ struct mm_struct {
|
|
|
|
#ifdef CONFIG_IOMMU_SUPPORT
|
|
u32 pasid;
|
|
+#endif
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+ struct {
|
|
+ /* the node of a global or per-memcg mm_struct list */
|
|
+ struct list_head list;
|
|
+#ifdef CONFIG_MEMCG
|
|
+ /* points to the memcg of the owner task above */
|
|
+ struct mem_cgroup *memcg;
|
|
+#endif
|
|
+ /* whether this mm_struct has been used since the last walk */
|
|
+ nodemask_t nodes;
|
|
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
|
+ /* the number of CPUs using this mm_struct */
|
|
+ atomic_t nr_cpus;
|
|
+#endif
|
|
+ } lrugen;
|
|
#endif
|
|
} __randomize_layout;
|
|
|
|
@@ -588,6 +606,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
|
|
return (struct cpumask *)&mm->cpu_bitmap;
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+void lru_gen_init_mm(struct mm_struct *mm);
|
|
+void lru_gen_add_mm(struct mm_struct *mm);
|
|
+void lru_gen_del_mm(struct mm_struct *mm);
|
|
+#ifdef CONFIG_MEMCG
|
|
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
|
|
+void lru_gen_free_mm_list(struct mem_cgroup *memcg);
|
|
+void lru_gen_migrate_mm(struct mm_struct *mm);
|
|
+#endif
|
|
+
|
|
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
|
|
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
|
|
+{
|
|
+ /* exclude init_mm, efi_mm, etc. */
|
|
+ if (!core_kernel_data((unsigned long)old)) {
|
|
+ VM_BUG_ON(old == &init_mm);
|
|
+
|
|
+ nodes_setall(old->lrugen.nodes);
|
|
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
|
+ atomic_dec(&old->lrugen.nr_cpus);
|
|
+ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
|
|
+#endif
|
|
+ } else
|
|
+ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) ||
|
|
+ READ_ONCE(old->lrugen.list.next), old);
|
|
+
|
|
+ if (!core_kernel_data((unsigned long)new)) {
|
|
+ VM_BUG_ON(new == &init_mm);
|
|
+
|
|
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
|
+ atomic_inc(&new->lrugen.nr_cpus);
|
|
+ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new);
|
|
+#endif
|
|
+ } else
|
|
+ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) ||
|
|
+ READ_ONCE(new->lrugen.list.next), new);
|
|
+}
|
|
+
|
|
+/* Return whether this mm_struct is being used on any CPUs. */
|
|
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
|
|
+{
|
|
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
|
+ return !cpumask_empty(mm_cpumask(mm));
|
|
+#else
|
|
+ return atomic_read(&mm->lrugen.nr_cpus);
|
|
+#endif
|
|
+}
|
|
+
|
|
+#else /* CONFIG_LRU_GEN */
|
|
+
|
|
+static inline void lru_gen_init_mm(struct mm_struct *mm)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void lru_gen_add_mm(struct mm_struct *mm)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void lru_gen_del_mm(struct mm_struct *mm)
|
|
+{
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
|
|
+{
|
|
+}
|
|
+#endif
|
|
+
|
|
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
struct mmu_gather;
|
|
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
|
|
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
|
|
diff --git a/kernel/exit.c b/kernel/exit.c
|
|
index fd1c04193e18..b362179852f1 100644
|
|
--- a/kernel/exit.c
|
|
+++ b/kernel/exit.c
|
|
@@ -423,6 +423,7 @@ void mm_update_next_owner(struct mm_struct *mm)
|
|
goto retry;
|
|
}
|
|
WRITE_ONCE(mm->owner, c);
|
|
+ lru_gen_migrate_mm(mm);
|
|
task_unlock(c);
|
|
put_task_struct(c);
|
|
}
|
|
diff --git a/kernel/fork.c b/kernel/fork.c
|
|
index dc06afd725cb..2fd7dae9afcb 100644
|
|
--- a/kernel/fork.c
|
|
+++ b/kernel/fork.c
|
|
@@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm)
|
|
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
|
|
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
|
|
#endif
|
|
+ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm);
|
|
}
|
|
|
|
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
|
|
@@ -1061,6 +1062,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
|
|
goto fail_nocontext;
|
|
|
|
mm->user_ns = get_user_ns(user_ns);
|
|
+ lru_gen_init_mm(mm);
|
|
return mm;
|
|
|
|
fail_nocontext:
|
|
@@ -1103,6 +1105,7 @@ static inline void __mmput(struct mm_struct *mm)
|
|
}
|
|
if (mm->binfmt)
|
|
module_put(mm->binfmt->module);
|
|
+ lru_gen_del_mm(mm);
|
|
mmdrop(mm);
|
|
}
|
|
|
|
@@ -2524,6 +2527,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
|
|
get_task_struct(p);
|
|
}
|
|
|
|
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
|
|
+ /* lock the task to synchronize with memcg migration */
|
|
+ task_lock(p);
|
|
+ lru_gen_add_mm(p->mm);
|
|
+ task_unlock(p);
|
|
+ }
|
|
+
|
|
wake_up_new_task(p);
|
|
|
|
/* forking complete and child started to run, tell ptracer */
|
|
diff --git a/kernel/kthread.c b/kernel/kthread.c
|
|
index fe3f2a40d61e..b81e49ed31a7 100644
|
|
--- a/kernel/kthread.c
|
|
+++ b/kernel/kthread.c
|
|
@@ -1325,6 +1325,7 @@ void kthread_use_mm(struct mm_struct *mm)
|
|
tsk->mm = mm;
|
|
membarrier_update_current_mm(mm);
|
|
switch_mm_irqs_off(active_mm, mm, tsk);
|
|
+ lru_gen_switch_mm(active_mm, mm);
|
|
local_irq_enable();
|
|
task_unlock(tsk);
|
|
#ifdef finish_arch_post_lock_switch
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 5226cc26a095..2d4b77f173db 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -4323,6 +4323,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
|
|
* finish_task_switch()'s mmdrop().
|
|
*/
|
|
switch_mm_irqs_off(prev->active_mm, next->mm, next);
|
|
+ lru_gen_switch_mm(prev->active_mm, next->mm);
|
|
|
|
if (!prev->mm) { // from kernel
|
|
/* will mmdrop() in finish_task_switch(). */
|
|
@@ -7603,6 +7604,7 @@ void idle_task_exit(void)
|
|
|
|
if (mm != &init_mm) {
|
|
switch_mm(mm, &init_mm, current);
|
|
+ lru_gen_switch_mm(mm, &init_mm);
|
|
finish_arch_post_lock_switch();
|
|
}
|
|
|
|
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
|
|
index 64ada9e650a5..58b610ffa0e0 100644
|
|
--- a/mm/memcontrol.c
|
|
+++ b/mm/memcontrol.c
|
|
@@ -4981,6 +4981,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
|
|
for_each_node(node)
|
|
free_mem_cgroup_per_node_info(memcg, node);
|
|
free_percpu(memcg->vmstats_percpu);
|
|
+ lru_gen_free_mm_list(memcg);
|
|
kfree(memcg);
|
|
}
|
|
|
|
@@ -5030,6 +5031,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
|
|
if (alloc_mem_cgroup_per_node_info(memcg, node))
|
|
goto fail;
|
|
|
|
+ if (lru_gen_alloc_mm_list(memcg))
|
|
+ goto fail;
|
|
+
|
|
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
|
|
goto fail;
|
|
|
|
@@ -5991,6 +5995,29 @@ static void mem_cgroup_move_task(void)
|
|
}
|
|
#endif
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
|
|
+{
|
|
+ struct cgroup_subsys_state *css;
|
|
+ struct task_struct *task = NULL;
|
|
+
|
|
+ cgroup_taskset_for_each_leader(task, css, tset)
|
|
+ ;
|
|
+
|
|
+ if (!task)
|
|
+ return;
|
|
+
|
|
+ task_lock(task);
|
|
+ if (task->mm && task->mm->owner == task)
|
|
+ lru_gen_migrate_mm(task->mm);
|
|
+ task_unlock(task);
|
|
+}
|
|
+#else
|
|
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
|
|
+{
|
|
+}
|
|
+#endif
|
|
+
|
|
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
|
|
{
|
|
if (value == PAGE_COUNTER_MAX)
|
|
@@ -6332,6 +6359,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
|
|
.css_reset = mem_cgroup_css_reset,
|
|
.css_rstat_flush = mem_cgroup_css_rstat_flush,
|
|
.can_attach = mem_cgroup_can_attach,
|
|
+ .attach = mem_cgroup_attach,
|
|
.cancel_attach = mem_cgroup_cancel_attach,
|
|
.post_attach = mem_cgroup_move_task,
|
|
.dfl_cftypes = memory_files,
|
|
diff --git a/mm/vmscan.c b/mm/vmscan.c
|
|
index 84d25079092e..d93d2272e475 100644
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -2869,6 +2869,323 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *
|
|
sp->refaulted * max(pv->total, 1UL) * pv->gain;
|
|
}
|
|
|
|
+/******************************************************************************
|
|
+ * mm_struct list
|
|
+ ******************************************************************************/
|
|
+
|
|
+enum {
|
|
+ MM_SCHED_ACTIVE, /* running processes */
|
|
+ MM_SCHED_INACTIVE, /* sleeping processes */
|
|
+ MM_LOCK_CONTENTION, /* lock contentions */
|
|
+ MM_VMA_INTERVAL, /* VMAs within the range of each PUD/PMD/PTE */
|
|
+ MM_LEAF_OTHER_NODE, /* entries not from the node under reclaim */
|
|
+ MM_LEAF_OTHER_MEMCG, /* entries not from the memcg under reclaim */
|
|
+ MM_LEAF_OLD, /* old entries */
|
|
+ MM_LEAF_YOUNG, /* young entries */
|
|
+ MM_LEAF_DIRTY, /* dirty entries */
|
|
+ MM_LEAF_HOLE, /* non-present entries */
|
|
+ MM_NONLEAF_OLD, /* old non-leaf PMD entries */
|
|
+ MM_NONLEAF_YOUNG, /* young non-leaf PMD entries */
|
|
+ NR_MM_STATS
|
|
+};
|
|
+
|
|
+/* mnemonic codes for the stats above */
|
|
+#define MM_STAT_CODES "aicvnmoydhlu"
|
|
+
|
|
+struct lru_gen_mm_list {
|
|
+ /* the head of a global or per-memcg mm_struct list */
|
|
+ struct list_head head;
|
|
+ /* protects the list */
|
|
+ spinlock_t lock;
|
|
+ struct {
|
|
+ /* set to max_seq after each round of walk */
|
|
+ unsigned long cur_seq;
|
|
+ /* the next mm on the list to walk */
|
|
+ struct list_head *iter;
|
|
+ /* to wait for the last worker to finish */
|
|
+ struct wait_queue_head wait;
|
|
+ /* the number of concurrent workers */
|
|
+ int nr_workers;
|
|
+ /* stats for debugging */
|
|
+ unsigned long stats[NR_STAT_GENS][NR_MM_STATS];
|
|
+ } nodes[0];
|
|
+};
|
|
+
|
|
+static struct lru_gen_mm_list *global_mm_list;
|
|
+
|
|
+static struct lru_gen_mm_list *alloc_mm_list(void)
|
|
+{
|
|
+ int nid;
|
|
+ struct lru_gen_mm_list *mm_list;
|
|
+
|
|
+ mm_list = kzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL);
|
|
+ if (!mm_list)
|
|
+ return NULL;
|
|
+
|
|
+ INIT_LIST_HEAD(&mm_list->head);
|
|
+ spin_lock_init(&mm_list->lock);
|
|
+
|
|
+ for_each_node(nid) {
|
|
+ mm_list->nodes[nid].cur_seq = MIN_NR_GENS;
|
|
+ mm_list->nodes[nid].iter = &mm_list->head;
|
|
+ init_waitqueue_head(&mm_list->nodes[nid].wait);
|
|
+ }
|
|
+
|
|
+ return mm_list;
|
|
+}
|
|
+
|
|
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
|
|
+{
|
|
+#ifdef CONFIG_MEMCG
|
|
+ if (!mem_cgroup_disabled())
|
|
+ return memcg ? memcg->mm_list : root_mem_cgroup->mm_list;
|
|
+#endif
|
|
+ VM_BUG_ON(memcg);
|
|
+
|
|
+ return global_mm_list;
|
|
+}
|
|
+
|
|
+void lru_gen_init_mm(struct mm_struct *mm)
|
|
+{
|
|
+ INIT_LIST_HEAD(&mm->lrugen.list);
|
|
+#ifdef CONFIG_MEMCG
|
|
+ mm->lrugen.memcg = NULL;
|
|
+#endif
|
|
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
|
+ atomic_set(&mm->lrugen.nr_cpus, 0);
|
|
+#endif
|
|
+ nodes_clear(mm->lrugen.nodes);
|
|
+}
|
|
+
|
|
+void lru_gen_add_mm(struct mm_struct *mm)
|
|
+{
|
|
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
|
+
|
|
+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
|
|
+#ifdef CONFIG_MEMCG
|
|
+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
|
|
+ WRITE_ONCE(mm->lrugen.memcg, memcg);
|
|
+#endif
|
|
+ spin_lock(&mm_list->lock);
|
|
+ list_add_tail(&mm->lrugen.list, &mm_list->head);
|
|
+ spin_unlock(&mm_list->lock);
|
|
+}
|
|
+
|
|
+void lru_gen_del_mm(struct mm_struct *mm)
|
|
+{
|
|
+ int nid;
|
|
+#ifdef CONFIG_MEMCG
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(mm->lrugen.memcg);
|
|
+#else
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(NULL);
|
|
+#endif
|
|
+
|
|
+ spin_lock(&mm_list->lock);
|
|
+
|
|
+ for_each_node(nid) {
|
|
+ if (mm_list->nodes[nid].iter != &mm->lrugen.list)
|
|
+ continue;
|
|
+
|
|
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
|
|
+ if (mm_list->nodes[nid].iter == &mm_list->head)
|
|
+ WRITE_ONCE(mm_list->nodes[nid].cur_seq,
|
|
+ mm_list->nodes[nid].cur_seq + 1);
|
|
+ }
|
|
+
|
|
+ list_del_init(&mm->lrugen.list);
|
|
+
|
|
+ spin_unlock(&mm_list->lock);
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+ mem_cgroup_put(mm->lrugen.memcg);
|
|
+ WRITE_ONCE(mm->lrugen.memcg, NULL);
|
|
+#endif
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
|
|
+{
|
|
+ if (mem_cgroup_disabled())
|
|
+ return 0;
|
|
+
|
|
+ memcg->mm_list = alloc_mm_list();
|
|
+
|
|
+ return memcg->mm_list ? 0 : -ENOMEM;
|
|
+}
|
|
+
|
|
+void lru_gen_free_mm_list(struct mem_cgroup *memcg)
|
|
+{
|
|
+ kfree(memcg->mm_list);
|
|
+ memcg->mm_list = NULL;
|
|
+}
|
|
+
|
|
+void lru_gen_migrate_mm(struct mm_struct *mm)
|
|
+{
|
|
+ struct mem_cgroup *memcg;
|
|
+
|
|
+ lockdep_assert_held(&mm->owner->alloc_lock);
|
|
+
|
|
+ if (mem_cgroup_disabled())
|
|
+ return;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ memcg = mem_cgroup_from_task(mm->owner);
|
|
+ rcu_read_unlock();
|
|
+ if (memcg == mm->lrugen.memcg)
|
|
+ return;
|
|
+
|
|
+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
|
|
+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
|
|
+
|
|
+ lru_gen_del_mm(mm);
|
|
+ lru_gen_add_mm(mm);
|
|
+}
|
|
+
|
|
+static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
|
|
+{
|
|
+ return READ_ONCE(mm->lrugen.memcg) != memcg;
|
|
+}
|
|
+#else
|
|
+static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+#endif
|
|
+
|
|
+struct mm_walk_args {
|
|
+ struct mem_cgroup *memcg;
|
|
+ unsigned long max_seq;
|
|
+ unsigned long start_pfn;
|
|
+ unsigned long end_pfn;
|
|
+ unsigned long next_addr;
|
|
+ int node_id;
|
|
+ int swappiness;
|
|
+ int batch_size;
|
|
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
+ int mm_stats[NR_MM_STATS];
|
|
+ unsigned long bitmap[0];
|
|
+};
|
|
+
|
|
+static int size_of_mm_walk_args(void)
|
|
+{
|
|
+ int size = sizeof(struct mm_walk_args);
|
|
+
|
|
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) ||
|
|
+ IS_ENABLED(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG))
|
|
+ size += sizeof(unsigned long) * BITS_TO_LONGS(PTRS_PER_PMD);
|
|
+
|
|
+ return size;
|
|
+}
|
|
+
|
|
+static void reset_mm_stats(struct lru_gen_mm_list *mm_list, bool last,
|
|
+ struct mm_walk_args *args)
|
|
+{
|
|
+ int i;
|
|
+ int nid = args->node_id;
|
|
+ int hist = hist_from_seq_or_gen(args->max_seq);
|
|
+
|
|
+ lockdep_assert_held(&mm_list->lock);
|
|
+
|
|
+ for (i = 0; i < NR_MM_STATS; i++) {
|
|
+ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i],
|
|
+ mm_list->nodes[nid].stats[hist][i] + args->mm_stats[i]);
|
|
+ args->mm_stats[i] = 0;
|
|
+ }
|
|
+
|
|
+ if (!last || NR_STAT_GENS == 1)
|
|
+ return;
|
|
+
|
|
+ hist = hist_from_seq_or_gen(args->max_seq + 1);
|
|
+ for (i = 0; i < NR_MM_STATS; i++)
|
|
+ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], 0);
|
|
+}
|
|
+
|
|
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
|
|
+{
|
|
+ int type;
|
|
+ unsigned long size = 0;
|
|
+
|
|
+ if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes))
|
|
+ return true;
|
|
+
|
|
+ if (mm_is_oom_victim(mm))
|
|
+ return true;
|
|
+
|
|
+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
|
|
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
|
|
+ get_mm_counter(mm, MM_ANONPAGES) +
|
|
+ get_mm_counter(mm, MM_SHMEMPAGES);
|
|
+ }
|
|
+
|
|
+ /* leave the legwork to the rmap if mappings are too sparse */
|
|
+ if (size < max(SWAP_CLUSTER_MAX, mm_pgtables_bytes(mm) / PAGE_SIZE))
|
|
+ return true;
|
|
+
|
|
+ return !mmget_not_zero(mm);
|
|
+}
|
|
+
|
|
+/* To support multiple workers that concurrently walk an mm_struct list. */
|
|
+static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter)
|
|
+{
|
|
+ bool last = true;
|
|
+ struct mm_struct *mm = NULL;
|
|
+ int nid = args->node_id;
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
|
|
+
|
|
+ if (*iter)
|
|
+ mmput_async(*iter);
|
|
+ else if (args->max_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq))
|
|
+ return false;
|
|
+
|
|
+ spin_lock(&mm_list->lock);
|
|
+
|
|
+ VM_BUG_ON(args->max_seq > mm_list->nodes[nid].cur_seq + 1);
|
|
+ VM_BUG_ON(*iter && args->max_seq < mm_list->nodes[nid].cur_seq);
|
|
+ VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_workers);
|
|
+
|
|
+ if (args->max_seq <= mm_list->nodes[nid].cur_seq) {
|
|
+ last = *iter;
|
|
+ goto done;
|
|
+ }
|
|
+
|
|
+ if (mm_list->nodes[nid].iter == &mm_list->head) {
|
|
+ VM_BUG_ON(*iter || mm_list->nodes[nid].nr_workers);
|
|
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
|
|
+ }
|
|
+
|
|
+ while (!mm && mm_list->nodes[nid].iter != &mm_list->head) {
|
|
+ mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lrugen.list);
|
|
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
|
|
+ if (should_skip_mm(mm, args))
|
|
+ mm = NULL;
|
|
+
|
|
+ args->mm_stats[mm ? MM_SCHED_ACTIVE : MM_SCHED_INACTIVE]++;
|
|
+ }
|
|
+
|
|
+ if (mm_list->nodes[nid].iter == &mm_list->head)
|
|
+ WRITE_ONCE(mm_list->nodes[nid].cur_seq,
|
|
+ mm_list->nodes[nid].cur_seq + 1);
|
|
+done:
|
|
+ if (*iter && !mm)
|
|
+ mm_list->nodes[nid].nr_workers--;
|
|
+ if (!*iter && mm)
|
|
+ mm_list->nodes[nid].nr_workers++;
|
|
+
|
|
+ last = last && !mm_list->nodes[nid].nr_workers &&
|
|
+ mm_list->nodes[nid].iter == &mm_list->head;
|
|
+
|
|
+ reset_mm_stats(mm_list, last, args);
|
|
+
|
|
+ spin_unlock(&mm_list->lock);
|
|
+
|
|
+ *iter = mm;
|
|
+ if (mm)
|
|
+ node_clear(nid, mm->lrugen.nodes);
|
|
+
|
|
+ return last;
|
|
+}
|
|
+
|
|
/******************************************************************************
|
|
* state change
|
|
******************************************************************************/
|
|
@@ -3096,6 +3413,13 @@ static int __init init_lru_gen(void)
|
|
{
|
|
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
|
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
|
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
|
|
+
|
|
+ if (mem_cgroup_disabled()) {
|
|
+ global_mm_list = alloc_mm_list();
|
|
+ if (WARN_ON_ONCE(!global_mm_list))
|
|
+ return -ENOMEM;
|
|
+ }
|
|
|
|
if (hotplug_memory_notifier(lru_gen_online_mem, 0))
|
|
pr_err("lru_gen: failed to subscribe hotplug notifications\n");
|
|
--
|
|
2.31.1.751.gd2f1c929bd-goog
|
|
|
|
|