From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.2 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
	DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
	INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,
	USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
	version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
	by smtp.lore.kernel.org (Postfix) with ESMTP id CC788C43460
	for <linux-kernel@archiver.kernel.org>; Tue, 13 Apr 2021 06:57:20 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by mail.kernel.org (Postfix) with ESMTP id ADF7B6128E
	for <linux-kernel@archiver.kernel.org>; Tue, 13 Apr 2021 06:57:20 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1345140AbhDMG5i (ORCPT
	<rfc822;linux-kernel@archiver.kernel.org>);
	Tue, 13 Apr 2021 02:57:38 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:44200 "EHLO
	lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1345088AbhDMG5Q (ORCPT
	<rfc822;linux-kernel@vger.kernel.org>);
	Tue, 13 Apr 2021 02:57:16 -0400
Received: from mail-qt1-x849.google.com (mail-qt1-x849.google.com [IPv6:2607:f8b0:4864:20::849])
	by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 8F91CC061756
	for <linux-kernel@vger.kernel.org>; Mon, 12 Apr 2021 23:56:57 -0700 (PDT)
Received: by mail-qt1-x849.google.com with SMTP id t18so666548qtw.15
	for <linux-kernel@vger.kernel.org>; Mon, 12 Apr 2021 23:56:57 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
	d=google.com; s=20161025;
	h=date:in-reply-to:message-id:mime-version:references:subject:from:to
	 :cc;
	bh=6fhJHIbNqUBjvtOegfE2MyphyVhL6hJWTXmeiM/7CYU=;
	b=nCCKEcrZRzhFu47i9x+KHFgV9bpn2QVPdLNp94/tvI2vdGJLS5yFnnrPQk/ZvV+805
	 oU9Y2xHhJFPVW5TfOLl+0cfdlw6G7bEAFmF1h4Uf+m4IIGVwMY+rg0tngfuV3hILEC/m
	 n+gQGstNi8BWz/WCQfT/CZcdFvYSUN04sTRJQZuLJPkujaFh7e8KEoTWM8Els3JqHgbc
	 LgYf9G3svPIdXSaGd7VPKBNPPf6gEFy/2HFBYAgJkJKvcduCSex9l6NdzI0GMRm0OYUM
	 C4BaQwaJZ6SJQXdHUAecfaC52R8b2Z/IZLmM44hUGJ3NGHSotvQ6lyAB8x6J2J/K2F2i
	 PJ9A==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
	d=1e100.net; s=20161025;
	h=x-gm-message-state:date:in-reply-to:message-id:mime-version
	 :references:subject:from:to:cc;
	bh=6fhJHIbNqUBjvtOegfE2MyphyVhL6hJWTXmeiM/7CYU=;
	b=RKke0otlx0Z8q7yzlS4XpyZ5aovH7VEdxD07op8jejoFs5sh8CiOsB0OWYJ7WtpxIx
	 5eGpQFXb9BDl7z/w8mHGGABHKc6R44O+H6hfTDY7lBM6ycMXzUSbjQvnLzA1hgsk5Qzz
	 dFshVj2i3XpZoeXGBCx8f9E8lOrxcWydcMYmGU5PvLhJcJh5otr+dDPYiOpTdW+v1h1F
	 7zmsGOz9U6qOA3KwGKCLm44MrC1JtdV9omiuSJHBD+QfkfnIBcdeKCwgyRE44/35eufm
	 6b2R7XpOsNHciIksiDnzt5wgJJ1KnlB7E7hjCN/Q77qQcVL7cnSVQBCcYQOvUHoJ8lNg
	 fXFA==
X-Gm-Message-State: AOAM532Oo0F4MpWnfaEOY3TDummCsibMAZArGFkZs9eTu66X+a59qfdI
	ziZoz/a2u1Q+YaODOe4XEW2tOqr3t3c=
X-Google-Smtp-Source: ABdhPJwG6wdrxi/hta1GN0K/zTCsJXK0CKzWYrx4efW6qkJhGiiXfKR8fAg0J/tzxkhd2xOMwJf4T1jXgvA=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:d02d:cccc:9ebe:9fe9])
	(user=yuzhao job=sendgmr) by 2002:ad4:4894:: with SMTP id bv20mr10806368qvb.34.1618297016759;
	Mon, 12 Apr 2021 23:56:56 -0700 (PDT)
Date: Tue, 13 Apr 2021 00:56:28 -0600
In-Reply-To: <20210413065633.2782273-1-yuzhao@google.com>
Message-Id: <20210413065633.2782273-12-yuzhao@google.com>
Mime-Version: 1.0
References: <20210413065633.2782273-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.295.g9ea45b61b8-goog
Subject: [PATCH v2 11/16] mm: multigenerational lru: aging
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Benjamin Manes <ben.manes@gmail.com>,
	Dave Chinner <david@fromorbit.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Jonathan Corbet <corbet@lwn.net>,
	Joonsoo Kim <iamjoonsoo.kim@lge.com>,
	Matthew Wilcox <willy@infradead.org>,
	Mel Gorman <mgorman@suse.de>,
	Miaohe Lin <linmiaohe@huawei.com>,
	Michael Larabel <michael@michaellarabel.com>,
	Michal Hocko <mhocko@suse.com>,
	Michel Lespinasse <michel@lespinasse.org>,
	Rik van Riel <riel@surriel.com>,
	Roman Gushchin <guro@fb.com>,
	Rong Chen <rong.a.chen@intel.com>,
	SeongJae Park <sjpark@amazon.de>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	Yang Shi <shy828301@gmail.com>,
	Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
	linux-kernel@vger.kernel.org, lkp@lists.01.org,
	page-reclaim@google.com, Yu Zhao <yuzhao@google.com>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Archived-At: <https://lore.kernel.org/lkml/20210413065633.2782273-12-yuzhao@google.com/>
List-Archive: <https://lore.kernel.org/lkml/>
List-Post: <mailto:linux-kernel@vger.kernel.org>

The aging produces young generations. Given an lruvec, the aging walks
the mm_struct list associated with this lruvec to scan page tables for
referenced pages. Upon finding one, the aging updates the generation
number of this page to max_seq. After each round of scanning, the aging
increments max_seq. The aging is due when both elements of min_seq[2]
reach max_seq-1, assuming both anon and file types are reclaimable.

The aging uses the following optimizations when scanning page tables:
  1) It will not scan page tables from processes that have been
     sleeping since the last scan.
  2) It will not scan PTE tables under non-leaf PMD entries that do
     not have the accessed bit set, when
     CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
  3) It will not zigzag between the PGD table and the same PMD or PTE
     table spanning multiple VMAs. In other words, it finishes all the
     VMAs within the range of the same PMD or PTE table before it
     returns to the PGD table. This optimizes workloads that have
     large numbers of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 mm/vmscan.c | 700 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 700 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d67dfd1e3930..31e1b4155677 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 #include <linux/dax.h>
 #include <linux/psi.h>
 #include <linux/memory.h>
+#include <linux/pagewalk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -4771,6 +4772,702 @@ static bool get_next_mm(struct mm_walk_args *args, int swappiness, struct mm_str
 	return last;
 }
 
+/******************************************************************************
+ *                          the aging
+ ******************************************************************************/
+
+static void update_batch_size(struct page *page, int old_gen, int new_gen,
+			      struct mm_walk_args *args)
+{
+	int file = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	int delta = thp_nr_pages(page);
+
+	VM_BUG_ON(old_gen >= MAX_NR_GENS);
+	VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+	args->batch_size++;
+
+	args->nr_pages[old_gen][file][zone] -= delta;
+	args->nr_pages[new_gen][file][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args)
+{
+	int gen, file, zone;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	args->batch_size = 0;
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	for_each_gen_type_zone(gen, file, zone) {
+		enum lru_list lru = LRU_FILE * file;
+		int total = args->nr_pages[gen][file][zone];
+
+		if (!total)
+			continue;
+
+		args->nr_pages[gen][file][zone] = 0;
+		WRITE_ONCE(lrugen->sizes[gen][file][zone],
+			   lrugen->sizes[gen][file][zone] + total);
+
+		if (lru_gen_is_active(lruvec, gen))
+			lru += LRU_ACTIVE;
+		update_lru_size(lruvec, lru, zone, total);
+	}
+
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static int page_update_gen(struct page *page, int new_gen)
+{
+	int old_gen;
+	unsigned long old_flags, new_flags;
+
+	VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+
+		old_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+		if (old_gen < 0)
+			new_flags = old_flags | BIT(PG_referenced);
+		else
+			new_flags = (old_flags & ~(LRU_GEN_MASK | LRU_USAGE_MASK |
+				     LRU_TIER_FLAGS)) | ((new_gen + 1UL) << LRU_GEN_PGOFF);
+
+		if (old_flags == new_flags)
+			break;
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	return old_gen;
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct mm_walk_args *args = walk->private;
+
+	if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
+	    (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)))
+		return true;
+
+	if (vma_is_anonymous(vma))
+		return !args->should_walk[0];
+
+	if (vma_is_shmem(vma))
+		return !args->should_walk[0] ||
+		       mapping_unevictable(vma->vm_file->f_mapping);
+
+	return !args->should_walk[1] || vma_is_dax(vma) ||
+	       vma == get_gate_vma(vma->vm_mm) ||
+	       mapping_unevictable(vma->vm_file->f_mapping);
+}
+
+/*
+ * Some userspace memory allocators create many single-page VMAs. So instead of
+ * returning to the PGD table for each such VMA, we finish at least an entire
+ * PMD table and therefore avoid many zigzags. This optimizes page table walks
+ * for workloads that have large numbers of tiny VMAs.
+ *
+ * We scan PMD tables in two passes. The first pass reaches PTE tables and
+ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD
+ * entries and needs to take the PMD lock. The second pass is only done on the
+ * PMD entries for which the first pass has found the accessed bit set, and
+ * they must be:
+ * 1) leaf entries mapping huge pages from the node under reclaim
+ * 2) non-leaf entries whose leaf entries only map pages from the node under
+ *    reclaim, when CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
+ */
+static bool get_next_interval(struct mm_walk *walk, unsigned long mask, unsigned long size,
+			      unsigned long *start, unsigned long *end)
+{
+	unsigned long next = round_up(*end, size);
+	struct mm_walk_args *args = walk->private;
+
+	VM_BUG_ON(mask & size);
+	VM_BUG_ON(*start != *end);
+	VM_BUG_ON(!(*end & ~mask));
+	VM_BUG_ON((*end & mask) != (next & mask));
+
+	while (walk->vma) {
+		if (next >= walk->vma->vm_end) {
+			walk->vma = walk->vma->vm_next;
+			continue;
+		}
+
+		if ((next & mask) != (walk->vma->vm_start & mask))
+			return false;
+
+		if (next <= walk->vma->vm_start &&
+		    should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
+			walk->vma = walk->vma->vm_next;
+			continue;
+		}
+
+		args->mm_stats[MM_VMA_INTERVAL]++;
+
+		*start = max(next, walk->vma->vm_start);
+		next = (next | ~mask) + 1;
+		/* rounded-up boundaries can wrap to 0 */
+		*end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
+
+		return true;
+	}
+
+	return false;
+}
+
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+			   struct mm_walk *walk)
+{
+	int i;
+	pte_t *pte;
+	spinlock_t *ptl;
+	int remote = 0;
+	struct mm_walk_args *args = walk->private;
+	int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
+
+	VM_BUG_ON(pmd_leaf(*pmd));
+
+	pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl);
+	arch_enter_lazy_mmu_mode();
+restart:
+	for (i = pte_index(start); start != end; i++, start += PAGE_SIZE) {
+		struct page *page;
+		unsigned long pfn = pte_pfn(pte[i]);
+
+		if (!pte_present(pte[i]) || is_zero_pfn(pfn)) {
+			args->mm_stats[MM_LEAF_HOLE]++;
+			continue;
+		}
+
+		if (!pte_young(pte[i])) {
+			args->mm_stats[MM_LEAF_OLD]++;
+			continue;
+		}
+
+		if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+			remote++;
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		page = compound_head(pfn_to_page(pfn));
+		if (page_to_nid(page) != args->node_id) {
+			remote++;
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		if (!ptep_test_and_clear_young(walk->vma, start, pte + i))
+			continue;
+
+		if (pte_dirty(pte[i]) && !PageDirty(page) &&
+		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) {
+			set_page_dirty(page);
+			args->mm_stats[MM_LEAF_DIRTY]++;
+		}
+
+		if (page_memcg_rcu(page) != args->memcg) {
+			args->mm_stats[MM_LEAF_OTHER_MEMCG]++;
+			continue;
+		}
+
+		old_gen = page_update_gen(page, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			update_batch_size(page, old_gen, new_gen, args);
+		args->mm_stats[MM_LEAF_YOUNG]++;
+	}
+
+	if (i < PTRS_PER_PTE && get_next_interval(walk, PMD_MASK, PAGE_SIZE, &start, &end))
+		goto restart;
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(pte, ptl);
+
+	return !remote;
+}
+
+static bool walk_pmd_range_unlocked(pud_t *pud, unsigned long start, unsigned long end,
+				    struct mm_walk *walk)
+{
+	int i;
+	pmd_t *pmd;
+	unsigned long next;
+	int young = 0;
+	struct mm_walk_args *args = walk->private;
+
+	VM_BUG_ON(pud_leaf(*pud));
+
+	pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+	for (i = pmd_index(start); start != end; i++, start = next) {
+		pmd_t val = pmd_read_atomic(pmd + i);
+
+		next = pmd_addr_end(start, end);
+
+		barrier();
+		if (!pmd_present(val) || is_huge_zero_pmd(val)) {
+			args->mm_stats[MM_LEAF_HOLE]++;
+			continue;
+		}
+
+		if (pmd_trans_huge(val)) {
+			unsigned long pfn = pmd_pfn(val);
+
+			if (!pmd_young(val)) {
+				args->mm_stats[MM_LEAF_OLD]++;
+				continue;
+			}
+
+			if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+				args->mm_stats[MM_LEAF_OTHER_NODE]++;
+				continue;
+			}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+			young++;
+			__set_bit(i, args->bitmap);
+#endif
+			continue;
+		}
+
+#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
+		if (!pmd_young(val)) {
+			args->mm_stats[MM_NONLEAF_OLD]++;
+			continue;
+		}
+#endif
+
+		if (walk_pte_range(&val, start, next, walk)) {
+#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
+			young++;
+			__set_bit(i, args->bitmap);
+#endif
+		}
+	}
+
+	if (i < PTRS_PER_PMD && get_next_interval(walk, PUD_MASK, PMD_SIZE, &start, &end))
+		goto restart;
+
+	return young;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, unsigned long end,
+				  struct mm_walk *walk)
+{
+	int i;
+	pmd_t *pmd;
+	spinlock_t *ptl;
+	struct mm_walk_args *args = walk->private;
+	int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
+
+	VM_BUG_ON(pud_leaf(*pud));
+
+	start &= PUD_MASK;
+	pmd = pmd_offset(pud, start);
+	ptl = pmd_lock(walk->mm, pmd);
+	arch_enter_lazy_mmu_mode();
+
+	for_each_set_bit(i, args->bitmap, PTRS_PER_PMD) {
+		struct page *page;
+		unsigned long pfn = pmd_pfn(pmd[i]);
+		unsigned long addr = start + PMD_SIZE * i;
+
+		if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) {
+			args->mm_stats[MM_LEAF_HOLE]++;
+			continue;
+		}
+
+		if (!pmd_young(pmd[i])) {
+			args->mm_stats[MM_LEAF_OLD]++;
+			continue;
+		}
+
+		if (!pmd_trans_huge(pmd[i])) {
+#ifdef CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG
+			args->mm_stats[MM_NONLEAF_YOUNG]++;
+			pmdp_test_and_clear_young(walk->vma, addr, pmd + i);
+#endif
+			continue;
+		}
+
+		if (pfn < args->start_pfn || pfn >= args->end_pfn) {
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		page = pfn_to_page(pfn);
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		if (page_to_nid(page) != args->node_id) {
+			args->mm_stats[MM_LEAF_OTHER_NODE]++;
+			continue;
+		}
+
+		if (!pmdp_test_and_clear_young(walk->vma, addr, pmd + i))
+			continue;
+
+		if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
+		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) {
+			set_page_dirty(page);
+			args->mm_stats[MM_LEAF_DIRTY]++;
+		}
+
+		if (page_memcg_rcu(page) != args->memcg) {
+			args->mm_stats[MM_LEAF_OTHER_MEMCG]++;
+			continue;
+		}
+
+		old_gen = page_update_gen(page, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			update_batch_size(page, old_gen, new_gen, args);
+		args->mm_stats[MM_LEAF_YOUNG]++;
+	}
+
+	arch_leave_lazy_mmu_mode();
+	spin_unlock(ptl);
+
+	memset(args->bitmap, 0, sizeof(args->bitmap));
+}
+#else
+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, unsigned long end,
+				  struct mm_walk *walk)
+{
+}
+#endif
+
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+			  struct mm_walk *walk)
+{
+	int i;
+	pud_t *pud;
+	unsigned long next;
+	struct mm_walk_args *args = walk->private;
+
+	VM_BUG_ON(p4d_leaf(*p4d));
+
+	pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+	for (i = pud_index(start); start != end; i++, start = next) {
+		pud_t val = READ_ONCE(pud[i]);
+
+		next = pud_addr_end(start, end);
+
+		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+			continue;
+
+		if (walk_pmd_range_unlocked(&val, start, next, walk))
+			walk_pmd_range_locked(&val, start, next, walk);
+
+		if (args->batch_size >= MAX_BATCH_SIZE) {
+			end = (start | ~PUD_MASK) + 1;
+			goto done;
+		}
+	}
+
+	if (i < PTRS_PER_PUD && get_next_interval(walk, P4D_MASK, PUD_SIZE, &start, &end))
+		goto restart;
+
+	end = round_up(end, P4D_SIZE);
+done:
+	/* rounded-up boundaries can wrap to 0 */
+	args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
+
+	return -EAGAIN;
+}
+
+static void walk_mm(struct mm_walk_args *args, int swappiness, struct mm_struct *mm)
+{
+	static const struct mm_walk_ops mm_walk_ops = {
+		.test_walk = should_skip_vma,
+		.p4d_entry = walk_pud_range,
+	};
+
+	int err;
+	int file;
+	int nid = args->node_id;
+	struct mem_cgroup *memcg = args->memcg;
+	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+	args->next_addr = FIRST_USER_ADDRESS;
+	for (file = !swappiness; file < ANON_AND_FILE; file++)
+		args->should_walk[file] = lru_gen_mm_is_active(mm) ||
+					  node_isset(nid, mm->lrugen.nodes[file]);
+
+	do {
+		unsigned long start = args->next_addr;
+		unsigned long end = mm->highest_vm_end;
+
+		err = -EBUSY;
+
+		preempt_disable();
+		rcu_read_lock();
+
+#ifdef CONFIG_MEMCG
+		if (memcg && atomic_read(&memcg->moving_account)) {
+			args->mm_stats[MM_LOCK_CONTENTION]++;
+			goto contended;
+		}
+#endif
+		if (!mmap_read_trylock(mm)) {
+			args->mm_stats[MM_LOCK_CONTENTION]++;
+			goto contended;
+		}
+
+		err = walk_page_range(mm, start, end, &mm_walk_ops, args);
+
+		mmap_read_unlock(mm);
+
+		if (args->batch_size)
+			reset_batch_size(lruvec, args);
+contended:
+		rcu_read_unlock();
+		preempt_enable();
+
+		cond_resched();
+	} while (err == -EAGAIN && args->next_addr &&
+		 !mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg));
+
+	if (err == -EBUSY)
+		return;
+
+	for (file = !swappiness; file < ANON_AND_FILE; file++) {
+		if (args->should_walk[file])
+			node_clear(nid, mm->lrugen.nodes[file]);
+	}
+}
+
+static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool front)
+{
+	int old_gen, new_gen;
+	unsigned long old_flags, new_flags;
+	int file = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	old_gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+		VM_BUG_ON_PAGE(new_gen < 0, page);
+		if (new_gen >= 0 && new_gen != old_gen)
+			goto sort;
+
+		new_gen = (old_gen + 1) % MAX_NR_GENS;
+		new_flags = (old_flags & ~(LRU_GEN_MASK | LRU_USAGE_MASK | LRU_TIER_FLAGS)) |
+			    ((new_gen + 1UL) << LRU_GEN_PGOFF);
+		/* mark the page for reclaim if it's pending writeback */
+		if (front)
+			new_flags |= BIT(PG_reclaim);
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	lru_gen_update_size(page, lruvec, old_gen, new_gen);
+sort:
+	if (front)
+		list_move(&page->lru, &lrugen->lists[new_gen][file][zone]);
+	else
+		list_move_tail(&page->lru, &lrugen->lists[new_gen][file][zone]);
+}
+
+static bool try_inc_min_seq(struct lruvec *lruvec, int file)
+{
+	int gen, zone;
+	bool success = false;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	while (get_nr_gens(lruvec, file) > MIN_NR_GENS) {
+		gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			if (!list_empty(&lrugen->lists[gen][file][zone]))
+				return success;
+		}
+
+		reset_controller_pos(lruvec, gen, file);
+		WRITE_ONCE(lrugen->min_seq[file], lrugen->min_seq[file] + 1);
+
+		success = true;
+	}
+
+	return success;
+}
+
+static bool inc_min_seq(struct lruvec *lruvec, int file)
+{
+	int gen, zone;
+	int batch_size = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	if (get_nr_gens(lruvec, file) != MAX_NR_GENS)
+		return true;
+
+	gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		struct list_head *head = &lrugen->lists[gen][file][zone];
+
+		while (!list_empty(head)) {
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != file, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			page_inc_gen(page, lruvec, false);
+
+			if (++batch_size == MAX_BATCH_SIZE)
+				return false;
+		}
+
+		VM_BUG_ON(lrugen->sizes[gen][file][zone]);
+	}
+
+	reset_controller_pos(lruvec, gen, file);
+	WRITE_ONCE(lrugen->min_seq[file], lrugen->min_seq[file] + 1);
+
+	return true;
+}
+
+static void inc_max_seq(struct lruvec *lruvec)
+{
+	int gen, file, zone;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	for (file = 0; file < ANON_AND_FILE; file++) {
+		if (try_inc_min_seq(lruvec, file))
+			continue;
+
+		while (!inc_min_seq(lruvec, file)) {
+			spin_unlock_irq(&lruvec->lru_lock);
+			cond_resched();
+			spin_lock_irq(&lruvec->lru_lock);
+		}
+	}
+
+	gen = lru_gen_from_seq(lrugen->max_seq - 1);
+	for_each_type_zone(file, zone) {
+		enum lru_list lru = LRU_FILE * file;
+		long total = lrugen->sizes[gen][file][zone];
+
+		if (!total)
+			continue;
+
+		WARN_ON_ONCE(total != (int)total);
+
+		update_lru_size(lruvec, lru, zone, total);
+		update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -total);
+	}
+
+	gen = lru_gen_from_seq(lrugen->max_seq + 1);
+	for_each_type_zone(file, zone) {
+		VM_BUG_ON(lrugen->sizes[gen][file][zone]);
+		VM_BUG_ON(!list_empty(&lrugen->lists[gen][file][zone]));
+	}
+
+	for (file = 0; file < ANON_AND_FILE; file++)
+		reset_controller_pos(lruvec, gen, file);
+
+	WRITE_ONCE(lrugen->timestamps[gen], jiffies);
+	/* make sure all preceding modifications appear first */
+	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+/* Main function used by foreground, background and user-triggered aging. */
+static bool walk_mm_list(struct lruvec *lruvec, unsigned long max_seq,
+			 struct scan_control *sc, int swappiness, struct mm_walk_args *args)
+{
+	bool last;
+	bool alloc = !args;
+	struct mm_struct *mm = NULL;
+	struct lrugen *lrugen = &lruvec->evictable;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	int nid = pgdat->node_id;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+	VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
+
+	/*
+	 * For each walk of the mm_struct list of a memcg, we decrement the
+	 * priority of its lrugen. For each walk of all memcgs in kswapd, we
+	 * increment the priority of every lrugen.
+	 *
+	 * So if this lrugen has a higher priority (smaller value), it means
+	 * other concurrent reclaimers have walked its mm list, and we skip it
+	 * for this priority in order to balance the pressure on all memcgs.
+	 */
+	if (!mem_cgroup_disabled() && !cgroup_reclaim(sc) &&
+	    sc->priority > atomic_read(&lrugen->priority))
+		return false;
+
+	if (alloc) {
+		args = kvzalloc_node(sizeof(*args), GFP_KERNEL, nid);
+		if (!args)
+			return false;
+	}
+
+	args->memcg = memcg;
+	args->max_seq = max_seq;
+	args->start_pfn = pgdat->node_start_pfn;
+	args->end_pfn = pgdat_end_pfn(pgdat);
+	args->node_id = nid;
+
+	do {
+		last = get_next_mm(args, swappiness, &mm);
+		if (mm)
+			walk_mm(args, swappiness, mm);
+
+		cond_resched();
+	} while (mm);
+
+	if (alloc)
+		kvfree(args);
+
+	if (!last) {
+		/* foreground aging prefers not to wait unless "necessary" */
+		if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2)
+			wait_event_killable(mm_list->nodes[nid].wait,
+					    max_seq < READ_ONCE(lrugen->max_seq));
+
+		return max_seq < READ_ONCE(lrugen->max_seq);
+	}
+
+	VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
+
+	inc_max_seq(lruvec);
+
+	if (!mem_cgroup_disabled())
+		atomic_add_unless(&lrugen->priority, -1, 0);
+
+	/* order against inc_max_seq() */
+	smp_mb();
+	/* either we see any waiters or they will see the updated max_seq */
+	if (waitqueue_active(&mm_list->nodes[nid].wait))
+		wake_up_all(&mm_list->nodes[nid].wait);
+
+	wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+	return true;
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
@@ -5002,6 +5699,9 @@ static int __init init_lru_gen(void)
 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
 	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+	BUILD_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE);
+	BUILD_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD);
+	BUILD_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD);
 
 	if (mem_cgroup_disabled()) {
 		global_mm_list = alloc_mm_list();
-- 
2.31.1.295.g9ea45b61b8-goog