From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
Date: Thu, 20 May 2021 00:53:52 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-12-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 11/14] mm: multigenerational lru: eviction
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        Dave Chinner <david@fromorbit.com>,
        Dave Hansen <dave.hansen@linux.intel.com>,
        Donald Carr <sirspudd@gmail.com>,
        Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
        Johannes Weiner <hannes@cmpxchg.org>,
        Jonathan Corbet <corbet@lwn.net>,
        Joonsoo Kim <iamjoonsoo.kim@lge.com>,
        Konstantin Kharlamov <hi-angel@yandex.ru>,
        Marcus Seyfarth <m.seyfarth@gmail.com>,
        Matthew Wilcox <willy@infradead.org>,
        Mel Gorman <mgorman@suse.de>,
        Miaohe Lin <linmiaohe@huawei.com>,
        Michael Larabel <michael@michaellarabel.com>,
        Michal Hocko <mhocko@suse.com>,
        Michel Lespinasse <michel@lespinasse.org>,
        Rik van Riel <riel@surriel.com>,
        Roman Gushchin <guro@fb.com>,
        Tim Chen <tim.c.chen@linux.intel.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Yang Shi <shy828301@gmail.com>,
        Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
        linux-kernel@vger.kernel.org, lkp@lists.01.org,
        page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
        Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>

The eviction consumes old generations. Given an lruvec, it scans the
pages on the per-zone lists indexed by either of min_seq[2]. It first
tries to select a type based on the values of min_seq[2]. When the anon
and file types are both available from the same generation, it selects
the one with the lower refault rate.
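
(Illustrative only, not part of the patch: a rough standalone model of
the type selection described above. The names below, e.g. select_type()
and refault_rate[], are made up for the example.)

	enum { TYPE_ANON, TYPE_FILE };

	/*
	 * Prefer the type whose oldest generation is older (smaller
	 * min_seq); when both types come from the same generation,
	 * prefer the type that refaults less.
	 */
	static int select_type(const unsigned long min_seq[2],
			       const double refault_rate[2])
	{
		if (min_seq[TYPE_ANON] != min_seq[TYPE_FILE])
			return min_seq[TYPE_ANON] < min_seq[TYPE_FILE] ?
			       TYPE_ANON : TYPE_FILE;
		return refault_rate[TYPE_ANON] <= refault_rate[TYPE_FILE] ?
		       TYPE_ANON : TYPE_FILE;
	}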

During a scan, the eviction sorts pages according to their new
generation numbers if the aging has found them referenced. It also
moves pages from tiers that have higher refault rates than tier 0 to
the next generation. When it finds that all the per-zone lists of the
selected type are empty, the eviction increments min_seq[2] indexed by
that type.
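
(Illustrative only, not part of the patch: a simplified C model of the
per-page decision made while scanning the oldest generation. The type
and helper names here are stand-ins, not symbols from the patch.)

	enum scan_verdict { KEEP_NEW_GEN, KEEP_NEXT_GEN, ISOLATE };

	struct scanned_page {
		int gen;	/* generation recorded in the page flags */
		int tier;	/* tier derived from the usage counter */
	};

	static enum scan_verdict sort_one(const struct scanned_page *p,
					  int oldest_gen, int tier_to_isolate)
	{
		/* the aging saw it referenced: move it to its new generation */
		if (p->gen != oldest_gen)
			return KEEP_NEW_GEN;
		/* its tier refaults more than the base tier: promote it */
		if (p->tier > tier_to_isolate)
			return KEEP_NEXT_GEN;
		return ISOLATE;	/* candidate for eviction */
	}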

With the aging and the eviction in place, we can build page reclaim in
a straightforward manner:
1) In order to reduce the latency, direct reclaim only invokes the
   aging when both min_seq[2] have reached max_seq-1; otherwise it
   invokes the eviction.
2) In order to avoid the aging in the direct reclaim path, kswapd does
   the background aging. It invokes the aging when either of min_seq[2]
   reaches max_seq-1; otherwise it invokes the eviction (see the sketch
   below).
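
(Illustrative only, not part of the patch: the aging-vs-eviction split
described in 1) and 2) above, modeled as standalone C. The struct and
function names are made up for the example.)

	struct seq_model {
		unsigned long max_seq;
		unsigned long min_seq[2];	/* [0] anon, [1] file */
	};

	/* direct reclaim ages only when BOTH types have caught up */
	static _Bool direct_reclaim_should_age(const struct seq_model *m)
	{
		return m->min_seq[0] + 1 >= m->max_seq &&
		       m->min_seq[1] + 1 >= m->max_seq;
	}

	/* kswapd ages as soon as EITHER type has caught up */
	static _Bool kswapd_should_age(const struct seq_model *m)
	{
		return m->min_seq[0] + 1 >= m->max_seq ||
		       m->min_seq[1] + 1 >= m->max_seq;
	}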

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
 include/linux/mmzone.h |   5 +
 mm/vmscan.c            | 540 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 545 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 38de59fcbe54..ded72f44d7e7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -863,6 +863,8 @@ struct deferred_split {
 };
 #endif
 
+struct mm_walk_args;
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -968,6 +970,9 @@ typedef struct pglist_data {
 
 	unsigned long flags;
 
+#ifdef CONFIG_LRU_GEN
+	struct mm_walk_args *mm_walk_args;
+#endif
 	ZONE_PADDING(_pad2_)
 
 	/* Per-node vmstats */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 837d5e6a821e..2f86dcc04c56 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1311,6 +1311,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 		if (!sc->may_unmap && page_mapped(page))
 			goto keep_locked;
 
+		/* in case the page was found accessed by lru_gen_scan_around() */
+		if (lru_gen_enabled() && !ignore_references &&
+		    page_mapped(page) && PageReferenced(page))
+			goto keep_locked;
+
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
@@ -2431,6 +2436,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 	unsigned long file;
 	struct lruvec *target_lruvec;
 
+	if (lru_gen_enabled())
+		return;
+
 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
 	/*
@@ -3970,6 +3978,489 @@ void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
 			set_page_dirty(pte_page(pte[i]));
 }
 
+/******************************************************************************
+ *                          the eviction
+ ******************************************************************************/
+
+static bool should_skip_page(struct page *page, struct scan_control *sc)
+{
+	if (!sc->may_unmap && page_mapped(page))
+		return true;
+
+	if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+	    (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
+		return true;
+
+	if (!get_page_unless_zero(page))
+		return true;
+
+	if (!TestClearPageLRU(page)) {
+		put_page(page);
+		return true;
+	}
+
+	return false;
+}
+
+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_to_isolate)
+{
+	bool success;
+	int gen = page_lru_gen(page);
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	int tier = lru_tier_from_usage(page_tier_usage(page));
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	VM_BUG_ON_PAGE(gen == -1, page);
+	VM_BUG_ON_PAGE(tier_to_isolate < 0, page);
+
+	/* a lazy-free page that has been written into? */
+	if (type && PageDirty(page) && PageAnon(page)) {
+		success = lru_gen_deletion(page, lruvec);
+		VM_BUG_ON_PAGE(!success, page);
+		SetPageSwapBacked(page);
+		add_page_to_lru_list_tail(page, lruvec);
+		return true;
+	}
+
+	/* page_update_gen() has updated the gen #? */
+	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+		return true;
+	}
+
+	/* activate this page if its tier has a higher refault rate */
+	if (tier_to_isolate < tier) {
+		int hist = hist_from_seq_or_gen(gen);
+
+		page_inc_gen(page, lruvec, false);
+		WRITE_ONCE(lrugen->activated[hist][type][tier - 1],
+			   lrugen->activated[hist][type][tier - 1] + thp_nr_pages(page));
+		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type);
+		return true;
+	}
+
+	/* mark this page for reclaim if it's pending writeback */
+	if (PageWriteback(page) || (type && PageDirty(page))) {
+		page_inc_gen(page, lruvec, true);
+		return true;
+	}
+
+	return false;
+}
+
+static void isolate_page(struct page *page, struct lruvec *lruvec)
+{
+	bool success;
+
+	success = lru_gen_deletion(page, lruvec);
+	VM_BUG_ON_PAGE(!success, page);
+
+	if (PageActive(page)) {
+		ClearPageActive(page);
+		/* make sure shrink_page_list() rejects this page */
+		SetPageReferenced(page);
+		return;
+	}
+
+	/* make sure shrink_page_list() doesn't try to write this page */
+	ClearPageReclaim(page);
+	/* make sure shrink_page_list() doesn't reject this page */
+	ClearPageReferenced(page);
+}
+
+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, long *nr_to_scan,
+		      int type, int tier, struct list_head *list)
+{
+	bool success;
+	int gen, zone;
+	enum vm_event_item item;
+	int sorted = 0;
+	int scanned = 0;
+	int isolated = 0;
+	int batch_size = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+	VM_BUG_ON(!list_empty(list));
+
+	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+		return -ENOENT;
+
+	gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+	for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+		LIST_HEAD(moved);
+		int skipped = 0;
+		struct list_head *head = &lrugen->lists[gen][type][zone];
+
+		while (!list_empty(head)) {
+			struct page *page = lru_to_page(head);
+			int delta = thp_nr_pages(page);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			scanned += delta;
+
+			if (sort_page(page, lruvec, tier))
+				sorted += delta;
+			else if (should_skip_page(page, sc)) {
+				list_move(&page->lru, &moved);
+				skipped += delta;
+			} else {
+				isolate_page(page, lruvec);
+				list_add(&page->lru, list);
+				isolated += delta;
+			}
+
+			if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+			    ++batch_size == MAX_BATCH_SIZE)
+				break;
+		}
+
+		list_splice(&moved, head);
+		__count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+
+		if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+		    batch_size == MAX_BATCH_SIZE)
+			break;
+	}
+
+	success = try_inc_min_seq(lruvec, type);
+
+	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+	if (!cgroup_reclaim(sc)) {
+		__count_vm_events(item, scanned);
+		__count_vm_events(PGREFILL, sorted);
+	}
+	__count_memcg_events(memcg, item, scanned);
+	__count_memcg_events(memcg, PGREFILL, sorted);
+	__count_vm_events(PGSCAN_ANON + type, scanned);
+
+	*nr_to_scan -= scanned;
+
+	if (*nr_to_scan <= 0 || success || isolated)
+		return isolated;
+	/*
+	 * We may have trouble finding eligible pages due to reclaim_idx,
+	 * may_unmap and may_writepage. The following check makes sure we won't
+	 * be stuck if we aren't making enough progress.
+	 */
+	return batch_size == MAX_BATCH_SIZE && sorted >= SWAP_CLUSTER_MAX ? 0 : -ENOENT;
+}
+
+static int get_tier_to_isolate(struct lruvec *lruvec, int type)
+{
+	int tier;
+	struct controller_pos sp, pv;
+
+	/*
+	 * Ideally we don't want to evict upper tiers that have higher refault
+	 * rates. However, we need to leave a margin for the fluctuations in
+	 * refault rates. So we use a larger gain factor to make sure upper
+	 * tiers are indeed more active. We choose 2 because the lowest upper
+	 * tier would have twice of the refault rate of the base tier, according
+	 * to their numbers of accesses.
+	 */
+	read_controller_pos(&sp, lruvec, type, 0, 1);
+	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+		read_controller_pos(&pv, lruvec, type, tier, 2);
+		if (!positive_ctrl_err(&sp, &pv))
+			break;
+	}
+
+	return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_to_isolate)
+{
+	int type, tier;
+	struct controller_pos sp, pv;
+	int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+	/*
+	 * Compare the refault rates between the base tiers of anon and file to
+	 * determine which type to evict. Also need to compare the refault rates
+	 * of the upper tiers of the selected type with that of the base tier of
+	 * the other type to determine which tier of the selected type to evict.
+	 */
+	read_controller_pos(&sp, lruvec, 0, 0, gain[0]);
+	read_controller_pos(&pv, lruvec, 1, 0, gain[1]);
+	type = positive_ctrl_err(&sp, &pv);
+
+	read_controller_pos(&sp, lruvec, !type, 0, gain[!type]);
+	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+		read_controller_pos(&pv, lruvec, type, tier, gain[type]);
+		if (!positive_ctrl_err(&sp, &pv))
+			break;
+	}
+
+	*tier_to_isolate = tier - 1;
+
+	return type;
+}
+
+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+			 long *nr_to_scan, int *type_to_scan, struct list_head *list)
+{
+	int i;
+	int type;
+	int isolated;
+	int tier = -1;
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	if (max_nr_gens(max_seq, min_seq, swappiness) == MIN_NR_GENS)
+		return 0;
+	/*
+	 * Try to select a type based on generations and swappiness, and if that
+	 * fails, fall back to get_type_to_scan(). When anon and file are both
+	 * available from the same generation, swappiness 200 is interpreted as
+	 * anon first and swappiness 1 is interpreted as file first.
+	 */
+	type = !swappiness || min_seq[0] > min_seq[1] ||
+	       (min_seq[0] == min_seq[1] && swappiness != 200 &&
+		(swappiness == 1 || get_type_to_scan(lruvec, swappiness, &tier)));
+
+	if (tier == -1)
+		tier = get_tier_to_isolate(lruvec, type);
+
+	for (i = !swappiness; i < ANON_AND_FILE; i++) {
+		isolated = scan_pages(lruvec, sc, nr_to_scan, type, tier, list);
+		if (isolated >= 0)
+			break;
+
+		type = !type;
+		tier = get_tier_to_isolate(lruvec, type);
+	}
+
+	if (isolated < 0)
+		isolated = *nr_to_scan = 0;
+
+	*type_to_scan = type;
+
+	return isolated;
+}
+
+/* Main function used by the foreground, the background and the user-triggered eviction. */
+static bool evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+			long *nr_to_scan)
+{
+	int type;
+	int isolated;
+	int reclaimed;
+	LIST_HEAD(list);
+	struct page *page;
+	enum vm_event_item item;
+	struct reclaim_stat stat;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	isolated = isolate_pages(lruvec, sc, swappiness, nr_to_scan, &type, &list);
+	VM_BUG_ON(list_empty(&list) == !!isolated);
+
+	if (isolated)
+		__mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, isolated);
+
+	spin_unlock_irq(&lruvec->lru_lock);
+
+	if (!isolated)
+		goto done;
+
+	reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
+	/*
+	 * We need to prevent rejected pages from being added back to the same
+	 * lists they were isolated from. Otherwise we may risk looping on them
+	 * forever. We use PageActive() or !PageReferenced() && PageWorkingset()
+	 * to tell lru_gen_addition() not to add them to the oldest generation.
+	 */
+	list_for_each_entry(page, &list, lru) {
+		if (PageMlocked(page))
+			continue;
+
+		if (page_mapped(page) && PageReferenced(page))
+			SetPageActive(page);
+		else {
+			ClearPageActive(page);
+			SetPageWorkingset(page);
+		}
+		ClearPageReferenced(page);
+	}
+
+	spin_lock_irq(&lruvec->lru_lock);
+
+	move_pages_to_lru(lruvec, &list);
+
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, -isolated);
+
+	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+	if (!cgroup_reclaim(sc))
+		__count_vm_events(item, reclaimed);
+	__count_memcg_events(lruvec_memcg(lruvec), item, reclaimed);
+	__count_vm_events(PGSTEAL_ANON + type, reclaimed);
+
+	spin_unlock_irq(&lruvec->lru_lock);
+
+	mem_cgroup_uncharge_list(&list);
+	free_unref_page_list(&list);
+
+	sc->nr_reclaimed += reclaimed;
+done:
+	return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim;
+}
+
+/******************************************************************************
+ *                          page reclaim
+ ******************************************************************************/
+
+static int get_swappiness(struct lruvec *lruvec)
+{
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	int swappiness = mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
+			 mem_cgroup_swappiness(memcg) : 0;
+
+	VM_BUG_ON(swappiness > 200U);
+
+	return swappiness;
+}
+
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+				    int swappiness)
+{
+	int gen, type, zone;
+	long nr_to_scan = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	lru_add_drain();
+
+	for (type = !swappiness; type < ANON_AND_FILE; type++) {
+		unsigned long seq;
+
+		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone <= sc->reclaim_idx; zone++)
+				nr_to_scan += READ_ONCE(lrugen->sizes[gen][type][zone]);
+		}
+	}
+
+	nr_to_scan = max(nr_to_scan, 0L);
+	nr_to_scan = round_up(nr_to_scan >> sc->priority, SWAP_CLUSTER_MAX);
+
+	if (max_nr_gens(max_seq, min_seq, swappiness) > MIN_NR_GENS)
+		return nr_to_scan;
+
+	/* kswapd uses lru_gen_age_node() */
+	if (current_is_kswapd())
+		return 0;
+
+	return walk_mm_list(lruvec, max_seq, sc, swappiness, NULL) ? nr_to_scan : 0;
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct blk_plug plug;
+	unsigned long scanned = 0;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+	blk_start_plug(&plug);
+
+	while (true) {
+		long nr_to_scan;
+		int swappiness = sc->may_swap ? get_swappiness(lruvec) : 0;
+
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness) - scanned;
+		if (nr_to_scan < (long)SWAP_CLUSTER_MAX)
+			break;
+
+		scanned += nr_to_scan;
+
+		if (!evict_pages(lruvec, sc, swappiness, &nr_to_scan))
+			break;
+
+		scanned -= nr_to_scan;
+
+		if (mem_cgroup_below_min(memcg) ||
+		    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+			break;
+
+		cond_resched();
+	}
+
+	blk_finish_plug(&plug);
+}
+
+/******************************************************************************
+ *                          the background aging
+ ******************************************************************************/
+
+static int lru_gen_spread = MIN_NR_GENS;
+
+static void try_walk_mm_list(struct lruvec *lruvec, struct scan_control *sc)
+{
+	int gen, type, zone;
+	long old_and_young[2] = {};
+	int spread = READ_ONCE(lru_gen_spread);
+	int swappiness = get_swappiness(lruvec);
+	struct lrugen *lrugen = &lruvec->evictable;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	lru_add_drain();
+
+	for (type = !swappiness; type < ANON_AND_FILE; type++) {
+		unsigned long seq;
+
+		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+				old_and_young[seq == max_seq] +=
+					READ_ONCE(lrugen->sizes[gen][type][zone]);
+		}
+	}
+
+	old_and_young[0] = max(old_and_young[0], 0L);
+	old_and_young[1] = max(old_and_young[1], 0L);
+
+	/* try to spread pages out across spread+1 generations */
+	if (old_and_young[0] >= old_and_young[1] * spread &&
+	    min_nr_gens(max_seq, min_seq, swappiness) > max(spread, MIN_NR_GENS))
+		return;
+
+	walk_mm_list(lruvec, max_seq, sc, swappiness, pgdat->mm_walk_args);
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	struct mem_cgroup *memcg;
+
+	VM_BUG_ON(!current_is_kswapd());
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+		if (!mem_cgroup_below_min(memcg) &&
+		    (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
+			try_walk_mm_list(lruvec, sc);
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
@@ -4172,6 +4663,21 @@ static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *se
 	return NOTIFY_DONE;
 }
 
+static void lru_gen_start_kswapd(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid);
+	WARN_ON_ONCE(!pgdat->mm_walk_args);
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	kvfree(pgdat->mm_walk_args);
+}
+
 /******************************************************************************
  *                          initialization
  ******************************************************************************/
@@ -4220,6 +4726,24 @@ static int __init init_lru_gen(void)
  */
 arch_initcall(init_lru_gen);
 
+#else /* CONFIG_LRU_GEN */
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_start_kswapd(int nid)
+{
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+}
+
 #endif /* CONFIG_LRU_GEN */
 
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -4233,6 +4757,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	struct blk_plug plug;
 	bool scan_adjusted;
 
+	if (lru_gen_enabled()) {
+		lru_gen_shrink_lruvec(lruvec, sc);
+		return;
+	}
+
 	get_scan_count(lruvec, sc, nr);
 
 	/* Record the original scan target for proportional adjustments later */
@@ -4699,6 +5228,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
 	struct lruvec *target_lruvec;
 	unsigned long refaults;
 
+	if (lru_gen_enabled())
+		return;
+
 	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
 	target_lruvec->refaults[0] = refaults;
@@ -5073,6 +5605,11 @@ static void age_active_anon(struct pglist_data *pgdat,
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
+	if (lru_gen_enabled()) {
+		lru_gen_age_node(pgdat, sc);
+		return;
+	}
+
 	if (!total_swap_pages)
 		return;
 
@@ -5753,6 +6290,8 @@ int kswapd_run(int nid)
 	if (pgdat->kswapd)
 		return 0;
 
+	lru_gen_start_kswapd(nid);
+
 	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
@@ -5775,6 +6314,7 @@ void kswapd_stop(int nid)
 	if (kswapd) {
 		kthread_stop(kswapd);
 		NODE_DATA(nid)->kswapd = NULL;
+		lru_gen_stop_kswapd(nid);
 	}
 }
 
-- 
2.31.1.751.gd2f1c929bd-goog