Added kernel

Gerben Jan Dijkman 2021-07-29 22:50:48 +02:00
parent ae0a534e13
commit 5a003fc1a8
29 changed files with 16124 additions and 0 deletions


@@ -0,0 +1,3 @@
DIST all-5.13.5.patch 15071574 BLAKE2B f0b44888b216a60bb12a920a170ffb8ee705e357b82b0cacd58551e2d0e257c0f4419c34976263dc062335bb37f4b3a7418f3d9674e601fd8adda88bacad97d6 SHA512 046f42a5c8fe6477cdda82f47a07093ea51cf26b231b1c58230885954b7ecab9faa9eb72ac3c0cb1603dd6ca2b5b0d76421de6d2c3c05a0bee3ca6e080bfa084
DIST linux-5.13.tar.xz 119297284 BLAKE2B 9c4c12e2394dec064adff51f7ccdf389192eb27ba7906db5eda543afe3d04afca6b9ea0848a057571bf2534eeb98e1e3a67734deff82c0d3731be205ad995668 SHA512 a8edf97e9d38a49f1be2bde1e29ad96274bb2c6f7e8a2bebaa1161dd4df9cabcbaec4ff644c45bee94f86ae47725087d6deed0cd954209cec717621d137db85e
DIST patch-5.13.5.xz 473120 BLAKE2B a0dd9f3f972a16de87f0d2d8daa7f5d35b27314d22597a28f471cdbe6cedfa7d4bf69e41504d6a9b9d4c1f085146604394747771185dd0a09276cfd92820b4a8 SHA512 1e4eb575775ccbc2e88b34b902a75562e49d6dfb4699dadd5b41fff9db8c2bc994d946d1e60f6320f48ef233aa721d3725582d4ec57458f2293da9a85806c7b1


@@ -0,0 +1,29 @@
From c965cb8a004c6cc370b4bf297c61fe5ac8ab0583 Mon Sep 17 00:00:00 2001
From: Martijn Braam <martijn@brixit.nl>
Date: Wed, 6 Jan 2021 03:11:17 +0100
Subject: [PATCH] arm64: dts: sunxi: Add mmc aliases for the PineTab
Without this change, the order of the mmc devices changed in newer kernels.
Signed-off-by: Martijn Braam <martijn@brixit.nl>
---
arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
index a87790df94b3..1cf3c3a9ad7f 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
@@ -18,6 +18,9 @@ / {
compatible = "pine64,pinetab", "allwinner,sun50i-a64";
aliases {
+ mmc0 = &mmc0;
+ mmc1 = &mmc1;
+ mmc2 = &mmc2;
serial0 = &uart0;
ethernet0 = &rtl8723cs;
};
--
2.29.2


@@ -0,0 +1,511 @@
diff --git a/MAINTAINERS b/MAINTAINERS
index 7ffac272434e..ddff07cd794c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2715,6 +2715,7 @@ F: drivers/video/fbdev/core/bootsplash*.*
F: drivers/video/fbdev/core/dummycon.c
F: include/linux/bootsplash.h
F: include/uapi/linux/bootsplash_file.h
+F: tools/bootsplash/*
BPF (Safe dynamic programs and tools)
M: Alexei Starovoitov <ast@kernel.org>
diff --git a/tools/bootsplash/.gitignore b/tools/bootsplash/.gitignore
new file mode 100644
index 000000000000..091b99a17567
--- /dev/null
+++ b/tools/bootsplash/.gitignore
@@ -0,0 +1 @@
+bootsplash-packer
diff --git a/tools/bootsplash/Makefile b/tools/bootsplash/Makefile
new file mode 100644
index 000000000000..0ad8e8a84942
--- /dev/null
+++ b/tools/bootsplash/Makefile
@@ -0,0 +1,9 @@
+CC := $(CROSS_COMPILE)gcc
+CFLAGS := -I../../usr/include
+
+PROGS := bootsplash-packer
+
+all: $(PROGS)
+
+clean:
+ rm -fr $(PROGS)
diff --git a/tools/bootsplash/bootsplash-packer.c b/tools/bootsplash/bootsplash-packer.c
new file mode 100644
index 000000000000..ffb6a8b69885
--- /dev/null
+++ b/tools/bootsplash/bootsplash-packer.c
@@ -0,0 +1,471 @@
+/*
+ * Kernel based bootsplash.
+ *
+ * (Splash file packer tool)
+ *
+ * Authors:
+ * Max Staudt <mstaudt@suse.de>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+
+#include <endian.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <linux/bootsplash_file.h>
+
+
+static void print_help(char *progname)
+{
+ printf("Usage: %s [OPTIONS] outfile\n", progname);
+ printf("\n"
+ "Options, executed in order given:\n"
+ " -h, --help Print this help message\n"
+ "\n"
+ " --bg_red <u8> Background color (red part)\n"
+ " --bg_green <u8> Background color (green part)\n"
+ " --bg_blue <u8> Background color (blue part)\n"
+ " --bg_reserved <u8> (do not use)\n"
+ " --frame_ms <u16> Minimum milliseconds between animation steps\n"
+ "\n"
+ " --picture Start describing the next picture\n"
+ " --pic_width <u16> Picture width in pixels\n"
+ " --pic_height <u16> Picture height in pixels\n"
+ " --pic_position <u8> Coarse picture placement:\n"
+ " 0x00 - Top left\n"
+ " 0x01 - Top\n"
+ " 0x02 - Top right\n"
+ " 0x03 - Right\n"
+ " 0x04 - Bottom right\n"
+ " 0x05 - Bottom\n"
+ " 0x06 - Bottom left\n"
+ " 0x07 - Left\n"
+ "\n"
+ " Flags:\n"
+ " 0x10 - Calculate offset from corner towards center,\n"
+ " rather than from center towards corner\n"
+ " --pic_position_offset <u16> Distance from base position in pixels\n"
+ " --pic_anim_type <u8> Animation type:\n"
+ " 0 - None\n"
+ " 1 - Forward loop\n"
+ " --pic_anim_loop <u8> Loop point for animation\n"
+ "\n"
+ " --blob <filename> Include next data stream\n"
+ " --blob_type <u16> Type of data\n"
+ " --blob_picture_id <u8> Picture to associate this blob with, starting at 0\n"
+ " (default: number of last --picture)\n"
+ "\n");
+ printf("This tool will write %s files.\n\n",
+#if __BYTE_ORDER == __BIG_ENDIAN
+ "Big Endian (BE)");
+#elif __BYTE_ORDER == __LITTLE_ENDIAN
+ "Little Endian (LE)");
+#else
+#error
+#endif
+}
+
+
+struct blob_entry {
+ struct blob_entry *next;
+
+ char *fn;
+
+ struct splash_blob_header header;
+};
+
+
+static void dump_file_header(struct splash_file_header *h)
+{
+ printf(" --- File header ---\n");
+ printf("\n");
+ printf(" version: %5u\n", h->version);
+ printf("\n");
+ printf(" bg_red: %5u\n", h->bg_red);
+ printf(" bg_green: %5u\n", h->bg_green);
+ printf(" bg_blue: %5u\n", h->bg_blue);
+ printf(" bg_reserved: %5u\n", h->bg_reserved);
+ printf("\n");
+ printf(" num_blobs: %5u\n", h->num_blobs);
+ printf(" num_pics: %5u\n", h->num_pics);
+ printf("\n");
+ printf(" frame_ms: %5u\n", h->frame_ms);
+ printf("\n");
+}
+
+static void dump_pic_header(struct splash_pic_header *ph)
+{
+ printf(" --- Picture header ---\n");
+ printf("\n");
+ printf(" width: %5u\n", ph->width);
+ printf(" height: %5u\n", ph->height);
+ printf("\n");
+ printf(" num_blobs: %5u\n", ph->num_blobs);
+ printf("\n");
+ printf(" position: %0x3x\n", ph->position);
+ printf(" position_offset: %5u\n", ph->position_offset);
+ printf("\n");
+ printf(" anim_type: %5u\n", ph->anim_type);
+ printf(" anim_loop: %5u\n", ph->anim_loop);
+ printf("\n");
+}
+
+static void dump_blob(struct blob_entry *b)
+{
+ printf(" --- Blob header ---\n");
+ printf("\n");
+ printf(" length: %7u\n", b->header.length);
+ printf(" type: %7u\n", b->header.type);
+ printf("\n");
+ printf(" picture_id: %7u\n", b->header.picture_id);
+ printf("\n");
+}
+
+
+/*
+ * Deliberately not wrapped in do { } while (0): the break must exit the
+ * enclosing switch case so that an out-of-range value is not assigned.
+ */
+#define OPT_MAX(var, max) \
+ if ((var) > (max)) { \
+ fprintf(stderr, "--%s: Invalid value\n", \
+ long_options[option_index].name); \
+ break; \
+ }
+
+static struct option long_options[] = {
+ {"help", 0, 0, 'h'},
+ {"bg_red", 1, 0, 10001},
+ {"bg_green", 1, 0, 10002},
+ {"bg_blue", 1, 0, 10003},
+ {"bg_reserved", 1, 0, 10004},
+ {"frame_ms", 1, 0, 10005},
+ {"picture", 0, 0, 20000},
+ {"pic_width", 1, 0, 20001},
+ {"pic_height", 1, 0, 20002},
+ {"pic_position", 1, 0, 20003},
+ {"pic_position_offset", 1, 0, 20004},
+ {"pic_anim_type", 1, 0, 20005},
+ {"pic_anim_loop", 1, 0, 20006},
+ {"blob", 1, 0, 30000},
+ {"blob_type", 1, 0, 30001},
+ {"blob_picture_id", 1, 0, 30002},
+ {NULL, 0, NULL, 0}
+};
+
+
+int main(int argc, char **argv)
+{
+ FILE *of;
+ char *ofn = NULL;
+ int c;
+ int option_index = 0;
+
+ unsigned long ul;
+ struct splash_file_header fh = {};
+ struct splash_pic_header ph[255] = {};
+ struct blob_entry *blob_first = NULL;
+ struct blob_entry *blob_last = NULL;
+ struct blob_entry *blob_cur = NULL;
+
+ if (argc < 2) {
+ print_help(argv[0]);
+ return EXIT_FAILURE;
+ }
+
+
+ /* Parse and execute user commands */
+ while ((c = getopt_long(argc, argv, "h",
+ long_options, &option_index)) != -1) {
+ switch (c) {
+ case 10001: /* bg_red */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ fh.bg_red = ul;
+ break;
+ case 10002: /* bg_green */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ fh.bg_green = ul;
+ break;
+ case 10003: /* bg_blue */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ fh.bg_blue = ul;
+ break;
+ case 10004: /* bg_reserved */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ fh.bg_reserved = ul;
+ break;
+ case 10005: /* frame_ms */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 65535);
+ fh.frame_ms = ul;
+ break;
+
+
+ case 20000: /* picture */
+ if (fh.num_pics >= 255) {
+ fprintf(stderr, "--%s: Picture array full\n",
+ long_options[option_index].name);
+ break;
+ }
+
+ fh.num_pics++;
+ break;
+
+ case 20001: /* pic_width */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 65535);
+ ph[fh.num_pics - 1].width = ul;
+ break;
+
+ case 20002: /* pic_height */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 65535);
+ ph[fh.num_pics - 1].height = ul;
+ break;
+
+ case 20003: /* pic_position */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ ph[fh.num_pics - 1].position = ul;
+ break;
+
+ case 20004: /* pic_position_offset */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ ph[fh.num_pics - 1].position_offset = ul;
+ break;
+
+ case 20005: /* pic_anim_type */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ ph[fh.num_pics - 1].anim_type = ul;
+ break;
+
+ case 20006: /* pic_anim_loop */
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ ph[fh.num_pics - 1].anim_loop = ul;
+ break;
+
+
+ case 30000: /* blob */
+ if (fh.num_blobs >= 65535) {
+ fprintf(stderr, "--%s: Blob array full\n",
+ long_options[option_index].name);
+ break;
+ }
+
+ blob_cur = calloc(1, sizeof(struct blob_entry));
+ if (!blob_cur) {
+ fprintf(stderr, "--%s: Out of memory\n",
+ long_options[option_index].name);
+ break;
+ }
+
+ blob_cur->fn = optarg;
+ if (fh.num_pics)
+ blob_cur->header.picture_id = fh.num_pics - 1;
+
+ if (!blob_first)
+ blob_first = blob_cur;
+ if (blob_last)
+ blob_last->next = blob_cur;
+ blob_last = blob_cur;
+ fh.num_blobs++;
+ break;
+
+ case 30001: /* blob_type */
+ if (!blob_cur) {
+ fprintf(stderr, "--%s: No blob selected\n",
+ long_options[option_index].name);
+ break;
+ }
+
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ blob_cur->header.type = ul;
+ break;
+
+ case 30002: /* blob_picture_id */
+ if (!blob_cur) {
+ fprintf(stderr, "--%s: No blob selected\n",
+ long_options[option_index].name);
+ break;
+ }
+
+ ul = strtoul(optarg, NULL, 0);
+ OPT_MAX(ul, 255);
+ blob_cur->header.picture_id = ul;
+ break;
+
+
+
+ case 'h':
+ case '?':
+ default:
+ print_help(argv[0]);
+ goto EXIT;
+ } /* switch (c) */
+ } /* while ((c = getopt_long(...)) != -1) */
+
+ /* Consume and drop lone arguments */
+ while (optind < argc) {
+ ofn = argv[optind];
+ optind++;
+ }
+
+
+ /* Read file lengths */
+ for (blob_cur = blob_first; blob_cur; blob_cur = blob_cur->next) {
+ FILE *f;
+ long pos;
+
+ if (!blob_cur->fn)
+ continue;
+
+ f = fopen(blob_cur->fn, "rb");
+ if (!f)
+ goto ERR_FILE_LEN;
+
+ if (fseek(f, 0, SEEK_END))
+ goto ERR_FILE_LEN;
+
+ pos = ftell(f);
+ if (pos < 0 || pos > (1 << 30))
+ goto ERR_FILE_LEN;
+
+ blob_cur->header.length = pos;
+
+ fclose(f);
+ continue;
+
+ERR_FILE_LEN:
+ fprintf(stderr, "Error getting file length (or too long): %s\n",
+ blob_cur->fn);
+ if (f)
+ fclose(f);
+ continue;
+ }
+
+
+ /* Set magic headers */
+#if __BYTE_ORDER == __BIG_ENDIAN
+ memcpy(&fh.id[0], BOOTSPLASH_MAGIC_BE, 16);
+#elif __BYTE_ORDER == __LITTLE_ENDIAN
+ memcpy(&fh.id[0], BOOTSPLASH_MAGIC_LE, 16);
+#else
+#error
+#endif
+ fh.version = BOOTSPLASH_VERSION;
+
+ /* Set blob counts */
+ for (blob_cur = blob_first; blob_cur; blob_cur = blob_cur->next) {
+ if (blob_cur->header.picture_id < fh.num_pics)
+ ph[blob_cur->header.picture_id].num_blobs++;
+ }
+
+
+ /* Dump structs */
+ dump_file_header(&fh);
+
+ for (ul = 0; ul < fh.num_pics; ul++)
+ dump_pic_header(&ph[ul]);
+
+ for (blob_cur = blob_first; blob_cur; blob_cur = blob_cur->next)
+ dump_blob(blob_cur);
+
+
+ /* Write to file */
+ if (!ofn) {
+ fprintf(stderr, "No output file given.\n");
+ return EXIT_FAILURE;
+ }
+ printf("Writing splash to file: %s\n", ofn);
+ of = fopen(ofn, "wb");
+ if (!of)
+ goto ERR_WRITING;
+
+ if (fwrite(&fh, sizeof(struct splash_file_header), 1, of) != 1)
+ goto ERR_WRITING;
+
+ for (ul = 0; ul < fh.num_pics; ul++) {
+ if (fwrite(&ph[ul], sizeof(struct splash_pic_header), 1, of)
+ != 1)
+ goto ERR_WRITING;
+ }
+
+ blob_cur = blob_first;
+ while (blob_cur) {
+ struct blob_entry *blob_old = blob_cur;
+ FILE *f;
+ char buf[256];
+ uint32_t left;
+
+ if (fwrite(&blob_cur->header,
+ sizeof(struct splash_blob_header), 1, of) != 1)
+ goto ERR_WRITING;
+
+ if (!blob_cur->header.length || !blob_cur->fn) {
+ blob_cur = blob_cur->next;
+ free(blob_old);
+ continue;
+ }
+
+ f = fopen(blob_cur->fn, "rb");
+ if (!f)
+ goto ERR_FILE_COPY;
+
+ left = blob_cur->header.length;
+ while (left >= sizeof(buf)) {
+ if (fread(buf, sizeof(buf), 1, f) != 1)
+ goto ERR_FILE_COPY;
+ if (fwrite(buf, sizeof(buf), 1, of) != 1)
+ goto ERR_FILE_COPY;
+ left -= sizeof(buf);
+ }
+ if (left) {
+ if (fread(buf, left, 1, f) != 1)
+ goto ERR_FILE_COPY;
+ if (fwrite(buf, left, 1, of) != 1)
+ goto ERR_FILE_COPY;
+ }
+
+ /* Pad data stream to 16 bytes */
+ if (left % 16) {
+ if (fwrite("\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0",
+ 16 - (left % 16), 1, of) != 1)
+ goto ERR_FILE_COPY;
+ }
+
+ fclose(f);
+ blob_cur = blob_cur->next;
+ free(blob_old);
+ continue;
+
+ERR_FILE_COPY:
+ if (f)
+ fclose(f);
+ goto ERR_WRITING;
+ }
+
+ fclose(of);
+
+EXIT:
+ return EXIT_SUCCESS;
+
+
+ERR_WRITING:
+ fprintf(stderr, "Error writing splash.\n");
+ fprintf(stderr, "The output file is probably corrupt.\n");
+ if (of)
+ fclose(of);
+
+ while (blob_cur) {
+ struct blob_entry *blob_old = blob_cur;
+
+ blob_cur = blob_cur->next;
+ free(blob_old);
+ }
+
+ return EXIT_FAILURE;
+}
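Not part of the commit: a minimal reader sketch of how the packed header could be sanity-checked, assuming the same <linux/bootsplash_file.h> layout and magic constants the packer above uses.

#include <endian.h>
#include <stdio.h>
#include <string.h>

#include <linux/bootsplash_file.h>

/* Return 0 if the file starts with a plausible splash header. */
int check_splash_header(const char *fn)
{
	struct splash_file_header fh;
	FILE *f = fopen(fn, "rb");

	if (!f)
		return -1;
	if (fread(&fh, sizeof(fh), 1, f) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

#if __BYTE_ORDER == __BIG_ENDIAN
	if (memcmp(fh.id, BOOTSPLASH_MAGIC_BE, 16))
#else
	if (memcmp(fh.id, BOOTSPLASH_MAGIC_LE, 16))
#endif
		return -1;

	return fh.version == BOOTSPLASH_VERSION ? 0 : -1;
}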


@@ -0,0 +1,12 @@
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 03b83aa91277..dfc6c7d1b0e7 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -4070,6 +4070,7 @@ static int btusb_setup_qca(struct hci_dev *hdev)
}
if (!info) {
bt_dev_err(hdev, "don't support firmware rome 0x%x", ver_rom);
+ if (ver_rom & ~0xffffU)
+ return 0;
return -ENODEV;
}


@@ -0,0 +1,49 @@
From cb408fb65a08bd45543724c1e9b8f38ae1bebc4a Mon Sep 17 00:00:00 2001
From: Arnaud Ferraris <arnaud.ferraris@gmail.com>
Date: Tue, 4 Aug 2020 15:12:59 +0200
Subject: [PATCH 177/183] leds-gpio: make max_brightness configurable
---
drivers/leds/leds-gpio.c | 4 ++++
include/linux/leds.h | 3 ++-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index 93f5b1b60fde..f8483fab1164 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -108,6 +108,8 @@ static int create_gpio_led(const struct gpio_led *template,
if (ret < 0)
return ret;
+ led_dat->cdev.max_brightness = template->max_brightness;
+
if (template->name) {
led_dat->cdev.name = template->name;
ret = devm_led_classdev_register(parent, &led_dat->cdev);
@@ -177,6 +179,8 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
if (fwnode_property_present(child, "panic-indicator"))
led.panic_indicator = 1;
+ fwnode_property_read_u32(child, "max-brightness", &led.max_brightness);
+
ret = create_gpio_led(&led, led_dat, dev, child, NULL);
if (ret < 0) {
fwnode_handle_put(child);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6a8d6409c993..99a80092114d 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -513,7 +513,8 @@ typedef int (*gpio_blink_set_t)(struct gpio_desc *desc, int state,
struct gpio_led {
const char *name;
const char *default_trigger;
- unsigned gpio;
+ unsigned gpio;
+ unsigned max_brightness;
unsigned active_low : 1;
unsigned retain_state_suspended : 1;
unsigned panic_indicator : 1;
--
2.30.0

File diff suppressed because it is too large.


@@ -0,0 +1,391 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_PATCH,MAILING_LIST_MULTI,MENTIONS_GIT_HOSTING,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id BA09AC433ED
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:04 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 99A326108C
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:04 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S230359AbhETGzY (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:24 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37854 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S229534AbhETGzX (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:23 -0400
Received: from mail-qk1-x74a.google.com (mail-qk1-x74a.google.com [IPv6:2607:f8b0:4864:20::74a])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2DB47C061574
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:01 -0700 (PDT)
Received: by mail-qk1-x74a.google.com with SMTP id z2-20020a3765020000b02903a5f51b1c74so684222qkb.7
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:01 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:message-id:mime-version:subject:from:to:cc;
bh=Y3hJAMwzbf34YQU8QX5BsSV2xoCmy36DYto5ZLIStJc=;
b=r/V1aR1KHSQ2RwrGIEEbdDV0RqV+tdHJLBnCnPMLdI4quvTDua13dKOHpxS2Rc7bc4
6ON9rpxOpEhBMPLS8798xqa4jQBTINTCKNlIi3TpaV8t/shwlViCb4Y9bZ4ng8VEsXp3
H2s3DQbb47Iio7YrOnBahF4qBDJl2fkHL257Ao4wgzgG/ZCK2oy5dcipOFrEpQqPk5vO
hhTC4Zr1DE3XI+Y+uTozfI8CoAtllv6qL31gAWcycyeN72teVQa9ilaeTdglxhCO9DVG
BFkiZH+21Eo3M8PRz4OztnGgRtMvbgNnuUWZ68bnZkO4wMyL6mX2520HA9NQNkGSXLnP
74Zg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:message-id:mime-version:subject:from:to:cc;
bh=Y3hJAMwzbf34YQU8QX5BsSV2xoCmy36DYto5ZLIStJc=;
b=h+lPKp7mQ6QF8fW3fzT7HQgoaLvOfkKtGjwFNvWMOi8UMz94CGWpTgC4tsEX0PenoK
snCz9kDMIR35YO9Dlhz1Ci/04htNK9p+rnvGn7ri/Oin5fFeyVQ15qh33Bgut5m3SKR2
imeFBLkWsXGtFd23XCBmjIcNrqZA0LxhIwoYCbrVWSq5H29Eo6C9ab0gmJ1oY0DCPOL/
Fi8M2neMwLN09EebwZONh8AGuP0XiL0oSnAGDZAhaaAimfHrPBMMYCrxpjnaGxPG2hY0
gvju/bIag6Ug8urHdAAGWsdLaNIsdrIKWlaL76FjcULVwdAARKQifiMMTwJ2JU5y5jMG
OKRg==
X-Gm-Message-State: AOAM5322gu+Tvm1pCjTiKdWMNb3cz1Z6+VCfYHkB7vDvNRYItvu08gEA
/W/WlY6Lc6/4O5nrreOspbq5n77XobE=
X-Google-Smtp-Source: ABdhPJy+4EmI1VvFDhlB3errX+0774OdClFY8nQyFqDe9Pqq8FOdLBnXamEbn+N9M1F/HG6sJ6Mw/n7qw/8=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a0c:e4cd:: with SMTP id g13mr3727631qvm.34.1621493640278;
Wed, 19 May 2021 23:54:00 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:41 -0600
Message-Id: <20210520065355.2736558-1-yuzhao@google.com>
Mime-Version: 1.0
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 00/14] Multigenerational LRU Framework
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
What's new in v3
================
1) Fixed a bug reported by the Arch Linux kernel team:
https://github.com/zen-kernel/zen-kernel/issues/207
2) Rebased to v5.13-rc2.
Highlights from v2
==================
Konstantin Kharlamov <hi-angel@yandex.ru> reported:
My success story: I have Archlinux with 8G RAM + zswap + swap. While
developing, I have lots of apps opened such as multiple LSP-servers
for different langs, chats, two browsers, etc. Usually, my system
gets quickly to a point of SWAP-storms, where I have to kill
LSP-servers, restart browsers to free memory, etc, otherwise the
system lags heavily and is barely usable.
1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU
patchset, and I started up by opening lots of apps to create memory
pressure, and worked for a day like this. Till now I had *not a
single SWAP-storm*, and mind you I got 3.4G in SWAP. I was never
getting to the point of 3G in SWAP before without a single
SWAP-storm.
TLDR
====
The current page reclaim is too expensive in terms of CPU usage and
often makes poor choices about what to evict. We would like to offer
an alternative framework that is performant, versatile and
straightforward.
Repo
====
git fetch https://linux-mm.googlesource.com/page-reclaim refs/changes/53/1253/1
Problems
========
Notion of active/inactive
-------------------------
Data centers need to predict whether a job can successfully land on a
machine without actually impacting the existing jobs. The granularity
of the active/inactive is too coarse to be useful for job schedulers
to make such decisions. In addition, data centers need to monitor
their memory utilization for horizontal scaling. The active/inactive
cannot give any insight into a pool of machines because aggregating
them across multiple machines without a common frame of reference
yields no meaningful results.
Phones and laptops need to make good choices about what to evict,
since they are more sensitive to the major faults and the power
consumption. Major faults can cause "janks" (slow UI renderings) and
negatively impact user experience. The selection between anon and file
types has been suboptimal because direct comparisons between them are
infeasible based on the notion of active/inactive. On phones and
laptops, executable pages are frequently evicted despite the fact that
there are many less recently used anon pages. Conversely, on
workstations building large projects, anon pages are occasionally
swapped out while page cache contains many less recently used pages.
Fundamentally, the notion of active/inactive has very limited ability
to measure temporal locality.
Incremental scans via rmap
--------------------------
Each incremental scan picks up at where the last scan left off and
stops after it has found a handful of unreferenced pages. For
workloads using a large amount of anon memory, incremental scans lose
the advantage under sustained memory pressure due to high ratios of
the number of scanned pages to the number of reclaimed pages. On top
of this, the rmap has complex data structures. And the combined
effects typically result in a high amount of CPU usage in the reclaim
path.
Simply put, incremental scans via rmap have no regard for spatial
locality.
Solutions
=========
Notion of generation numbers
----------------------------
The notion of generation numbers introduces a temporal dimension. Each
generation is a dot on the timeline and it includes all pages that
have been referenced since it was created.
Given an lruvec, scans of anon and file types and selections between
them are all based on direct comparisons of generation numbers, which
are simple and yet effective.
A larger number of pages can be spread out across a configurable
number of generations, which are associated with timestamps and
therefore aggregatable. This is specifically designed for data centers
that require working set estimation and proactive reclaim.
Differential scans via page tables
----------------------------------
Each differential scan discovers all pages that have been referenced
since the last scan. It walks the mm_struct list associated with an
lruvec to scan page tables of processes that have been scheduled since
the last scan. The cost of each differential scan is roughly
proportional to the number of referenced pages it discovers. Page
tables usually have good memory locality. The end result is generally
a significant reduction in CPU usage, for workloads using a large
amount of anon memory.
For workloads that have extremely sparse page tables, it is still
possible to fall back to incremental scans via rmap.
Framework
=========
For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[2] separately for anon and file types as clean file
pages can be evicted regardless of may_swap or may_writepage. These
three variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index to
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. Evictable
pages are added to the per-zone lists indexed by lrugen->max_seq or
lrugen->min_seq[2] (modulo MAX_NR_GENS), depending on their types.
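As a rough illustration of that layout (field names mirror this description, not necessarily the exact definitions in the patchset; MAX_NR_GENS is an assumed example value, ANON_AND_FILE and MAX_NR_ZONES are taken from the mm headers):

#define MAX_NR_GENS 4	/* assumed example value */

struct lrugen_sketch {
	unsigned long max_seq;			/* youngest gen, both types */
	unsigned long min_seq[ANON_AND_FILE];	/* oldest gen, per type */
	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
};

/* A page joins the per-zone list of its truncated generation number: */
static struct list_head *gen_list(struct lrugen_sketch *lrugen,
				  unsigned long seq, int type, int zone)
{
	return &lrugen->lists[seq % MAX_NR_GENS][type][zone];
}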
Each generation is then divided into multiple tiers. Tiers represent
levels of usage from file descriptors only. Pages accessed N times via
file descriptors belong to tier order_base_2(N). Each generation
contains at most MAX_NR_TIERS tiers, and they require additional
MAX_NR_TIERS-2 bits in page->flags. In contrast to moving across
generations which requires the lru lock for the list operations,
moving across tiers only involves an atomic operation on page->flags
and therefore has a negligible cost. A feedback loop modeled after the
PID controller monitors the refault rates across all tiers and decides
when to activate pages from which tiers in the reclaim path.
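A sketch of the tier computation, assuming order_base_2() from include/linux/log2.h:

/* Pages accessed N times via file descriptors belong to tier
 * order_base_2(N): 1 -> tier 0, 2 -> tier 1, 3..4 -> tier 2, ... */
static unsigned int tier_of(unsigned int refs)
{
	return order_base_2(refs);
}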
The framework comprises two conceptually independent components: the
aging and the eviction, which can be invoked separately from user
space for the purpose of working set estimation and proactive reclaim.
Aging
-----
The aging produces young generations. Given an lruvec, the aging scans
page tables for referenced pages of this lruvec. Upon finding one, the
aging updates its generation number to max_seq. After each round of
scan, the aging increments max_seq.
The aging maintains either a system-wide mm_struct list or per-memcg
mm_struct lists, and it only scans page tables of processes that have
been scheduled since the last scan.
The aging is due when both min_seq[2] values reach max_seq-1, assuming
both anon and file types are reclaimable.
Eviction
--------
The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It first tries to select a type based on the values of min_seq[2].
When anon and file types are both available from the same generation,
it selects the one that has a lower refault rate.
During a scan, the eviction sorts pages according to their new
generation numbers, if the aging has found them referenced. It also
moves pages from the tiers that have higher refault rates than tier 0
to the next generation.
When it finds all the per-zone lists of a selected type are empty, the
eviction increments min_seq[2] indexed by this selected type.
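Putting the two together, a condensed pseudocode sketch of the cycle described above; every helper below is hypothetical shorthand for logic the patchset spreads across mm/vmscan.c:

static void lru_gen_cycle_sketch(struct lruvec *lruvec, int type)
{
	/* Aging: produce a young generation when the old ones run out. */
	if (min_seq_of(lruvec, type) == max_seq_of(lruvec) - 1)
		walk_mm_list_and_update_gens(lruvec);	/* ends with max_seq++ */

	/* Eviction: consume the oldest generation of the chosen type. */
	scan_oldest_gen(lruvec, type);	/* re-sort referenced pages; evict rest */
	if (oldest_gen_empty(lruvec, type))
		inc_min_seq(lruvec, type);	/* min_seq[type]++ */
}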
Use cases
=========
High anon workloads
-------------------
Our real-world benchmark that browses popular websites in multiple
Chrome tabs demonstrates 51% less CPU usage from kswapd and 52% (full)
less PSI.
Without this patchset, the profile of kswapd looks like:
31.03% page_vma_mapped_walk
25.59% lzo1x_1_do_compress
4.63% do_raw_spin_lock
3.89% vma_interval_tree_iter_next
3.33% vma_interval_tree_subtree_search
With this patchset, it looks like:
49.36% lzo1x_1_do_compress
4.54% page_vma_mapped_walk
4.45% memset_erms
3.47% walk_pte_range
2.88% zram_bvec_rw
In addition, direct reclaim latency is reduced by 22% at 99th
percentile and the number of refaults is reduced by 7%. Both metrics
are important to phones and laptops as they are highly correlated to
user experience.
High page cache workloads
-------------------------
Tiers are specifically designed to improve the performance of page
cache under memory pressure. The fio/io_uring benchmark shows 14%
increase in IOPS when randomly accessing in buffered I/O mode.
Without this patchset, the profile of fio/io_uring looks like:
Children Self Symbol
-----------------------------------
12.03% 0.03% __page_cache_alloc
6.53% 0.83% shrink_active_list
2.53% 0.44% mark_page_accessed
With this patchset, it looks like:
Children Self Symbol
-----------------------------------
9.45% 0.03% __page_cache_alloc
0.52% 0.46% mark_page_accessed
Working set estimation
----------------------
User space can invoke the aging by writing "+ memcg_id node_id gen
[swappiness]" to /sys/kernel/debug/lru_gen. This debugfs interface
also provides the birth time and the size of each generation.
For example, given a pool of machines, a job scheduler periodically
invokes the aging to estimate the working set of each machine. And it
ranks the machines based on the sizes of their working sets and
selects the most suitable ones to land new jobs.
Proactive reclaim
-----------------
User space can invoke the eviction by writing "- memcg_id node_id gen
[swappiness] [nr_to_reclaim]" to /sys/kernel/debug/lru_gen. Multiple
command lines are supported, as is concatenation with delimiters.
For example, a job scheduler can invoke the eviction if it anticipates
new jobs. The savings from proactive reclaim may help meet certain SLAs
when new jobs actually land.
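For illustration, user space could drive this interface as below; the command strings follow the formats quoted above, and the numeric arguments are made-up examples:

#include <stdio.h>

/* Write one command string to the debugfs interface. */
static int lru_gen_cmd(const char *cmd)
{
	FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");
	int ret = -1;

	if (!f)
		return -1;
	if (fputs(cmd, f) != EOF)
		ret = 0;
	if (fclose(f))
		ret = -1;
	return ret;
}

/*
 * Aging (working set estimation): lru_gen_cmd("+ 1 0 5\n");
 * Eviction (proactive reclaim):   lru_gen_cmd("- 1 0 3 40 1024\n");
 * where the arguments are memcg_id, node_id, gen, and the optional
 * swappiness and nr_to_reclaim.
 */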
Yu Zhao (14):
include/linux/memcontrol.h: do not warn in page_memcg_rcu() if
!CONFIG_MEMCG
include/linux/nodemask.h: define next_memory_node() if !CONFIG_NUMA
include/linux/cgroup.h: export cgroup_mutex
mm, x86: support the access bit on non-leaf PMD entries
mm/vmscan.c: refactor shrink_node()
mm/workingset.c: refactor pack_shadow() and unpack_shadow()
mm: multigenerational lru: groundwork
mm: multigenerational lru: activation
mm: multigenerational lru: mm_struct list
mm: multigenerational lru: aging
mm: multigenerational lru: eviction
mm: multigenerational lru: user interface
mm: multigenerational lru: Kconfig
mm: multigenerational lru: documentation
Documentation/vm/index.rst | 1 +
Documentation/vm/multigen_lru.rst | 143 ++
arch/Kconfig | 9 +
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 2 +-
arch/x86/mm/pgtable.c | 5 +-
fs/exec.c | 2 +
fs/fuse/dev.c | 3 +-
include/linux/cgroup.h | 15 +-
include/linux/memcontrol.h | 7 +-
include/linux/mm.h | 2 +
include/linux/mm_inline.h | 234 +++
include/linux/mm_types.h | 107 ++
include/linux/mmzone.h | 117 ++
include/linux/nodemask.h | 1 +
include/linux/page-flags-layout.h | 19 +-
include/linux/page-flags.h | 4 +-
include/linux/pgtable.h | 4 +-
include/linux/swap.h | 4 +-
kernel/bounds.c | 6 +
kernel/events/uprobes.c | 2 +-
kernel/exit.c | 1 +
kernel/fork.c | 10 +
kernel/kthread.c | 1 +
kernel/sched/core.c | 2 +
mm/Kconfig | 58 +
mm/huge_memory.c | 5 +-
mm/khugepaged.c | 2 +-
mm/memcontrol.c | 28 +
mm/memory.c | 10 +-
mm/migrate.c | 2 +-
mm/mm_init.c | 6 +-
mm/mmzone.c | 2 +
mm/rmap.c | 6 +
mm/swap.c | 22 +-
mm/swapfile.c | 6 +-
mm/userfaultfd.c | 2 +-
mm/vmscan.c | 2638 ++++++++++++++++++++++++++++-
mm/workingset.c | 169 +-
39 files changed, 3498 insertions(+), 160 deletions(-)
create mode 100644 Documentation/vm/multigen_lru.rst
--
2.31.1.751.gd2f1c929bd-goog


@@ -0,0 +1,146 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id D67E4C433B4
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:09 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id BF3A360E0B
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:09 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S230431AbhETGz2 (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:28 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37864 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S229534AbhETGzY (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:24 -0400
Received: from mail-yb1-xb4a.google.com (mail-yb1-xb4a.google.com [IPv6:2607:f8b0:4864:20::b4a])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id DD592C061574
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:02 -0700 (PDT)
Received: by mail-yb1-xb4a.google.com with SMTP id h67-20020a25d0460000b0290517e5f14ba4so2155434ybg.18
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:02 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=rcGZDNyJ6vYh3nv1NbyctjSVuVLx0bLzs+7ceU1fwm4=;
b=W+uE07dbkPRm0EI7rt0odOA402xB5GM5xsOKzyKKjPQdnq9FzvuhBH2EmYJx+e3w2P
M+GjA/Y/0N577Zt0vRn1fv9k1GS93aX/OLI3asM1EluD+bF6m15Qua90BDPhuN6RLdFt
9XaT7ugKFU1Zb0CN5pODFmCE1L4eWk8Idy1/MbWRtRICoOacDrCOBD3XXG+gene95EAz
h6RenUUXrHuOEIq+2ZT1q6P10VKHqSaPsyoiUDDSBllpMLW3kYmkOWBQGnRaPndswvZ6
VxYMBaR/6WNfgBuQGLp6vrXdw55euSCrNkjy2sf+vVpzlTPTbCCa8UgSnsOUvdDUidvY
K/+A==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=rcGZDNyJ6vYh3nv1NbyctjSVuVLx0bLzs+7ceU1fwm4=;
b=jzFekar0HP/0GPy5CL4U72iLLcQoeWxnnbnFRr4z4BTc2/hrpionzQD3GXHT9lyHMR
/YmLr75qJXGcWb0dXLEPIxzZ7UjTYADt32Jy07ZfWITJ2+jRh2k1W3Wgty9YjwFX9wmd
/MWxRyZj/674SBQZu12SYNlqqqh/WZxiziHwdgJEnbvX2lzsMWl2I1dL0e+tJwPr9OlS
sG5TsZvw0fciDahseafGyx4m1dmrCykPWBTkpCu+BJmF7Bt4PV/ogCZO0TIxn9ezbYr0
AkpMnHQnddjXudvfygGW20ymE3ieIKJU8hRbuF0+DfL6jdFdbczkyxRLk2dp6uiavglX
Eajg==
X-Gm-Message-State: AOAM533a36jLBDzDifmN467aiI0KlSL/85xe4UvoMJPjnzoeRbTl9eck
Ndk0rslscVb76VSHIoD/Mmkup4p5pgk=
X-Google-Smtp-Source: ABdhPJxDZYdk2SRkfTUl/Zu3nhhCt+1mIXIL72HTmroHynNzAjrSh46FnvoimDPO7nUIugXPbacDy3rCHfQ=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a5b:b92:: with SMTP id l18mr5146859ybq.414.1621493641980;
Wed, 19 May 2021 23:54:01 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:42 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-2-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 01/14] include/linux/memcontrol.h: do not warn in
page_memcg_rcu() if !CONFIG_MEMCG
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
page_memcg_rcu() warns on !rcu_read_lock_held() regardless of
CONFIG_MEMCG. The following legit code trips the warning when
!CONFIG_MEMCG, since lock_page_memcg() and unlock_page_memcg() are
empty for this config.
memcg = lock_page_memcg(page1)
  (rcu_read_lock() if CONFIG_MEMCG=y)

do something to page1

if (page_memcg_rcu(page2) == memcg)
  do something to page2 too as it cannot be migrated away from the
  memcg either.

unlock_page_memcg(page1)
  (rcu_read_unlock() if CONFIG_MEMCG=y)
Locking/unlocking rcu consistently for both configs is rigorous but it
also forces unnecessary locking upon users who have no interest in
CONFIG_MEMCG.
This patch removes the assertion for !CONFIG_MEMCG, because
page_memcg_rcu() has a few callers and there are no concerns regarding
their correctness at the moment.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
include/linux/memcontrol.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c193be760709..6bcac3d91dd1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1131,7 +1131,6 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
return NULL;
}
--
2.31.1.751.gd2f1c929bd-goog


@@ -0,0 +1,124 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id 4FED7C433B4
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:14 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 33A1260E0B
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:14 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S229681AbhETGzd (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:33 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37868 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230365AbhETGzZ (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:25 -0400
Received: from mail-qv1-xf4a.google.com (mail-qv1-xf4a.google.com [IPv6:2607:f8b0:4864:20::f4a])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6DFF8C061574
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:04 -0700 (PDT)
Received: by mail-qv1-xf4a.google.com with SMTP id e15-20020a0caa4f0000b02901eedbb09299so10877261qvb.15
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:04 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=Vxs+IsfkjudRR9AMrF0VuRYwRpXVOuThAM5t2BO4Wtw=;
b=gEeAhMTYbAQslSO01e0o8YIym/dFnsQfB9lZrmyFhl8uqTqmQEDIIu/28e1BOlyvt9
rJm5/8Caqo1KIaunpdBy2LPtOXmfi9ZJDnuwRnb21JoByNrFkCChT4Z5xyeBqut1yHQm
/TlMDm6OPoewZgMjaOhRgLuU1i+Q2viLaBK5TcX/f4jp7CkEtCTn5SioWFrXLpHFPgfg
kYO7g2IN+CR6iy3EfEzmDy81m8wakeRxZZOx4HjJ7gGFFDfSfK4SyZnaOFS2lmsp1BrE
F/LSnBPYHDzzGJCEqa0RfGIu9OFnYG7fyBb1AzMWt5UOwD1Z5Gw0p3PCAJBE1ykqsTQ7
ooLQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=Vxs+IsfkjudRR9AMrF0VuRYwRpXVOuThAM5t2BO4Wtw=;
b=WqdLL8JjD4oD11umCae6nAxvQ9zl0qjRO1psbjN4FhRRkc/CQM+QJ7TCbayd5VCaUu
rtoZfhuozKEHvBBclv4jQzorRkiOVjwZGIX52HwBCvcXpPAQF69jkqG0WRjQKrUK9laH
TcyQDkdTbJztqqLLgoaMke0Pvw3qiJvr4a19wbHUQt+uwR4UXoL2NTQBotaVS1/Ji2zn
MhoxUD+f8pgFoCCCn/G5DgDvOIOCN2Ed8dcKb+fMyQn2Lhjc9xLUnlaGN6WdKwBpOq9Z
AFd/Ld6gumYWjtW7nBuum/ysfqbF5Au47sGCuLngPSTq/x4OIApMHQLjOikk1jB9ydlN
GRzA==
X-Gm-Message-State: AOAM5302cFNM+/KQLI85uCrC18olBzO35TXBlTOBABeaNnESgoIvYml8
1LlliNnJr3ok2uXDGXaBzjDPiIU7ddM=
X-Google-Smtp-Source: ABdhPJzxHNbGXEeZppZputW4eyMKgrpzCcHkInnC6cFjCLcGtMdRmVBNnbFHvD2nDRL//K+y6INSV9VSvtY=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:ad4:4184:: with SMTP id e4mr4097411qvp.13.1621493643517;
Wed, 19 May 2021 23:54:03 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:43 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-3-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 02/14] include/linux/nodemask.h: define next_memory_node()
if !CONFIG_NUMA
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
Currently next_memory_node only exists when CONFIG_NUMA=y. This patch
adds the macro for !CONFIG_NUMA.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
include/linux/nodemask.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index ac398e143c9a..89fe4e3592f9 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
--
2.31.1.751.gd2f1c929bd-goog


@@ -0,0 +1,150 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id 5894CC43460
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:18 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 3DF1B61355
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:18 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S230492AbhETGzi (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:38 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37880 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230430AbhETGz2 (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:28 -0400
Received: from mail-yb1-xb49.google.com (mail-yb1-xb49.google.com [IPv6:2607:f8b0:4864:20::b49])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id F2439C061761
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:05 -0700 (PDT)
Received: by mail-yb1-xb49.google.com with SMTP id p138-20020a2542900000b029051304a381d9so9042617yba.20
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:05 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=u+gVIGEYIPA/2ZnKQ/jn+syjhmo8lEqvUYZxU0dhdmw=;
b=ilLDWOeIOTKbkZAyTcceDtOpp2Z6oTFrQhsVyqk7X5l9/6+8NJYYI+dDrFdNy8GHPn
TXNwPsD+oRKvYdGx4axZhlkFzOkdcr+xYDHDYwdfV6GJubHW0qUcVuNhCtoKbC5rA7Rc
HLlOQqtRJZ/ivTzUig8CQccV040hHCbuz35dLgXbD1dVokwc1cOuKZaTLQpYVLsUP3Bu
MGJAAygLFemJO4Lj2rtnjvJG8CDZr9Z0uZhqKEqHkyenPQKZNhlA4Evgi1wYHSSLSqnJ
48ySo0abwH067PuNMNMETfFX32LpXeIda/dgmAGMAOCqUYbqyCKHzmjDuutRjkgtmoG5
3meg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=u+gVIGEYIPA/2ZnKQ/jn+syjhmo8lEqvUYZxU0dhdmw=;
b=kGPHggyxo0QzLlCv4G7r06Qb9zfq6cYJu4n3UotivJGpIGltx02xhr3dmw5myvMMTB
+ZbHFgrPBlvHRFOzV44X/BNunb0sZE08NWoL7Ukl2CPGFW5C4ojk1f5ULcVpNMkwMHLM
a8+V1TCPtJTZj/8tHL+9HnYHlYz/Bigq1ANURCC14mJjIXuCM5eCU6+JTuizAZzNiIIr
u8eXfLqGhtBe68cHudHsJtza4h2srgcHTDTznQxwVhruHz3nU10Sni6vFJNWMxYzEfHy
179XSwdgavLRIWc5CxN1biYoS2EkJnVyEf1eYcyMtFUArZKIHQtx6DsDN2wNQ2P6YXD5
BUtw==
X-Gm-Message-State: AOAM533YpaJcNo3c5ia3Q6gdyS41kyWgd490xt0zYDyNkyhH3hi4hj2m
uiBbgl6yA1VtsOKBod8/iMtZhtd4aXg=
X-Google-Smtp-Source: ABdhPJw2fnax4tdT5Wsay5+rPu2/CLppUyd3dEee27l274OvNNlEZeyxb+pNstV/LKb7q5/PkQmoAivSEq8=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a05:6902:513:: with SMTP id
x19mr5279236ybs.129.1621493645039; Wed, 19 May 2021 23:54:05 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:44 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-4-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 03/14] include/linux/cgroup.h: export cgroup_mutex
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
cgroup_mutex is needed to synchronize with memcg creations.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
include/linux/cgroup.h | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f2f79de083e..bd5744360cfa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -704,6 +715,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
--
2.31.1.751.gd2f1c929bd-goog


@@ -0,0 +1,214 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id 3FBCEC433B4
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:18 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 2678D6108C
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:18 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S230467AbhETGzf (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:35 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37890 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230435AbhETGz2 (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:28 -0400
Received: from mail-yb1-xb49.google.com (mail-yb1-xb49.google.com [IPv6:2607:f8b0:4864:20::b49])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id DC741C0613CE
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:07 -0700 (PDT)
Received: by mail-yb1-xb49.google.com with SMTP id 129-20020a2504870000b0290513326cc5e0so8674080ybe.10
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:07 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=SyTpiiLQc7KnB8Q1E8X3BxXutjquJ4KYbVlpDQgVi/g=;
b=F/5vtOLfn2hWqJCcZJILEPNvAi4G3UC/HROV8n8s10GH7JhHnrGHdEho6MiIGVETaO
sHn+wn+lopXgJMLEqp5WqaQ769JJNG7YB4Pq15oo9pv+HRPYGP/d500+gP+KrGyChFzI
iRtkvAcNwlgumar+mpa5HZRGCb08Jm1ZBJ5134Kg6M2RP3KBMa9LpRBW+jA/uB2ZH6dY
SHmfSiGBjz0MLdKbjMO0ZC+E0iCgLKKyI3liy35dgrf7U0uAsmS+Tq+vBabFfUY8cvI2
9S4m0Grod6BK2vh7Cxh9tBxuiOnpUkk6GOwodZ5MXTpgU9J25Ztod8Cas20KXVuUUu5L
0caQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=SyTpiiLQc7KnB8Q1E8X3BxXutjquJ4KYbVlpDQgVi/g=;
b=I/lOZpe4tWCS0mala3TZsfeszZC2f6ezlF6QsZ4a/ik+ur09NYRA+x5bJGwlW5GDlq
UnLRSTXE7lfK71dSWsWOnbdahNawKurDhQQHAWYazRhXZbx8wQr8tVsCQRJt62tjdH10
3hySRpK88u8siMlEwsnnjOD9xPsxcVvC9q60ppd6Eg2OjbtByyWxV86qM2x45/Wp9SgS
1K0/jwK6L1A8aN+ccrl6RiewC05OjfkXbKE3qp2X4jLzxo0Z5jchuk4yzUI9a9UtqjR2
jluXY0tCVPqedtSHAZ1h+oHspBuclW26af/5c7EGck5IwrADYKN8hL9LfA0GmuxthLa8
CT+g==
X-Gm-Message-State: AOAM530i7XGlRCP9jxfMmcb0QrUXw8gBhH02/rSREHScRnCCUJvVu8zh
smjHRFK1+f6BrShUrxMXHYEhJlEv03M=
X-Google-Smtp-Source: ABdhPJxef10azDAzK7UEGotre/hx9MbP+rt8RSM5uDH+7LMog0h8qkdAugFcXq/qcLN78UEvnUxUn/nobaI=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a25:2009:: with SMTP id g9mr4983935ybg.198.1621493646666;
Wed, 19 May 2021 23:54:06 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:45 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-5-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 04/14] mm, x86: support the access bit on non-leaf PMD entries
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
Some architectures support the accessed bit on non-leaf PMD entries
(parents) in addition to leaf PTE entries (children) where pages are
mapped, e.g., x86_64 sets the accessed bit on a parent when using it
as part of linear-address translation [1]. Page table walkers who are
interested in the accessed bit on children can take advantage of this:
they do not need to search the children when the accessed bit is not
set on a parent, given that they have previously cleared the accessed
bit on this parent.
[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3 (October 2019), section 4.8
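As a hedged illustration of the walker pattern described above (scan_pte_range() is a hypothetical helper; pmdp_test_and_clear_young() is the primitive this patch extends to non-THP configs):

/*
 * Illustrative fragment, not from the patch: skip a PMD's children
 * when its accessed bit has stayed clear since we last cleared it.
 */
static void walk_pmd_sketch(struct vm_area_struct *vma,
			    unsigned long addr, pmd_t *pmd)
{
	/* Accessed bit still clear => no child PTE was used. */
	if (!pmdp_test_and_clear_young(vma, addr, pmd))
		return;

	scan_pte_range(vma, addr, pmd);
}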
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
arch/Kconfig | 9 +++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 2 +-
arch/x86/mm/pgtable.c | 5 ++++-
include/linux/pgtable.h | 4 ++--
5 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index c45b770d3579..e3812adc69f7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -826,6 +826,15 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
bool
+config HAVE_ARCH_PARENT_PMD_YOUNG
+ bool
+ depends on PGTABLE_LEVELS > 2
+ help
+ Architectures that select this are able to set the accessed bit on
+ non-leaf PMD entries in addition to leaf PTE entries where pages are
+ mapped. For them, page table walkers that clear the accessed bit may
+ stop at non-leaf PMD entries if they do not see the accessed bit.
+
config HAVE_ARCH_HUGE_VMAP
bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b44190..f619055c4537 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -170,6 +170,7 @@ config X86
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
+ select HAVE_ARCH_PARENT_PMD_YOUNG if X86_64
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b1099f2d9800..3a24d2af4e9b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -846,7 +846,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return ((pmd_flags(pmd) | _PAGE_ACCESSED) & ~_PAGE_USER) != _KERNPG_TABLE;
}
static inline unsigned long pages_to_mb(unsigned long npg)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d27cf69e811d..b968d6bd28b6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 46b13780c2c8..94ecc1d277a2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -193,7 +193,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -214,7 +214,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
--
2.31.1.751.gd2f1c929bd-goog

View File

@@ -0,0 +1,323 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id 614A3C43461
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:20 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 44F1D61186
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:20 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S230499AbhETGzj (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:39 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37910 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230452AbhETGzb (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:31 -0400
Received: from mail-qv1-xf4a.google.com (mail-qv1-xf4a.google.com [IPv6:2607:f8b0:4864:20::f4a])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 245E9C06138B
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:09 -0700 (PDT)
Received: by mail-qv1-xf4a.google.com with SMTP id c5-20020a0ca9c50000b02901aede9b5061so12455193qvb.14
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:09 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=Mvah71zeYWGfuEGRbEsLqflL7nXzJ5AdEYR+UovaRYY=;
b=EbaEOCfalGO/Os4OKgi4M0ux2tbj/9YV7PKsVCGQdr/8gcQO1wsCl7ywZY/pNC7eXz
NoDBi8g1D9jnfogpVvkt+RSkZlQ/wIQfMR8guk0/qk6EZebG/utx01m5VEv0G0jHv0Zr
k6d+sXr5o4NS2Kl/7Ur6tOhmyQYo1mJS8W6wy8htCD9qRhKO9rljjjcNNoQFh7jF53I2
oqJdy/ZRwC1k/6/iastZquGfCQ1ZDPp9qbDEfPp6RfaePLHAvS2mEcu3b5IlddG8UjMG
gnQkyzTi9RZ60CdCTtFo/33uy+SQMY1vKs2glF5gunlSHFA1EaqvtsVi1W2ngxGWiGKh
ajyA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=Mvah71zeYWGfuEGRbEsLqflL7nXzJ5AdEYR+UovaRYY=;
b=fKmHfVBojnVD7v0HScBBju/h6BeYHqtiGWztHdkYBBMbAblj6OoCIkOdzbET71QBoY
R3bKjlFy/3PcckxPJrbFRhSVsTk4faV9uQfJlZuedG4G3O2EY6PeqhlGN722JnrxjpVk
og7sFMeaPJCcbmkZyC+jm4xmnS5Ox2CVjXqgw96+ViRsnfjSJ3Vvu0mOk/ab+jfZ3/ZB
HW3plDWWuAO8ijU0AazsQOOG3rvfr/szKKmeJs3e5a7HPpChkX9wvZKAnlyPw+6MvtLe
ssM2BgmFkGzG2yd8AzvepX/afdU14K4bigWqSjN8IRR7JVDnGdBKYR4N/tH0JI/apOet
Rn1g==
X-Gm-Message-State: AOAM5337ZUQnuxydxDF/VBFRzHtx51o3/N5HDpf9MYMdQrx5kcdbVyhf
HEyO2/+GFcfRnTIPxodPADdKQi6qUwA=
X-Google-Smtp-Source: ABdhPJxyjS6uG8i8Sad50t/5Pf/9RTagtFbxDvAuxBuu8l0odJlhqIGjN9aFII0GYF+uFWIxSdphl51ZKHc=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a0c:d84d:: with SMTP id i13mr3839330qvj.32.1621493648268;
Wed, 19 May 2021 23:54:08 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:46 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-6-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 05/14] mm/vmscan.c: refactor shrink_node()
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
Heuristics that determine the scan balance between the anon and file
LRUs are largely independent of the rest of shrink_node(). Move them
into a separate function to improve readability.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
1 file changed, 98 insertions(+), 88 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5199b9696bab..2339459c97d4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2421,6 +2421,103 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
@@ -2866,7 +2963,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
@@ -2876,93 +2972,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
--
2.31.1.751.gd2f1c929bd-goog

View File

@@ -0,0 +1,233 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id 173FBC433B4
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:22 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id EA57F61261
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:21 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S230452AbhETGzl (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:41 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37912 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230466AbhETGzc (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:32 -0400
Received: from mail-qk1-x749.google.com (mail-qk1-x749.google.com [IPv6:2607:f8b0:4864:20::749])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id C46CEC061574
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:10 -0700 (PDT)
Received: by mail-qk1-x749.google.com with SMTP id d201-20020ae9efd20000b02902e9e9d8d9dcso11687575qkg.10
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:10 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=Yl+DjwsBODMfyiL17kQw/9BLrEC9zXFz8Mxu59WzP4Y=;
b=Mrzy2M2k9QAzqf4Qlq2wFgC1uycH9/GSOy89uVYR+gUD3oaKMWpOn95M8hs2EzT1FG
/N40V1/URD7mP48ZsP72lzG3rMvI0SepioQCu+0asiEBUJUrtY6kEz2CKTJEB4MwAGRU
xnH/e/C5szSot199E2rMI+ZUJo/y8pBDfNIzZ7XzQ811Wxr6oM1C4DVA6DHQSWtdqS5J
VMMjdjvsXW8hHCzs+5W06EYb73kJeqPHOFZ+XFMWXFrm8l/F2qujro4FMOgux0JB/XLW
32qxH7ovQyCHL8Gg6vGigkolgFZhe6oag4JfCx0cj6eFlP+2j2w5ryU0kRvvHVILA9Xq
e6Hg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=Yl+DjwsBODMfyiL17kQw/9BLrEC9zXFz8Mxu59WzP4Y=;
b=Nel79d/ExWs1HdZZs12GqadGHxgI3W8FnqvRED1IBxJnCtiluti6ndyhp/JXtjDIct
YtIRkMNVwmwMk6EFx5QOv0Br5VYQ/72hZsMd8kNj9z/m8CUtpMnJluKVNeyNT+Livkww
y2hwgGJiuvWxBIy+ja/GH64SkCJTuttiOxpNFaRxB1STfhM2PjwwwiQG5GTxbqkkn3Dg
fKBlHYI17sQ05tZRovJETs0f+1wBQPftjwjm6PJzpZ3ooNcBXdB6hRUrGZ6Pmyf2bRBR
BptRYxbaQdSprCABGMW/2ySltaJaFitv6fShejQMxDX8xe+JDYJ0kEn9/3aMVEt+Vy+X
rTAw==
X-Gm-Message-State: AOAM532IBC74aEi91Xqgl2rYw1QINB7mrdZT3v/EvERcHHCbI6v9/2cU
Qe2UGq6f4OIKykMrwADvTCo5whMI+DQ=
X-Google-Smtp-Source: ABdhPJyyAX33aocRYynTtALpNyjv0w+Wa7lDS9awJiNK6me024wMLg+4FL2RHzwNDLwZg9DFBZ+B1LiDSsc=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a05:6214:18d:: with SMTP id
q13mr3804726qvr.60.1621493649877; Wed, 19 May 2021 23:54:09 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:47 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-7-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 06/14] mm/workingset.c: refactor pack_shadow() and unpack_shadow()
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
This patch moves the bucket order and PageWorkingset() out of
pack_shadow() and unpack_shadow(). It has no merit on its own; it only
reduces the churn in the upcoming changes to mm/workingset.c.
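
For reference, a minimal userspace model of the resulting packing
scheme, minus the xarray value tagging (the shift widths below are
illustrative placeholders, not the kernel's actual NODES_SHIFT and
MEM_CGROUP_ID_SHIFT values):

#include <stdio.h>

#define NODES_SHIFT		6	/* toy width: up to 64 nodes */
#define MEM_CGROUP_ID_SHIFT	16	/* toy width: up to 64K memcg ids */

static unsigned long pack(int memcg_id, int nid, unsigned long val)
{
	val = (val << MEM_CGROUP_ID_SHIFT) | memcg_id;
	val = (val << NODES_SHIFT) | nid;
	return val;
}

static unsigned long unpack(unsigned long val, int *memcg_id, int *nid)
{
	*nid = val & ((1UL << NODES_SHIFT) - 1);
	val >>= NODES_SHIFT;
	*memcg_id = val & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	return val >> MEM_CGROUP_ID_SHIFT;
}

int main(void)
{
	int memcg_id, nid;
	unsigned long val = unpack(pack(42, 3, 0x1234), &memcg_id, &nid);

	/* callers now handle the bucket order and the workingset bit */
	printf("memcg=%d nid=%d val=%#lx\n", memcg_id, nid, val);
	return 0;
}
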
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
mm/workingset.c | 53 ++++++++++++++++++++-----------------------------
1 file changed, 22 insertions(+), 31 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index b7cdeca5a76d..edb8aed2587e 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -168,9 +168,9 @@
* refault distance will immediately activate the refaulting page.
*/
-#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
-#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+#define EVICTION_SHIFT (BITS_PER_XA_VALUE - MEM_CGROUP_ID_SHIFT - NODES_SHIFT)
+#define EVICTION_MASK (BIT(EVICTION_SHIFT) - 1)
+#define WORKINGSET_WIDTH 1
/*
* Eviction timestamps need to be able to cover the full range of
@@ -182,36 +182,23 @@
*/
static unsigned int bucket_order __read_mostly;
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
- bool workingset)
+static void *pack_shadow(int memcg_id, struct pglist_data *pgdat, unsigned long val)
{
- eviction >>= bucket_order;
- eviction &= EVICTION_MASK;
- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
- eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << 1) | workingset;
+ val = (val << MEM_CGROUP_ID_SHIFT) | memcg_id;
+ val = (val << NODES_SHIFT) | pgdat->node_id;
- return xa_mk_value(eviction);
+ return xa_mk_value(val);
}
-static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp, bool *workingsetp)
+static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_data **pgdat)
{
- unsigned long entry = xa_to_value(shadow);
- int memcgid, nid;
- bool workingset;
+ unsigned long val = xa_to_value(shadow);
- workingset = entry & 1;
- entry >>= 1;
- nid = entry & ((1UL << NODES_SHIFT) - 1);
- entry >>= NODES_SHIFT;
- memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
- entry >>= MEM_CGROUP_ID_SHIFT;
+ *pgdat = NODE_DATA(val & (BIT(NODES_SHIFT) - 1));
+ val >>= NODES_SHIFT;
+ *memcg_id = val & (BIT(MEM_CGROUP_ID_SHIFT) - 1);
- *memcgidp = memcgid;
- *pgdat = NODE_DATA(nid);
- *evictionp = entry << bucket_order;
- *workingsetp = workingset;
+ return val >> MEM_CGROUP_ID_SHIFT;
}
/**
@@ -266,8 +253,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
+ eviction >>= bucket_order;
+ eviction = (eviction << WORKINGSET_WIDTH) | PageWorkingset(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ return pack_shadow(memcgid, pgdat, eviction);
}
/**
@@ -294,7 +283,7 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ eviction = unpack_shadow(shadow, &memcgid, &pgdat);
rcu_read_lock();
/*
@@ -318,6 +307,8 @@ void workingset_refault(struct page *page, void *shadow)
goto out;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
+ workingset = eviction & (BIT(WORKINGSET_WIDTH) - 1);
+ eviction = (eviction >> WORKINGSET_WIDTH) << bucket_order;
/*
* Calculate the refault distance
@@ -335,7 +326,7 @@ void workingset_refault(struct page *page, void *shadow)
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
*/
- refault_distance = (refault - eviction) & EVICTION_MASK;
+ refault_distance = (refault - eviction) & (EVICTION_MASK >> WORKINGSET_WIDTH);
/*
* The activation decision for this page is made at the level
@@ -593,7 +584,7 @@ static int __init workingset_init(void)
unsigned int max_order;
int ret;
- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ BUILD_BUG_ON(EVICTION_SHIFT < WORKINGSET_WIDTH);
/*
* Calculate the eviction bucket size to cover the longest
* actionable refault distance, which is currently half of
@@ -601,7 +592,7 @@ static int __init workingset_init(void)
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits = EVICTION_SHIFT - WORKINGSET_WIDTH;
max_order = fls_long(totalram_pages() - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
--
2.31.1.751.gd2f1c929bd-goog

View File

@@ -0,0 +1,686 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id AEEEDC43461
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:26 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 4D74961186
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:26 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S231144AbhETGzp (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:45 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37910 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230478AbhETGzg (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:36 -0400
Received: from mail-qv1-xf4a.google.com (mail-qv1-xf4a.google.com [IPv6:2607:f8b0:4864:20::f4a])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 5CD6FC061763
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:14 -0700 (PDT)
Received: by mail-qv1-xf4a.google.com with SMTP id x2-20020a0cda020000b02901edb4c412fdso12424236qvj.11
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:14 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=Jb580jSe4IcT6fVqPR22jrL3z+VNcMEKM2UgbfL90k4=;
b=rTxj5e7tRY5wx29jetDGP8dUly4vBHNX0SBJeZKRsCOEiHaQ+coy05du1f4bT6oCWw
rJWrdbUyp5aci9MKmCQ2Z5qPBf7F+zDTL+8wpoufyGbRvdGkfwDkAgQV6LLsi9xZzdyr
bpcyHItG1lIReRXOkR0GKWNz8GfEVNO7lE+G6Sc1sHPUEEfw3FF5Vl/Wta1OxKsGQQe4
02oeo8STGdqGF0yOczRyqWZ/SBFcNGiPQ7nrGaWA3FguRBAwZ2dOrTrmM5ug10rbOQmf
L/m3eja1mOwffFkrgumZ0Sm9KZ5sbKJNbLAjPYQAmAcoXhU/NVnLrMVtxSGppGFwdyOz
NMsw==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=Jb580jSe4IcT6fVqPR22jrL3z+VNcMEKM2UgbfL90k4=;
b=i0Py5qwrQv4OOBWcpJcYxjG5lgHvV4Gq3X3fG5L0aB3lLLnS3mObKdM6XG+uYC1b1G
z/Sfx4n/1+/0EPZnFoo80K1ry0Y7SD/W30OUEPR8PValuCLHEHzVeoVK2+TPI8DMEzz1
r+jWpxZkah8B613QrPIvvcZSIb0lxcsV6JxpYjFixO/mizct7mrdls35j1Thb7ehgWtO
W5aAGiMIxBprDhKHJ2D2Oz85hWRyQYND4jEA68bzh9ybz4cYMVIX3C+9uH+cVIhZ6JZL
febwADPME4CsH8gMntK/GWzf5Yu+sdeBYn+6VJKrG/4c7dWi0xgFGWYrgCtSlk8kVgOe
bH1w==
X-Gm-Message-State: AOAM530OUX0GiyYChE/1C1GuJXPP4zDS9QrWZKB+3aIFDiz73ADIQaxu
gJxNX12VvCvNdCSId0kuSWl88ETTfcg=
X-Google-Smtp-Source: ABdhPJz4m/yxkWn5wBamzXd/wEoVvHq3AOPsnc1+c/ewg4oojPM6XcGKJYYybO2Mtsb6BDRPtu5ccAJRcHw=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a0c:edcf:: with SMTP id i15mr4021372qvr.10.1621493653456;
Wed, 19 May 2021 23:54:13 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:49 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-9-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 08/14] mm: multigenerational lru: activation
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
For pages accessed multiple times via file descriptors, instead of
activating them upon the second access, we activate them based on the
refault rates of their tiers. Each generation contains at most
MAX_NR_TIERS tiers, which require MAX_NR_TIERS-2 additional bits in
page->flags. Pages accessed N times via file descriptors belong to
tier order_base_2(N); a toy model of this tier assignment follows the
list below. Tier 0 is the base tier; it contains pages that were read
ahead, accessed once via file descriptors, or accessed only via page
tables. Pages from the base tier are evicted regardless of the
refault rate. Pages from upper tiers that have higher refault rates
than the base tier will be moved to the next generation. A feedback
loop modeled after the PID controller monitors the refault rates
across all tiers and decides which upper tiers to activate pages from
in the reclaim path. The advantages of this model are:
1) It has a negligible cost in the buffered IO access path because
activations are done optionally in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting
pages accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
twice when workloads doing intensive buffered IO are under memory
pressure.
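
The tier assignment mentioned above, as a standalone toy
(order_base_2() here reimplements the kernel macro of the same name):

#include <stdio.h>

/* order_base_2(n): the smallest k with 2^k >= n (0 for n <= 1) */
static int order_base_2(unsigned int n)
{
	int k = 0;

	while ((1u << k) < n)
		k++;
	return k;
}

int main(void)
{
	/* N = 0 (mapped or read ahead only) also lands in the base tier */
	for (unsigned int n = 1; n <= 8; n++)
		printf("accessed %u time(s) via fd -> tier %d\n",
		       n, order_base_2(n));
	return 0;
}
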
For pages mapped upon page faults, the accessed bit is set during the
initial faults. Ideally we add them to the per-zone lists indexed by
max_seq, i.e., the youngest generation, so that eviction will not
consider them before the aging has scanned them. For anon pages not in
swap cache, this can be done easily in the page fault path: we rename
lru_cache_add_inactive_or_unevictable() to lru_cache_add_page_vma()
and add a new parameter, which is set to true for pages mapped upon
page faults. For pages in page cache or swap cache, we cannot
differentiate the page fault path from the readahead path at the time
we call lru_cache_add(). So we add them to the per-zone lists indexed
by min_seq, i.e., the oldest generation, for now.

Finally, we need to make sure deactivation works when the
multigenerational lru is enabled. We cannot use PageActive() because
it is not set on pages in active generations; this spares the aging
the trouble of clearing it when active generations become inactive.
So we deactivate pages unconditionally, since deactivation is not a
hot path worth additional optimization.
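
The activation decision itself, modeled on positive_ctrl_err() in the
diff below: refault rates are compared by cross-multiplying, so no
division (or divide-by-zero guard on empty tiers) is needed. This
simplified model omits the kernel's SWAP_CLUSTER_MAX short circuit:

#include <stdio.h>

struct pos {
	unsigned long refaulted;
	unsigned long total;
	int gain;
};

/* allow eviction iff the gain-weighted refault rate of the upper tier
 * (pv) does not exceed that of the base tier (sp), rewritten as a
 * cross-multiplication */
static int ok_to_evict(struct pos *sp, struct pos *pv)
{
	unsigned long sp_total = sp->total ? sp->total : 1;
	unsigned long pv_total = pv->total ? pv->total : 1;

	return pv->refaulted * sp_total * sp->gain <=
	       sp->refaulted * pv_total * pv->gain;
}

int main(void)
{
	struct pos base = { .refaulted = 100, .total = 1000, .gain = 1 };
	struct pos tier2 = { .refaulted = 30, .total = 200, .gain = 1 };

	/* 30/200 = 15% > 100/1000 = 10%: keep protecting tier 2 */
	printf("evict tier 2? %s\n", ok_to_evict(&base, &tier2) ? "yes" : "no");
	return 0;
}
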
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
include/linux/mm_inline.h | 40 ++++++++++++++
include/linux/swap.h | 4 +-
kernel/events/uprobes.c | 2 +-
mm/huge_memory.c | 2 +-
mm/khugepaged.c | 2 +-
mm/memory.c | 10 ++--
mm/migrate.c | 2 +-
mm/swap.c | 22 +++++---
mm/swapfile.c | 2 +-
mm/userfaultfd.c | 2 +-
mm/vmscan.c | 91 ++++++++++++++++++++++++++++++-
mm/workingset.c | 112 ++++++++++++++++++++++++++++++++++++++
12 files changed, 269 insertions(+), 22 deletions(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ae3e3826dd7f..f3b99f65a652 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
return seq % MAX_NR_GENS;
}
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+ return order_base_2(usage + 1);
+}
+
/* Return a proper index regardless whether we keep a full history of stats. */
static inline int hist_from_seq_or_gen(int seq_or_gen)
{
@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return true;
}
+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
+static inline int page_tier_usage(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return flags & BIT(PG_workingset) ?
+ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+}
+
+/* Increment the usage counter after a page is accessed via file descriptors. */
+static inline void page_inc_usage(struct page *page)
+{
+ unsigned long usage;
+ unsigned long old_flags, new_flags;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+
+ if (!(old_flags & BIT(PG_workingset))) {
+ new_flags = old_flags | BIT(PG_workingset);
+ continue;
+ }
+
+ usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
+
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
#else /* CONFIG_LRU_GEN */
static inline bool lru_gen_enabled(void)
@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
return false;
}
+static inline void page_inc_usage(struct page *page)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static __always_inline void add_page_to_lru_list(struct page *page,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..30b1f15f5c6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma);
+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+ bool faulting);
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6addc9780319..4e93e5602723 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, false);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8ac9093e5a0d..681da4a3cf61 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,7 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..09e5346c2754 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1198,7 +1198,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, true);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
diff --git a/mm/memory.c b/mm/memory.c
index 730daa00952b..a76196885f92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -839,7 +839,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
page_add_new_anon_rmap(new_page, dst_vma, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+ lru_cache_add_page_vma(new_page, dst_vma, false);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
@@ -2950,7 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ lru_cache_add_page_vma(new_page, vma, true);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3479,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
@@ -3625,7 +3625,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3793,7 +3793,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, true);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/migrate.c b/mm/migrate.c
index b234c3f3acb7..d3307c9eced4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2967,7 +2967,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
if (!is_zone_device_page(page))
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, false);
get_page(page);
if (flush) {
diff --git a/mm/swap.c b/mm/swap.c
index dfb48cf9c2c9..96ce95eeb2c9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page)
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
+ } else if (lru_gen_enabled()) {
+ page_inc_usage(page);
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
@@ -478,15 +480,14 @@ void lru_cache_add(struct page *page)
EXPORT_SYMBOL(lru_cache_add);
/**
- * lru_cache_add_inactive_or_unevictable
+ * lru_cache_add_page_vma
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * Place @page on an LRU list, depending on its evictability.
*/
-void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+ bool faulting)
{
bool unevictable;
@@ -503,6 +504,11 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
+
+ /* tell the multigenerational lru that the page is being faulted in */
+ if (lru_gen_enabled() && !unevictable && faulting)
+ SetPageActive(page);
+
lru_cache_add(page);
}
@@ -529,7 +535,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
*/
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
- bool active = PageActive(page);
+ bool active = PageActive(page) || lru_gen_enabled();
int nr_pages = thp_nr_pages(page);
if (PageUnevictable(page))
@@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -684,7 +690,7 @@ void deactivate_file_page(struct page *page)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3598b668f533..549e94318b2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ lru_cache_add_page_vma(page, vma, false);
}
swap_free(entry);
out:
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e14b3820c6a8..175d55b4f594 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ lru_cache_add_page_vma(page, dst_vma, true);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7bbfc0b1ebd..84d25079092e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+
+ /* get a shadow entry before page_memcg() is cleared */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
@@ -2780,6 +2782,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
}
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the current refault rate refaulted/(evicted+activated), which
+ * has a weight of 1. The I term is the arithmetic mean of the last N refault
+ * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N).
+ *
+ * Our goal is to make sure upper tiers have similar refault rates as the base
+ * tier. That is we try to be fair to all tiers by maintaining similar refault
+ * rates across them.
+ */
+struct controller_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+ int type, int tier, int gain)
+{
+ struct lrugen *lrugen = &lruvec->evictable;
+ int hist = hist_from_seq_or_gen(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->activated[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+ int tier;
+ int hist = hist_from_seq_or_gen(gen);
+ struct lrugen *lrugen = &lruvec->evictable;
+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (!carryover && NR_STAT_GENS == 1)
+ return;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->activated[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+ if (NR_STAT_GENS > 1)
+ continue;
+ }
+
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0);
+ }
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+ /*
+ * Allow eviction if the PV has a limited number of refaulted pages or a
+ * lower refault rate than the SP.
+ */
+ return pv->refaulted < SWAP_CLUSTER_MAX ||
+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
/******************************************************************************
* state change
******************************************************************************/
diff --git a/mm/workingset.c b/mm/workingset.c
index edb8aed2587e..3f3f03d51ea7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
return val >> MEM_CGROUP_ID_SHIFT;
}
+#ifdef CONFIG_LRU_GEN
+
+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
+#endif
+
+static void page_set_usage(struct page *page, int usage)
+{
+ unsigned long old_flags, new_flags;
+
+ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+ if (!usage)
+ return;
+
+ do {
+ old_flags = READ_ONCE(page->flags);
+ new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
+ ((usage - 1UL) << LRU_USAGE_PGOFF);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+ int hist, tier;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ int type = page_is_file_lru(page);
+ int usage = page_tier_usage(page);
+ struct mem_cgroup *memcg = page_memcg(page);
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
+}
+
+/* Account a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+ int hist, tier, usage;
+ int memcg_id;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lrugen *lrugen;
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ int type = page_is_file_lru(page);
+
+ token = unpack_shadow(shadow, &memcg_id, &pgdat);
+ if (page_pgdat(page) != pgdat)
+ return;
+
+ rcu_read_lock();
+ memcg = page_memcg_rcu(page);
+ if (mem_cgroup_id(memcg) != memcg_id)
+ goto unlock;
+
+ usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
+ token >>= LRU_USAGE_SHIFT;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->evictable;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
+ goto unlock;
+
+ page_set_usage(page, usage);
+
+ hist = hist_from_seq_or_gen(min_seq);
+ tier = lru_tier_from_usage(usage);
+ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+ if (tier)
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(page);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(page, shadow);
+ return;
+ }
+
eviction = unpack_shadow(shadow, &memcgid, &pgdat);
rcu_read_lock();
--
2.31.1.751.gd2f1c929bd-goog

View File

@@ -0,0 +1,789 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
aws-us-west-2-korg-lkml-1.web.codeaurora.org
X-Spam-Level:
X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED,
DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS,
USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=unavailable autolearn_force=no
version=3.4.0
Received: from mail.kernel.org (mail.kernel.org [198.145.29.99])
by smtp.lore.kernel.org (Postfix) with ESMTP id 658D8C433B4
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:28 +0000 (UTC)
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
by mail.kernel.org (Postfix) with ESMTP id 477CA611BE
for <linux-kernel@archiver.kernel.org>; Thu, 20 May 2021 06:54:28 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
id S231165AbhETGzr (ORCPT <rfc822;linux-kernel@archiver.kernel.org>);
Thu, 20 May 2021 02:55:47 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37908 "EHLO
lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
with ESMTP id S230424AbhETGzh (ORCPT
<rfc822;linux-kernel@vger.kernel.org>);
Thu, 20 May 2021 02:55:37 -0400
Received: from mail-qv1-xf4a.google.com (mail-qv1-xf4a.google.com [IPv6:2607:f8b0:4864:20::f4a])
by lindbergh.monkeyblade.net (Postfix) with ESMTPS id EAF95C061574
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:15 -0700 (PDT)
Received: by mail-qv1-xf4a.google.com with SMTP id r11-20020a0cb28b0000b02901c87a178503so12393761qve.22
for <linux-kernel@vger.kernel.org>; Wed, 19 May 2021 23:54:15 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=google.com; s=20161025;
h=date:in-reply-to:message-id:mime-version:references:subject:from:to
:cc;
bh=P78haeNjzr5Qg1JjQymXtCqtqXQumRFjJWFx1f2kmKM=;
b=Tjsj7/GeS8mUtREXLxPPRM0sVotzXnOQ/Dq8MvDajXLm9nT1QjyleqN5ONXOxfHJSb
gOKQ1YJhBwyuC3HCKJXdOCqgqOmQbjJGjOkM9uXhZa9/9W+Bvnszx1RDX4YRwIqqWgFX
flJvQvCE2SODYJwvTs6wKWKKQlvvw9WY05ct8oakXuEPnAOblfqTR+pbk7GoCJo67kNf
enTegbyR2yRwGi9N5coUMJM8TYP+BoBWQaHNTVR3nL7a6nEjAg1IrL1w4WaZ+/fsdDdF
6FlorKJ31sPCd2wxkCOnn+o98vuymHUDmyr+h9KxZtecLKHCkTsolSRuLiyHQvlzqY3q
md3Q==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20161025;
h=x-gm-message-state:date:in-reply-to:message-id:mime-version
:references:subject:from:to:cc;
bh=P78haeNjzr5Qg1JjQymXtCqtqXQumRFjJWFx1f2kmKM=;
b=oK3flk/MdWi/bqnKFxC7O7BqH1b1apkGTQgT4OLVuSurUs5o7HcTTMvjXuljN/KmMh
/OGEWIkS+BHD6OkEE9W7Q/5GoGXL7Np1sLByjbiNrfCNZHtmEvYLHtP9lYulkcWaLTgA
XEr3n9zWofP9Jw0bPM24RW8jqzAlzld2tkrpDSgnfmMEpyzmjuFEURnKsx/ubUbuQ8Vd
rkIngqIt1YDBI+x6EZEdq4OpP+8H9TDr8KZBjUVfzpvASnMYn2y9gZX4Obd5/t+wys2m
zn5+4aqeR8mtxQVzHwPM48LG5wPqbTtMF0+Mhoba0Enk55ZL29+xKT00ltswnvHNJDj9
UduQ==
X-Gm-Message-State: AOAM5324lhHETXZQ7vXVsQ3UhfF140iLgXV/soebRFc0ECp355pnwH5X
pEYaLnlH20Lc9hBvEeYp/HXipMEwsdE=
X-Google-Smtp-Source: ABdhPJxAkOjDRLTPPi669WBE6Bb6QiyW8Wr0JRRG09c2L2y7UvYt7Th6JQxML99ZXqbjrM7T5yJPx76NwGo=
X-Received: from yuzhao.bld.corp.google.com ([2620:15c:183:200:595d:62ee:f08:8e83])
(user=yuzhao job=sendgmr) by 2002:a05:6214:76b:: with SMTP id
f11mr3992753qvz.8.1621493655061; Wed, 19 May 2021 23:54:15 -0700 (PDT)
Date: Thu, 20 May 2021 00:53:50 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-10-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 09/14] mm: multigenerational lru: mm_struct list
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>
In order to scan page tables, we add infrastructure to maintain either
a system-wide mm_struct list or per-memcg mm_struct lists, and to
track whether an mm_struct is being used or has been used since the
last scan.

Multiple threads can concurrently work on the same mm_struct list, and
each of them will be given a different mm_struct belonging to a
process that has been scheduled since the last scan.
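
A userspace toy of that hand-out scheme (hypothetical names; the
kernel version in the diff below also tracks usage per node and per
memcg). Each walker thread takes the next unvisited entry under a
lock and processes it outside the lock, so no two walkers ever get
the same mm_struct. Compile with -pthread:

#include <pthread.h>
#include <stdio.h>

struct mm {
	struct mm *next;
	int used_since_last_scan;	/* the kernel sets this at context switch */
	int id;
};

static pthread_mutex_t cursor_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mm *cursor;		/* next mm_struct to hand out */

static struct mm *get_next_mm(void)
{
	struct mm *mm;

	pthread_mutex_lock(&cursor_lock);
	/* skip mm_structs not scheduled since the last scan */
	while (cursor && !cursor->used_since_last_scan)
		cursor = cursor->next;
	mm = cursor;
	if (cursor)
		cursor = cursor->next;
	pthread_mutex_unlock(&cursor_lock);
	return mm;
}

static void *walker(void *arg)
{
	struct mm *mm;

	while ((mm = get_next_mm()))
		printf("walker %ld scans mm %d\n", (long)arg, mm->id);
	return NULL;
}

int main(void)
{
	struct mm mms[4] = {
		{ &mms[1], 1, 0 }, { &mms[2], 0, 1 },
		{ &mms[3], 1, 2 }, { NULL, 1, 3 },
	};
	pthread_t t[2];

	cursor = &mms[0];
	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, walker, (void *)i);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}
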
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
fs/exec.c | 2 +
include/linux/memcontrol.h | 6 +
include/linux/mm_types.h | 107 ++++++++++++
kernel/exit.c | 1 +
kernel/fork.c | 10 ++
kernel/kthread.c | 1 +
kernel/sched/core.c | 2 +
mm/memcontrol.c | 28 ++++
mm/vmscan.c | 324 +++++++++++++++++++++++++++++++++++++
9 files changed, 481 insertions(+)
diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..c691d4d7720c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+ lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
+ lru_gen_switch_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6bcac3d91dd1..60601a997433 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -230,6 +230,8 @@ struct obj_cgroup {
};
};
+struct lru_gen_mm_list;
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -349,6 +351,10 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_LRU_GEN
+ struct lru_gen_mm_list *mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5aacc1c10a45..b0f662555eae 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -15,6 +15,8 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>
#include <asm/mmu.h>
@@ -561,6 +563,22 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
+#endif
+#ifdef CONFIG_LRU_GEN
+ struct {
+ /* the node of a global or per-memcg mm_struct list */
+ struct list_head list;
+#ifdef CONFIG_MEMCG
+ /* points to the memcg of the owner task above */
+ struct mem_cgroup *memcg;
+#endif
+ /* whether this mm_struct has been used since the last walk */
+ nodemask_t nodes;
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ /* the number of CPUs using this mm_struct */
+ atomic_t nr_cpus;
+#endif
+ } lrugen;
#endif
} __randomize_layout;
@@ -588,6 +606,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
+#ifdef CONFIG_LRU_GEN
+
+void lru_gen_init_mm(struct mm_struct *mm);
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
+void lru_gen_free_mm_list(struct mem_cgroup *memcg);
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
+{
+ /* exclude init_mm, efi_mm, etc. */
+ if (!core_kernel_data((unsigned long)old)) {
+ VM_BUG_ON(old == &init_mm);
+
+ nodes_setall(old->lrugen.nodes);
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ atomic_dec(&old->lrugen.nr_cpus);
+ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
+#endif
+ } else
+ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) ||
+ READ_ONCE(old->lrugen.list.next), old);
+
+ if (!core_kernel_data((unsigned long)new)) {
+ VM_BUG_ON(new == &init_mm);
+
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ atomic_inc(&new->lrugen.nr_cpus);
+ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new);
+#endif
+ } else
+ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) ||
+ READ_ONCE(new->lrugen.list.next), new);
+}
+
+/* Return whether this mm_struct is being used on any CPUs. */
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
+{
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ return !cpumask_empty(mm_cpumask(mm));
+#else
+ return atomic_read(&mm->lrugen.nr_cpus);
+#endif
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
+{
+}
+
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..b362179852f1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -423,6 +423,7 @@ void mm_update_next_owner(struct mm_struct *mm)
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..2fd7dae9afcb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm)
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
+ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm);
}
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
@@ -1061,6 +1062,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
fail_nocontext:
@@ -1103,6 +1105,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -2524,6 +2527,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fe3f2a40d61e..b81e49ed31a7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1325,6 +1325,7 @@ void kthread_use_mm(struct mm_struct *mm)
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
+ lru_gen_switch_mm(active_mm, mm);
local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5226cc26a095..2d4b77f173db 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4323,6 +4323,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_switch_mm(prev->active_mm, next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -7603,6 +7604,7 @@ void idle_task_exit(void)
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ lru_gen_switch_mm(mm, &init_mm);
finish_arch_post_lock_switch();
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ada9e650a5..58b610ffa0e0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4981,6 +4981,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
+ lru_gen_free_mm_list(memcg);
kfree(memcg);
}
@@ -5030,6 +5031,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail;
+ if (lru_gen_alloc_mm_list(memcg))
+ goto fail;
+
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
goto fail;
@@ -5991,6 +5995,29 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *task = NULL;
+
+ cgroup_taskset_for_each_leader(task, css, tset)
+ ;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && task->mm->owner == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif
+
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6332,6 +6359,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 84d25079092e..d93d2272e475 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2869,6 +2869,323 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *
sp->refaulted * max(pv->total, 1UL) * pv->gain;
}
+/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+enum {
+ MM_SCHED_ACTIVE, /* running processes */
+ MM_SCHED_INACTIVE, /* sleeping processes */
+ MM_LOCK_CONTENTION, /* lock contentions */
+ MM_VMA_INTERVAL, /* VMAs within the range of each PUD/PMD/PTE */
+ MM_LEAF_OTHER_NODE, /* entries not from the node under reclaim */
+ MM_LEAF_OTHER_MEMCG, /* entries not from the memcg under reclaim */
+ MM_LEAF_OLD, /* old entries */
+ MM_LEAF_YOUNG, /* young entries */
+ MM_LEAF_DIRTY, /* dirty entries */
+ MM_LEAF_HOLE, /* non-present entries */
+ MM_NONLEAF_OLD, /* old non-leaf PMD entries */
+ MM_NONLEAF_YOUNG, /* young non-leaf PMD entries */
+ NR_MM_STATS
+};
+
+/* mnemonic codes for the stats above */
+#define MM_STAT_CODES "aicvnmoydhlu"
+
+struct lru_gen_mm_list {
+ /* the head of a global or per-memcg mm_struct list */
+ struct list_head head;
+ /* protects the list */
+ spinlock_t lock;
+ struct {
+ /* set to max_seq after each round of walk */
+ unsigned long cur_seq;
+ /* the next mm on the list to walk */
+ struct list_head *iter;
+ /* to wait for the last worker to finish */
+ struct wait_queue_head wait;
+ /* the number of concurrent workers */
+ int nr_workers;
+ /* stats for debugging */
+ unsigned long stats[NR_STAT_GENS][NR_MM_STATS];
+ } nodes[0];
+};
+
+static struct lru_gen_mm_list *global_mm_list;
+
+static struct lru_gen_mm_list *alloc_mm_list(void)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+
+ mm_list = kzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL);
+ if (!mm_list)
+ return NULL;
+
+ INIT_LIST_HEAD(&mm_list->head);
+ spin_lock_init(&mm_list->lock);
+
+ for_each_node(nid) {
+ mm_list->nodes[nid].cur_seq = MIN_NR_GENS;
+ mm_list->nodes[nid].iter = &mm_list->head;
+ init_waitqueue_head(&mm_list->nodes[nid].wait);
+ }
+
+ return mm_list;
+}
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_MEMCG
+ if (!mem_cgroup_disabled())
+ return memcg ? memcg->mm_list : root_mem_cgroup->mm_list;
+#endif
+ VM_BUG_ON(memcg);
+
+ return global_mm_list;
+}
+
+void lru_gen_init_mm(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD(&mm->lrugen.list);
+#ifdef CONFIG_MEMCG
+ mm->lrugen.memcg = NULL;
+#endif
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ atomic_set(&mm->lrugen.nr_cpus, 0);
+#endif
+ nodes_clear(mm->lrugen.nodes);
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
+#ifdef CONFIG_MEMCG
+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
+ WRITE_ONCE(mm->lrugen.memcg, memcg);
+#endif
+ spin_lock(&mm_list->lock);
+ list_add_tail(&mm->lrugen.list, &mm_list->head);
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+#ifdef CONFIG_MEMCG
+ struct lru_gen_mm_list *mm_list = get_mm_list(mm->lrugen.memcg);
+#else
+ struct lru_gen_mm_list *mm_list = get_mm_list(NULL);
+#endif
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ if (mm_list->nodes[nid].iter != &mm->lrugen.list)
+ continue;
+
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+ if (mm_list->nodes[nid].iter == &mm_list->head)
+ WRITE_ONCE(mm_list->nodes[nid].cur_seq,
+ mm_list->nodes[nid].cur_seq + 1);
+ }
+
+ list_del_init(&mm->lrugen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lrugen.memcg);
+ WRITE_ONCE(mm->lrugen.memcg, NULL);
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ memcg->mm_list = alloc_mm_list();
+
+ return memcg->mm_list ? 0 : -ENOMEM;
+}
+
+void lru_gen_free_mm_list(struct mem_cgroup *memcg)
+{
+ kfree(memcg->mm_list);
+ memcg->mm_list = NULL;
+}
+
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ lockdep_assert_held(&mm->owner->alloc_lock);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(mm->owner);
+ rcu_read_unlock();
+ if (memcg == mm->lrugen.memcg)
+ return;
+
+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+
+static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
+{
+ return READ_ONCE(mm->lrugen.memcg) != memcg;
+}
+#else
+static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
+{
+ return false;
+}
+#endif
+
+struct mm_walk_args {
+ struct mem_cgroup *memcg;
+ unsigned long max_seq;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ unsigned long next_addr;
+ int node_id;
+ int swappiness;
+ int batch_size;
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ int mm_stats[NR_MM_STATS];
+ unsigned long bitmap[0];
+};
+
+static int size_of_mm_walk_args(void)
+{
+ int size = sizeof(struct mm_walk_args);
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) ||
+ IS_ENABLED(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG))
+ size += sizeof(unsigned long) * BITS_TO_LONGS(PTRS_PER_PMD);
+
+ return size;
+}
+
+static void reset_mm_stats(struct lru_gen_mm_list *mm_list, bool last,
+ struct mm_walk_args *args)
+{
+ int i;
+ int nid = args->node_id;
+ int hist = hist_from_seq_or_gen(args->max_seq);
+
+ lockdep_assert_held(&mm_list->lock);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i],
+ mm_list->nodes[nid].stats[hist][i] + args->mm_stats[i]);
+ args->mm_stats[i] = 0;
+ }
+
+ if (!last || NR_STAT_GENS == 1)
+ return;
+
+ hist = hist_from_seq_or_gen(args->max_seq + 1);
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], 0);
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
+{
+ int type;
+ unsigned long size = 0;
+
+ if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes))
+ return true;
+
+ if (mm_is_oom_victim(mm))
+ return true;
+
+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
+ /* leave the legwork to the rmap if mappings are too sparse */
+ if (size < max(SWAP_CLUSTER_MAX, mm_pgtables_bytes(mm) / PAGE_SIZE))
+ return true;
+
+ return !mmget_not_zero(mm);
+}
+
+/* To support multiple workers that concurrently walk an mm_struct list. */
+static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter)
+{
+ bool last = true;
+ struct mm_struct *mm = NULL;
+ int nid = args->node_id;
+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
+
+ if (*iter)
+ mmput_async(*iter);
+ else if (args->max_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq))
+ return false;
+
+ spin_lock(&mm_list->lock);
+
+ VM_BUG_ON(args->max_seq > mm_list->nodes[nid].cur_seq + 1);
+ VM_BUG_ON(*iter && args->max_seq < mm_list->nodes[nid].cur_seq);
+ VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_workers);
+
+ if (args->max_seq <= mm_list->nodes[nid].cur_seq) {
+ last = *iter;
+ goto done;
+ }
+
+ if (mm_list->nodes[nid].iter == &mm_list->head) {
+ VM_BUG_ON(*iter || mm_list->nodes[nid].nr_workers);
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+ }
+
+ while (!mm && mm_list->nodes[nid].iter != &mm_list->head) {
+ mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lrugen.list);
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+ if (should_skip_mm(mm, args))
+ mm = NULL;
+
+ args->mm_stats[mm ? MM_SCHED_ACTIVE : MM_SCHED_INACTIVE]++;
+ }
+
+ if (mm_list->nodes[nid].iter == &mm_list->head)
+ WRITE_ONCE(mm_list->nodes[nid].cur_seq,
+ mm_list->nodes[nid].cur_seq + 1);
+done:
+ if (*iter && !mm)
+ mm_list->nodes[nid].nr_workers--;
+ if (!*iter && mm)
+ mm_list->nodes[nid].nr_workers++;
+
+ last = last && !mm_list->nodes[nid].nr_workers &&
+ mm_list->nodes[nid].iter == &mm_list->head;
+
+ reset_mm_stats(mm_list, last, args);
+
+ spin_unlock(&mm_list->lock);
+
+ *iter = mm;
+ if (mm)
+ node_clear(nid, mm->lrugen.nodes);
+
+ return last;
+}
+
/******************************************************************************
* state change
******************************************************************************/
@@ -3096,6 +3413,13 @@ static int __init init_lru_gen(void)
{
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+
+ if (mem_cgroup_disabled()) {
+ global_mm_list = alloc_mm_list();
+ if (WARN_ON_ONCE(!global_mm_list))
+ return -ENOMEM;
+ }
if (hotplug_memory_notifier(lru_gen_online_mem, 0))
pr_err("lru_gen: failed to subscribe hotplug notifications\n");
--
2.31.1.751.gd2f1c929bd-goog
@@ -0,0 +1,768 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 20 May 2021 00:53:52 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-12-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 11/14] mm: multigenerational lru: eviction
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It first tries to select a type based on the values of min_seq[2].
When the anon and file types are both available from the same
generation, it selects the one with the lower refault rate.

During a scan, the eviction sorts pages according to their new
generation numbers, if the aging has found them referenced. It also
moves pages from the tiers that have higher refault rates than tier 0
to the next generation. When it finds that all the per-zone lists of
the selected type are empty, the eviction increments the element of
min_seq[2] indexed by that type.

With the aging and the eviction in place, we can build page reclaim in
a straightforward manner:
  1) In order to reduce latency, direct reclaim invokes the aging only
  when both elements of min_seq[2] reach max_seq-1; otherwise it
  invokes the eviction.
  2) In order to keep the aging out of the direct reclaim path, kswapd
  does the background aging. It invokes the aging when either element
  of min_seq[2] reaches max_seq-1; otherwise it invokes the eviction.
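
For clarity, the aging policy just described, condensed into a sketch
(should_age() is a hypothetical helper, not a function from this
patch):

static bool should_age(bool is_kswapd, unsigned long max_seq,
		       const unsigned long min_seq[ANON_AND_FILE])
{
	bool anon_old = min_seq[0] == max_seq - 1;
	bool file_old = min_seq[1] == max_seq - 1;

	/* kswapd ages early; direct reclaim ages only as a last resort */
	return is_kswapd ? (anon_old || file_old) : (anon_old && file_old);
}
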
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
include/linux/mmzone.h | 5 +
mm/vmscan.c | 540 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 545 insertions(+)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 38de59fcbe54..ded72f44d7e7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -863,6 +863,8 @@ struct deferred_split {
};
#endif
+struct mm_walk_args;
+
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
@@ -968,6 +970,9 @@ typedef struct pglist_data {
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ struct mm_walk_args *mm_walk_args;
+#endif
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 837d5e6a821e..2f86dcc04c56 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1311,6 +1311,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
+ /* in case the page was found accessed by lru_gen_scan_around() */
+ if (lru_gen_enabled() && !ignore_references &&
+ page_mapped(page) && PageReferenced(page))
+ goto keep_locked;
+
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -2431,6 +2436,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
unsigned long file;
struct lruvec *target_lruvec;
+ if (lru_gen_enabled())
+ return;
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
/*
@@ -3970,6 +3978,489 @@ void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
set_page_dirty(pte_page(pte[i]));
}
+/******************************************************************************
+ * the eviction
+ ******************************************************************************/
+
+static bool should_skip_page(struct page *page, struct scan_control *sc)
+{
+ if (!sc->may_unmap && page_mapped(page))
+ return true;
+
+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
+ return true;
+
+ if (!get_page_unless_zero(page))
+ return true;
+
+ if (!TestClearPageLRU(page)) {
+ put_page(page);
+ return true;
+ }
+
+ return false;
+}
+
+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_to_isolate)
+{
+ bool success;
+ int gen = page_lru_gen(page);
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ int tier = lru_tier_from_usage(page_tier_usage(page));
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ VM_BUG_ON_PAGE(gen == -1, page);
+ VM_BUG_ON_PAGE(tier_to_isolate < 0, page);
+
+ /* a lazy-free page that has been written into? */
+ if (type && PageDirty(page) && PageAnon(page)) {
+ success = lru_gen_deletion(page, lruvec);
+ VM_BUG_ON_PAGE(!success, page);
+ SetPageSwapBacked(page);
+ add_page_to_lru_list_tail(page, lruvec);
+ return true;
+ }
+
+ /* page_update_gen() has updated the gen #? */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
+ /* activate this page if its tier has a higher refault rate */
+ if (tier_to_isolate < tier) {
+ int hist = hist_from_seq_or_gen(gen);
+
+ page_inc_gen(page, lruvec, false);
+ WRITE_ONCE(lrugen->activated[hist][type][tier - 1],
+ lrugen->activated[hist][type][tier - 1] + thp_nr_pages(page));
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type);
+ return true;
+ }
+
+ /* mark this page for reclaim if it's pending writeback */
+ if (PageWriteback(page) || (type && PageDirty(page))) {
+ page_inc_gen(page, lruvec, true);
+ return true;
+ }
+
+ return false;
+}
+
+static void isolate_page(struct page *page, struct lruvec *lruvec)
+{
+ bool success;
+
+ success = lru_gen_deletion(page, lruvec);
+ VM_BUG_ON_PAGE(!success, page);
+
+ if (PageActive(page)) {
+ ClearPageActive(page);
+ /* make sure shrink_page_list() rejects this page */
+ SetPageReferenced(page);
+ return;
+ }
+
+ /* make sure shrink_page_list() doesn't try to write this page */
+ ClearPageReclaim(page);
+ /* make sure shrink_page_list() doesn't reject this page */
+ ClearPageReferenced(page);
+}
+
+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, long *nr_to_scan,
+ int type, int tier, struct list_head *list)
+{
+ bool success;
+ int gen, zone;
+ enum vm_event_item item;
+ int sorted = 0;
+ int scanned = 0;
+ int isolated = 0;
+ int batch_size = 0;
+ struct lrugen *lrugen = &lruvec->evictable;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ VM_BUG_ON(!list_empty(list));
+
+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+ return -ENOENT;
+
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ LIST_HEAD(moved);
+ int skipped = 0;
+ struct list_head *head = &lrugen->lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct page *page = lru_to_page(head);
+ int delta = thp_nr_pages(page);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ scanned += delta;
+
+ if (sort_page(page, lruvec, tier))
+ sorted += delta;
+ else if (should_skip_page(page, sc)) {
+ list_move(&page->lru, &moved);
+ skipped += delta;
+ } else {
+ isolate_page(page, lruvec);
+ list_add(&page->lru, list);
+ isolated += delta;
+ }
+
+ if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+ ++batch_size == MAX_BATCH_SIZE)
+ break;
+ }
+
+ list_splice(&moved, head);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+
+ if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+ batch_size == MAX_BATCH_SIZE)
+ break;
+ }
+
+ success = try_inc_min_seq(lruvec, type);
+
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (!cgroup_reclaim(sc)) {
+ __count_vm_events(item, scanned);
+ __count_vm_events(PGREFILL, sorted);
+ }
+ __count_memcg_events(memcg, item, scanned);
+ __count_memcg_events(memcg, PGREFILL, sorted);
+ __count_vm_events(PGSCAN_ANON + type, scanned);
+
+ *nr_to_scan -= scanned;
+
+ if (*nr_to_scan <= 0 || success || isolated)
+ return isolated;
+ /*
+ * We may have trouble finding eligible pages due to reclaim_idx,
+ * may_unmap and may_writepage. The following check makes sure we won't
+ * be stuck if we aren't making enough progress.
+ */
+ return batch_size == MAX_BATCH_SIZE && sorted >= SWAP_CLUSTER_MAX ? 0 : -ENOENT;
+}
+
+static int get_tier_to_isolate(struct lruvec *lruvec, int type)
+{
+ int tier;
+ struct controller_pos sp, pv;
+
+ /*
+ * Ideally we don't want to evict upper tiers that have higher refault
+ * rates. However, we need to leave a margin for the fluctuations in
+ * refault rates. So we use a larger gain factor to make sure upper
+ * tiers are indeed more active. We choose 2 because the lowest upper
+ * tier would have twice of the refault rate of the base tier, according
+ * to their numbers of accesses.
+ */
+ read_controller_pos(&sp, lruvec, type, 0, 1);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_controller_pos(&pv, lruvec, type, tier, 2);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_to_isolate)
+{
+ int type, tier;
+ struct controller_pos sp, pv;
+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+ /*
+ * Compare the refault rates between the base tiers of anon and file to
+ * determine which type to evict. Also need to compare the refault rates
+ * of the upper tiers of the selected type with that of the base tier of
+ * the other type to determine which tier of the selected type to evict.
+ */
+ read_controller_pos(&sp, lruvec, 0, 0, gain[0]);
+ read_controller_pos(&pv, lruvec, 1, 0, gain[1]);
+ type = positive_ctrl_err(&sp, &pv);
+
+ read_controller_pos(&sp, lruvec, !type, 0, gain[!type]);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_controller_pos(&pv, lruvec, type, tier, gain[type]);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ *tier_to_isolate = tier - 1;
+
+ return type;
+}
+
+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ long *nr_to_scan, int *type_to_scan, struct list_head *list)
+{
+ int i;
+ int type;
+ int isolated;
+ int tier = -1;
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+
+ if (max_nr_gens(max_seq, min_seq, swappiness) == MIN_NR_GENS)
+ return 0;
+ /*
+ * Try to select a type based on generations and swappiness, and if that
+ * fails, fall back to get_type_to_scan(). When anon and file are both
+ * available from the same generation, swappiness 200 is interpreted as
+ * anon first and swappiness 1 is interpreted as file first.
+ */
+ type = !swappiness || min_seq[0] > min_seq[1] ||
+ (min_seq[0] == min_seq[1] && swappiness != 200 &&
+ (swappiness == 1 || get_type_to_scan(lruvec, swappiness, &tier)));
+
+ if (tier == -1)
+ tier = get_tier_to_isolate(lruvec, type);
+
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
+ isolated = scan_pages(lruvec, sc, nr_to_scan, type, tier, list);
+ if (isolated >= 0)
+ break;
+
+ type = !type;
+ tier = get_tier_to_isolate(lruvec, type);
+ }
+
+ if (isolated < 0)
+ isolated = *nr_to_scan = 0;
+
+ *type_to_scan = type;
+
+ return isolated;
+}
+
+/* Main function used by the foreground, the background and the user-triggered eviction. */
+static bool evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ long *nr_to_scan)
+{
+ int type;
+ int isolated;
+ int reclaimed;
+ LIST_HEAD(list);
+ struct page *page;
+ enum vm_event_item item;
+ struct reclaim_stat stat;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ isolated = isolate_pages(lruvec, sc, swappiness, nr_to_scan, &type, &list);
+ VM_BUG_ON(list_empty(&list) == !!isolated);
+
+ if (isolated)
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, isolated);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ if (!isolated)
+ goto done;
+
+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
+ /*
+ * We need to prevent rejected pages from being added back to the same
+ * lists they were isolated from. Otherwise we may risk looping on them
+ * forever. We use PageActive() or !PageReferenced() && PageWorkingset()
+ * to tell lru_gen_addition() not to add them to the oldest generation.
+ */
+ list_for_each_entry(page, &list, lru) {
+ if (PageMlocked(page))
+ continue;
+
+ if (page_mapped(page) && PageReferenced(page))
+ SetPageActive(page);
+ else {
+ ClearPageActive(page);
+ SetPageWorkingset(page);
+ }
+ ClearPageReferenced(page);
+ }
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ move_pages_to_lru(lruvec, &list);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, -isolated);
+
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, reclaimed);
+ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_uncharge_list(&list);
+ free_unref_page_list(&list);
+
+ sc->nr_reclaimed += reclaimed;
+done:
+ return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim;
+}
+
+/******************************************************************************
+ * page reclaim
+ ******************************************************************************/
+
+static int get_swappiness(struct lruvec *lruvec)
+{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ int swappiness = mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
+ mem_cgroup_swappiness(memcg) : 0;
+
+ VM_BUG_ON(swappiness > 200U);
+
+ return swappiness;
+}
+
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+ int swappiness)
+{
+ int gen, type, zone;
+ long nr_to_scan = 0;
+ struct lrugen *lrugen = &lruvec->evictable;
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ lru_add_drain();
+
+ for (type = !swappiness; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone <= sc->reclaim_idx; zone++)
+ nr_to_scan += READ_ONCE(lrugen->sizes[gen][type][zone]);
+ }
+ }
+
+ nr_to_scan = max(nr_to_scan, 0L);
+ nr_to_scan = round_up(nr_to_scan >> sc->priority, SWAP_CLUSTER_MAX);
+
+ if (max_nr_gens(max_seq, min_seq, swappiness) > MIN_NR_GENS)
+ return nr_to_scan;
+
+ /* kswapd uses lru_gen_age_node() */
+ if (current_is_kswapd())
+ return 0;
+
+ return walk_mm_list(lruvec, max_seq, sc, swappiness, NULL) ? nr_to_scan : 0;
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long scanned = 0;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ blk_start_plug(&plug);
+
+ while (true) {
+ long nr_to_scan;
+ int swappiness = sc->may_swap ? get_swappiness(lruvec) : 0;
+
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness) - scanned;
+ if (nr_to_scan < (long)SWAP_CLUSTER_MAX)
+ break;
+
+ scanned += nr_to_scan;
+
+ if (!evict_pages(lruvec, sc, swappiness, &nr_to_scan))
+ break;
+
+ scanned -= nr_to_scan;
+
+ if (mem_cgroup_below_min(memcg) ||
+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ break;
+
+ cond_resched();
+ }
+
+ blk_finish_plug(&plug);
+}
+
+/******************************************************************************
+ * the background aging
+ ******************************************************************************/
+
+static int lru_gen_spread = MIN_NR_GENS;
+
+static void try_walk_mm_list(struct lruvec *lruvec, struct scan_control *sc)
+{
+ int gen, type, zone;
+ long old_and_young[2] = {};
+ int spread = READ_ONCE(lru_gen_spread);
+ int swappiness = get_swappiness(lruvec);
+ struct lrugen *lrugen = &lruvec->evictable;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ lru_add_drain();
+
+ for (type = !swappiness; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ old_and_young[seq == max_seq] +=
+ READ_ONCE(lrugen->sizes[gen][type][zone]);
+ }
+ }
+
+ old_and_young[0] = max(old_and_young[0], 0L);
+ old_and_young[1] = max(old_and_young[1], 0L);
+
+ /* try to spread pages out across spread+1 generations */
+ if (old_and_young[0] >= old_and_young[1] * spread &&
+ min_nr_gens(max_seq, min_seq, swappiness) > max(spread, MIN_NR_GENS))
+ return;
+
+ walk_mm_list(lruvec, max_seq, sc, swappiness, pgdat->mm_walk_args);
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON(!current_is_kswapd());
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ if (!mem_cgroup_below_min(memcg) &&
+ (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
+ try_walk_mm_list(lruvec, sc);
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
/******************************************************************************
* state change
******************************************************************************/
@@ -4172,6 +4663,21 @@ static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *se
return NOTIFY_DONE;
}
+static void lru_gen_start_kswapd(int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+ pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid);
+ WARN_ON_ONCE(!pgdat->mm_walk_args);
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+ kvfree(pgdat->mm_walk_args);
+}
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -4220,6 +4726,24 @@ static int __init init_lru_gen(void)
*/
arch_initcall(init_lru_gen);
+#else /* CONFIG_LRU_GEN */
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_start_kswapd(int nid)
+{
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -4233,6 +4757,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
struct blk_plug plug;
bool scan_adjusted;
+ if (lru_gen_enabled()) {
+ lru_gen_shrink_lruvec(lruvec, sc);
+ return;
+ }
+
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
@@ -4699,6 +5228,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
struct lruvec *target_lruvec;
unsigned long refaults;
+ if (lru_gen_enabled())
+ return;
+
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
target_lruvec->refaults[0] = refaults;
@@ -5073,6 +5605,11 @@ static void age_active_anon(struct pglist_data *pgdat,
struct mem_cgroup *memcg;
struct lruvec *lruvec;
+ if (lru_gen_enabled()) {
+ lru_gen_age_node(pgdat, sc);
+ return;
+ }
+
if (!total_swap_pages)
return;
@@ -5753,6 +6290,8 @@ int kswapd_run(int nid)
if (pgdat->kswapd)
return 0;
+ lru_gen_start_kswapd(nid);
+
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
@@ -5775,6 +6314,7 @@ void kswapd_stop(int nid)
if (kswapd) {
kthread_stop(kswapd);
NODE_DATA(nid)->kswapd = NULL;
+ lru_gen_stop_kswapd(nid);
}
}
--
2.31.1.751.gd2f1c929bd-goog
@@ -0,0 +1,572 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 20 May 2021 00:53:53 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-13-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 12/14] mm: multigenerational lru: user interface
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Add a sysfs file /sys/kernel/mm/lru_gen/enabled to enable and disable
the multigenerational lru at runtime.

Add a sysfs file /sys/kernel/mm/lru_gen/spread to optionally spread
pages out across more than three generations. More generations make
the background aging more aggressive.

Add a debugfs file /sys/kernel/debug/lru_gen to monitor the
multigenerational lru and trigger the aging and the eviction. This
file has the following output:

  memcg memcg_id memcg_path
    node node_id
      min_gen birth_time anon_size file_size
      ...
      max_gen birth_time anon_size file_size

Given a memcg and a node, "min_gen" is the oldest generation number
and "max_gen" is the youngest. Birth time is in milliseconds. The
sizes of the anon and file types are in pages.

This file takes the following input:

  + memcg_id node_id gen [swappiness]
  - memcg_id node_id gen [swappiness] [nr_to_reclaim]

The first command line accounts referenced pages to generation
"max_gen" and creates the next generation "max_gen"+1. In this case,
"gen" should be equal to "max_gen". A swap file and a non-zero
"swappiness" are required to scan the anon type. If swapping is not
desired, set vm.swappiness to 0. The second command line evicts
generations less than or equal to "gen". In this case, "gen" should be
less than "max_gen"-1, as "max_gen" and "max_gen"-1 are active
generations and therefore protected from the eviction. Use
"nr_to_reclaim" to limit the number of pages to evict. Multiple
command lines are supported, as is concatenation with the delimiters
"," and ";".
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
mm/vmscan.c | 403 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 403 insertions(+)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2f86dcc04c56..ff2deec24c64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,8 @@
#include <linux/memory.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4678,6 +4680,401 @@ static void lru_gen_stop_kswapd(int nid)
kvfree(pgdat->mm_walk_args);
}
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_lru_gen_spread(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", READ_ONCE(lru_gen_spread));
+}
+
+static ssize_t store_lru_gen_spread(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int spread;
+
+ if (kstrtoint(buf, 10, &spread) || spread >= MAX_NR_GENS)
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_spread, spread);
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_spread_attr = __ATTR(
+ spread, 0644, show_lru_gen_spread, store_lru_gen_spread
+);
+
+static ssize_t show_lru_gen_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
+}
+
+static ssize_t store_lru_gen_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int enable;
+
+ if (kstrtoint(buf, 10, &enable))
+ return -EINVAL;
+
+ lru_gen_set_state(enable, true, false);
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_lru_gen_enabled, store_lru_gen_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_spread_attr.attr,
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = hist_from_seq_or_gen(seq);
+ struct lrugen *lrugen = &lruvec->evictable;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+
+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
+ } else if (seq == min_seq[type] || NR_STAT_GENS > 1) {
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->activated[hist][type][tier - 1]);
+
+ seq_printf(m, " %10lur %10lue %10lua", n[0], n[1], n[2]);
+ } else
+ seq_puts(m, " 0 0 0 ");
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ if (seq == max_seq && NR_STAT_GENS == 1)
+ seq_printf(m, " %10lu%c", READ_ONCE(mm_list->nodes[nid].stats[hist][i]),
+ toupper(MM_STAT_CODES[i]));
+ else if (seq != max_seq && NR_STAT_GENS > 1)
+ seq_printf(m, " %10lu%c", READ_ONCE(mm_list->nodes[nid].stats[hist][i]),
+ MM_STAT_CODES[i]);
+ else
+ seq_puts(m, " 0 ");
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lrugen *lrugen = &lruvec->evictable;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ();
+ DEFINE_MIN_SEQ();
+
+ if (nid == first_memory_node) {
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), (char *)m->private);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ seq = full ? (max_seq < MAX_NR_GENS ? 0 : max_seq - MAX_NR_GENS + 1) :
+ min(min_seq[0], min_seq[1]);
+
+ for (; seq <= max_seq; seq++) {
+ int gen, type, zone;
+ unsigned int msecs;
+
+ gen = lru_gen_from_seq(seq);
+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
+
+ seq_printf(m, " %10lu %10u", seq, msecs);
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ long size = 0;
+
+ if (seq < min_seq[type]) {
+ seq_puts(m, " -0 ");
+ continue;
+ }
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
+
+ seq_printf(m, " %10lu ", max(size, 0L));
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int advance_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
+{
+ struct scan_control sc = {
+ .target_mem_cgroup = lruvec_memcg(lruvec),
+ };
+ DEFINE_MAX_SEQ();
+
+ if (seq == max_seq)
+ walk_mm_list(lruvec, max_seq, &sc, swappiness, NULL);
+
+ return seq > max_seq ? -EINVAL : 0;
+}
+
+static int advance_min_seq(struct lruvec *lruvec, unsigned long seq, int swappiness,
+ unsigned long nr_to_reclaim)
+{
+ struct blk_plug plug;
+ int err = -EINTR;
+ long nr_to_scan = LONG_MAX;
+ struct scan_control sc = {
+ .nr_to_reclaim = nr_to_reclaim,
+ .target_mem_cgroup = lruvec_memcg(lruvec),
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+ DEFINE_MAX_SEQ();
+
+ if (seq >= max_seq - 1)
+ return -EINVAL;
+
+ blk_start_plug(&plug);
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ();
+
+ if (seq < min(min_seq[!swappiness], min_seq[swappiness < 200]) ||
+ !evict_pages(lruvec, &sc, swappiness, &nr_to_scan)) {
+ err = 0;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ blk_finish_plug(&plug);
+
+ return err;
+}
+
+static int advance_seq(char cmd, int memcg_id, int nid, unsigned long seq,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ goto done;
+ }
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ goto done;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+ if (swappiness == -1)
+ swappiness = get_swappiness(lruvec);
+ else if (swappiness > 200U)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = advance_max_seq(lruvec, seq, swappiness);
+ break;
+ case '-':
+ err = advance_min_seq(lruvec, seq, swappiness, nr_to_reclaim);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
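+/*
+ * Each command has the form "<+|-> memcg_id nid seq [swappiness] [nr_to_reclaim]";
+ * commands are separated by ',', ';' or newlines. '+' advances max_seq
+ * (aging); '-' advances min_seq (eviction).
+ */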
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ int err = 0;
+
+ buf = kvmalloc(len + 1, GFP_USER);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long nr_to_reclaim = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &nr_to_reclaim, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = advance_seq(cmd, memcg_id, nid, seq, swappiness, nr_to_reclaim);
+ if (err)
+ break;
+ }
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -4718,6 +5115,12 @@ static int __init init_lru_gen(void)
if (hotplug_memory_notifier(lru_gen_online_mem, 0))
pr_err("lru_gen: failed to subscribe hotplug notifications\n");
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
}
/*
--
2.31.1.751.gd2f1c929bd-goog

View File

@ -0,0 +1,177 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 20 May 2021 00:53:54 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-14-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 13/14] mm: multigenerational lru: Kconfig
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Add configuration options for the multigenerational lru.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
mm/Kconfig | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/mm/Kconfig b/mm/Kconfig
index 02d44e3420f5..da125f145bc4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -901,4 +901,62 @@ config KMAP_LOCAL
# struct io_mapping based helper. Selected by drivers that need them
config IO_MAPPING
bool
+
+# the multigenerational lru {
+config LRU_GEN
+ bool "Multigenerational LRU"
+ depends on MMU
+ help
	  A high performance LRU implementation for heavily overcommitted workloads
+ that are not IO bound. See Documentation/vm/multigen_lru.rst for
+ details.
+
+ Warning: do not enable this option unless you plan to use it because
	  it introduces a small per-process, per-memcg, and per-node memory
	  overhead.
+
+config LRU_GEN_ENABLED
+ bool "Turn on by default"
+ depends on LRU_GEN
+ help
+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
+ changes it to 1.
+
+ Warning: the default value is the fast path. See
+ Documentation/static-keys.txt for details.
+
+config LRU_GEN_STATS
+ bool "Full stats for debugging"
+ depends on LRU_GEN
+ help
+ This option keeps full stats for each generation, which can be read
+ from /sys/kernel/debug/lru_gen_full.
+
+ Warning: do not enable this option unless you plan to use it because
	  it introduces an additional small per-process, per-memcg, and
	  per-node memory overhead.
+
+config NR_LRU_GENS
+ int "Max number of generations"
+ depends on LRU_GEN
+ range 4 31
+ default 7
+ help
+ This will use order_base_2(N+1) spare bits from page flags.
+
+ Warning: do not use numbers larger than necessary because each
+ generation introduces a small per-node and per-memcg memory overhead.
+
+config TIERS_PER_GEN
+ int "Number of tiers per generation"
+ depends on LRU_GEN
+ range 2 5
+ default 4
+ help
+ This will use N-2 spare bits from page flags.
+
+ Larger values generally offer better protection to active pages under
+ heavy buffered I/O workloads.
+# }
+
endmenu
--
2.31.1.751.gd2f1c929bd-goog

View File

@ -0,0 +1,273 @@
From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 20 May 2021 00:53:55 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-15-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 14/14] mm: multigenerational lru: documentation
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Dave Chinner <david@fromorbit.com>,
Dave Hansen <dave.hansen@linux.intel.com>,
Donald Carr <sirspudd@gmail.com>,
Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
Jonathan Corbet <corbet@lwn.net>,
Joonsoo Kim <iamjoonsoo.kim@lge.com>,
Konstantin Kharlamov <hi-angel@yandex.ru>,
Marcus Seyfarth <m.seyfarth@gmail.com>,
Matthew Wilcox <willy@infradead.org>,
Mel Gorman <mgorman@suse.de>,
Miaohe Lin <linmiaohe@huawei.com>,
Michael Larabel <michael@michaellarabel.com>,
Michal Hocko <mhocko@suse.com>,
Michel Lespinasse <michel@lespinasse.org>,
Rik van Riel <riel@surriel.com>,
Roman Gushchin <guro@fb.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Yang Shi <shy828301@gmail.com>,
Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
linux-kernel@vger.kernel.org, lkp@lists.01.org,
page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
Konstantin Kharlamov <Hi-Angel@yandex.ru>
Add Documentation/vm/multigen_lru.rst.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
Documentation/vm/index.rst | 1 +
Documentation/vm/multigen_lru.rst | 143 ++++++++++++++++++++++++++++++
2 files changed, 144 insertions(+)
create mode 100644 Documentation/vm/multigen_lru.rst
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
index eff5fbd492d0..c353b3f55924 100644
--- a/Documentation/vm/index.rst
+++ b/Documentation/vm/index.rst
@@ -17,6 +17,7 @@ various features of the Linux memory management
swap_numa
zswap
+ multigen_lru
Kernel developers MM documentation
==================================
diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst
new file mode 100644
index 000000000000..a18416ed7e92
--- /dev/null
+++ b/Documentation/vm/multigen_lru.rst
@@ -0,0 +1,143 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Multigenerational LRU
+=====================
+
+Quick Start
+===========
+Build Options
+-------------
+:Required: Set ``CONFIG_LRU_GEN=y``.
+
+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
+ default.
+
+:Optional: Change ``CONFIG_NR_LRU_GENS`` to a number ``X`` to support
+ a maximum of ``X`` generations.
+
+:Optional: Change ``CONFIG_TIERS_PER_GEN`` to a number ``Y`` to
+ support a maximum of ``Y`` tiers per generation.
+
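+For example, a kernel configuration fragment that enables the feature
+by default (a sketch; the numeric values are the Kconfig defaults):
+
+::
+
+    CONFIG_LRU_GEN=y
+    CONFIG_LRU_GEN_ENABLED=y
+    CONFIG_NR_LRU_GENS=7
+    CONFIG_TIERS_PER_GEN=4
+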
+Runtime Options
+---------------
+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enabled`` if the
+ feature was not turned on by default.
+
+:Optional: Change ``/sys/kernel/mm/lru_gen/spread`` to a number ``N``
+ to spread pages out across ``N+1`` generations. ``N`` should be less
+ than ``X``. Larger values make the background aging more aggressive.
+
+:Optional: Read ``/sys/kernel/debug/lru_gen`` to verify the feature.
+ This file has the following output:
+
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen birth_time anon_size file_size
+ ...
+ max_gen birth_time anon_size file_size
+
+Given a memcg and a node, ``min_gen`` is the oldest generation
+(number) and ``max_gen`` is the youngest. Birth time is in
+milliseconds. The sizes of anon and file types are in pages.
+
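+For instance, to turn the feature on at runtime and inspect the state
+(a sketch; requires root and a mounted debugfs):
+
+::
+
+    echo 1 > /sys/kernel/mm/lru_gen/enabled
+    cat /sys/kernel/debug/lru_gen
+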
+Recipes
+-------
+:Android on ARMv8.1+: ``X=4``, ``Y=3`` and ``N=0``.
+
+:Android on pre-ARMv8.1 CPUs: Not recommended due to the lack of
+ ``ARM64_HW_AFDBM``.
+
+:Laptops and workstations running Chrome on x86_64: Use the default
+ values.
+
+:Working set estimation: Write ``+ memcg_id node_id gen [swappiness]``
+ to ``/sys/kernel/debug/lru_gen`` to account referenced pages to
+ generation ``max_gen`` and create the next generation ``max_gen+1``.
+ ``gen`` should be equal to ``max_gen``. A swap file and a non-zero
+ ``swappiness`` are required to scan the anon type. If swapping is not
+ desired, set ``vm.swappiness`` to ``0``.
+
+:Proactive reclaim: Write ``- memcg_id node_id gen [swappiness]
+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to evict
+ generations less than or equal to ``gen``. ``gen`` should be less
+ than ``max_gen-1`` as ``max_gen`` and ``max_gen-1`` are active
+ generations and therefore protected from eviction. Use
+ ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
+ command lines are supported, as is concatenation with the delimiters
+ ``,`` and ``;``. A combined sketch follows.
+
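+A combined sketch of working set estimation followed by proactive
+reclaim (memcg 0, node 0, and the sequence numbers are illustrative;
+real IDs come from the ``memcg`` lines in ``/sys/kernel/debug/lru_gen``):
+
+::
+
+    # aging: account referenced pages and create generation max_gen+1
+    echo '+ 0 0 7 200' > /sys/kernel/debug/lru_gen
+    # eviction: evict generations <= 5, at most 4096 pages
+    echo '- 0 0 5 200 4096' > /sys/kernel/debug/lru_gen
+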
+Framework
+=========
+For each ``lruvec``, evictable pages are divided into multiple
+generations. The youngest generation number is stored in ``max_seq``
+for both anon and file types as they are aged on an equal footing. The
+oldest generation numbers are stored in ``min_seq[2]`` separately for
+anon and file types as clean file pages can be evicted regardless of
+swap and write-back constraints. These three variables are
+monotonically increasing. Generation numbers are truncated into
+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
+``page->flags``. The sliding window technique is used to prevent
+truncated generation numbers from overlapping. Each truncated
+generation number is an index to an array of per-type and per-zone
+lists. Evictable pages are added to the per-zone lists indexed by
+``max_seq`` or ``min_seq[2]`` (modulo ``CONFIG_NR_LRU_GENS``),
+depending on their types. For example, with the default
+``CONFIG_NR_LRU_GENS=7``, generation numbers occupy
+``order_base_2(8) = 3`` bits and sequence number 9 truncates to
+``9 % 7 = 2``.
+
+Each generation is then divided into multiple tiers. Tiers represent
+levels of usage from file descriptors only. Pages accessed ``N`` times
+via file descriptors belong to tier ``order_base_2(N)``: one access
+maps to tier 0, two to tier 1, three or four to tier 2, and so on.
+Each generation contains at most ``CONFIG_TIERS_PER_GEN`` tiers, which
+require an additional ``CONFIG_TIERS_PER_GEN-2`` bits in
+``page->flags``. In contrast to
+moving across generations which requires the lru lock for the list
+operations, moving across tiers only involves an atomic operation on
+``page->flags`` and therefore has a negligible cost. A feedback loop
+modeled after the PID controller monitors the refault rates across all
+tiers and decides when to activate pages from which tiers in the
+reclaim path.
+
+The framework comprises two conceptually independent components: the
+aging and the eviction, which can be invoked separately from user
+space for the purpose of working set estimation and proactive reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, the aging
+scans page tables for referenced pages of this ``lruvec``. Upon
+finding one, the aging updates its generation number to ``max_seq``.
+After each round of scan, the aging increments ``max_seq``.
+
+The aging maintains either a system-wide ``mm_struct`` list or
+per-memcg ``mm_struct`` lists, and it only scans page tables of
+processes that have been scheduled since the last scan.
+
+The aging is due when both elements of ``min_seq[2]`` reach
+``max_seq-1``, assuming both anon and file types are reclaimable.
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, the
+eviction scans the pages on the per-zone lists indexed by either of
+``min_seq[2]``. It first tries to select a type based on the values of
+``min_seq[2]``. When anon and file types are both available from the
+same generation, it selects the one that has a lower refault rate.
+
+During a scan, the eviction sorts pages according to their new
+generation numbers, if the aging has found them referenced. It also
+moves pages from the tiers that have higher refault rates than tier 0
+to the next generation.
+
+When it finds that all the per-zone lists of the selected type are
+empty, the eviction increments ``min_seq[2]`` indexed by this type.
+
+To-do List
+==========
+KVM Optimization
+----------------
+Support shadow page table scanning.
+
+NUMA Optimization
+-----------------
+Optimize page table scan for NUMA.
--
2.31.1.751.gd2f1c929bd-goog

File diff suppressed because it is too large

View File

@ -0,0 +1,9 @@
post_upgrade() {
if findmnt --fstab -uno SOURCE /boot &>/dev/null && ! mountpoint -q /boot; then
echo "WARNING: /boot appears to be a separate partition but is not mounted."
fi
}
post_remove() {
rm -f boot/initramfs-linux.img
}

View File

@ -0,0 +1,11 @@
# mkinitcpio preset file for the '%PKGBASE%' package
ALL_config="/etc/mkinitcpio.conf"
ALL_kver="%KERNVER%"
PRESETS=('default')
#default_config="/etc/mkinitcpio.conf"
default_image="/boot/initramfs-linux.img"
#default_options=""

View File

@ -0,0 +1,409 @@
From f062022f2a2781d6b8ca63c460b0e72ebac30870 Mon Sep 17 00:00:00 2001
From: Martijn Braam <martijn@brixit.nl>
Date: Mon, 28 Sep 2020 14:26:11 +0200
Subject: [PATCH] media: ov5640: Implement autofocus
The autofocus functionality needs a firmware blob loaded into the
internal microcontroller.
V4L2 doesn't have an API to control all autofocus functionality, but
this at least makes it possible to focus on the center of the sensor.
Signed-off-by: Martijn Braam <martijn@brixit.nl>
---
drivers/media/i2c/ov5640.c | 254 +++++++++++++++++++++++++++++++++++++
1 file changed, 254 insertions(+)
diff --git a/drivers/media/i2c/ov5640.c b/drivers/media/i2c/ov5640.c
index df0a507c211f..08a5304c0e95 100644
--- a/drivers/media/i2c/ov5640.c
+++ b/drivers/media/i2c/ov5640.c
@@ -9,6 +9,7 @@
#include <linux/clkdev.h>
#include <linux/ctype.h>
#include <linux/delay.h>
+#include <linux/firmware.h>
#include <linux/device.h>
#include <linux/gpio/consumer.h>
#include <linux/i2c.h>
@@ -31,7 +32,11 @@
#define OV5640_DEFAULT_SLAVE_ID 0x3c
+#define OV5640_REG_SYS_RESET00 0x3000
+#define OV5640_REG_SYS_RESET01 0x3001
#define OV5640_REG_SYS_RESET02 0x3002
+#define OV5640_REG_SYS_CLOCK_ENABLE00 0x3004
+#define OV5640_REG_SYS_CLOCK_ENABLE01 0x3005
#define OV5640_REG_SYS_CLOCK_ENABLE02 0x3006
#define OV5640_REG_SYS_CTRL0 0x3008
#define OV5640_REG_SYS_CTRL0_SW_PWDN 0x42
@@ -41,6 +46,14 @@
#define OV5640_REG_PAD_OUTPUT_ENABLE01 0x3017
#define OV5640_REG_PAD_OUTPUT_ENABLE02 0x3018
#define OV5640_REG_PAD_OUTPUT00 0x3019
+#define OV5640_REG_FW_CMD_MAIN 0x3022
+#define OV5640_REG_FW_CMD_ACK 0x3023
+#define OV5640_REG_FW_CMD_PARA0 0x3024
+#define OV5640_REG_FW_CMD_PARA1 0x3025
+#define OV5640_REG_FW_CMD_PARA2 0x3026
+#define OV5640_REG_FW_CMD_PARA3 0x3027
+#define OV5640_REG_FW_CMD_PARA4 0x3028
+#define OV5640_REG_FW_STATUS 0x3029
#define OV5640_REG_SYSTEM_CONTROL1 0x302e
#define OV5640_REG_SC_PLL_CTRL0 0x3034
#define OV5640_REG_SC_PLL_CTRL1 0x3035
@@ -59,6 +72,7 @@
#define OV5640_REG_AEC_PK_MANUAL 0x3503
#define OV5640_REG_AEC_PK_REAL_GAIN 0x350a
#define OV5640_REG_AEC_PK_VTS 0x350c
+#define OV5640_REG_VCM_CONTROL4 0x3606
#define OV5640_REG_TIMING_DVPHO 0x3808
#define OV5640_REG_TIMING_DVPVO 0x380a
#define OV5640_REG_TIMING_HTS 0x380c
@@ -95,6 +109,20 @@
#define OV5640_REG_SDE_CTRL4 0x5584
#define OV5640_REG_SDE_CTRL5 0x5585
#define OV5640_REG_AVG_READOUT 0x56a1
+#define OV5640_REG_FIRMWARE_BASE 0x8000
+
+#define OV5640_FW_STATUS_S_FIRMWARE 0x7f
+#define OV5640_FW_STATUS_S_STARTUP 0x7e
+#define OV5640_FW_STATUS_S_IDLE 0x70
+#define OV5640_FW_STATUS_S_FOCUSING 0x00
+#define OV5640_FW_STATUS_S_FOCUSED 0x10
+
+#define OV5640_FW_CMD_TRIGGER_FOCUS 0x03
+#define OV5640_FW_CMD_CONTINUOUS_FOCUS 0x04
+#define OV5640_FW_CMD_GET_FOCUS_RESULT 0x07
+#define OV5640_FW_CMD_RELEASE_FOCUS 0x08
+#define OV5640_FW_CMD_ZONE_CONFIG 0x12
+#define OV5640_FW_CMD_DEFAULT_ZONES 0x80
enum ov5640_mode_id {
OV5640_MODE_QCIF_176_144 = 0,
@@ -218,6 +246,12 @@ struct ov5640_ctrls {
struct v4l2_ctrl *auto_gain;
struct v4l2_ctrl *gain;
};
+ struct {
+ struct v4l2_ctrl *focus_auto;
+ struct v4l2_ctrl *af_start;
+ struct v4l2_ctrl *af_stop;
+ struct v4l2_ctrl *af_status;
+ };
struct v4l2_ctrl *brightness;
struct v4l2_ctrl *light_freq;
struct v4l2_ctrl *saturation;
@@ -261,6 +295,8 @@ struct ov5640_dev {
bool pending_mode_change;
bool streaming;
+
+ bool af_initialized;
};
static inline struct ov5640_dev *to_ov5640_dev(struct v4l2_subdev *sd)
@@ -1967,6 +2003,118 @@ static void ov5640_reset(struct ov5640_dev *sensor)
usleep_range(20000, 25000);
}
+static int ov5640_copy_fw_to_device(struct ov5640_dev *sensor,
+ const struct firmware *fw)
+{
+ struct i2c_client *client = sensor->i2c_client;
+ const u8 *data = (const u8 *)fw->data;
+ u8 fw_status;
+ int i;
+ int ret;
+
+ // Putting MCU in reset state
+ ret = ov5640_write_reg(sensor, OV5640_REG_SYS_RESET00, 0x20);
+ if (ret)
+ return ret;
+
+ // Write firmware
+ for (i = 0; i < fw->size / sizeof(u8); i++)
+ ov5640_write_reg(sensor,
+ OV5640_REG_FIRMWARE_BASE + i,
+ data[i]);
+
+ // Reset MCU state
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_MAIN, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_ACK, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_PARA0, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_PARA1, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_PARA2, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_PARA3, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_CMD_PARA4, 0x00);
+ ov5640_write_reg(sensor, OV5640_REG_FW_STATUS, 0x7f);
+
+ // Start AF MCU
+ ret = ov5640_write_reg(sensor, OV5640_REG_SYS_RESET00, 0x00);
+ if (ret)
+ return ret;
+
+ dev_info(&client->dev, "firmware upload success\n");
+
+ // Wait for firmware to be ready
+ for (i = 0; i < 5; i++) {
+ ret = ov5640_read_reg(sensor, OV5640_REG_FW_STATUS, &fw_status);
+ if (fw_status == OV5640_FW_STATUS_S_IDLE) {
+ dev_info(&client->dev, "fw started after %d ms\n", i * 50);
+ return ret;
+ }
+ msleep(50);
+ }
+ dev_err(&client->dev, "uploaded firmware didn't start, got to 0x%x, retrying...\n", fw_status);
+
+ // Putting MCU in reset state
+ ret = ov5640_write_reg(sensor, OV5640_REG_SYS_RESET00, 0x20);
+ if (ret)
+ return ret;
+ // Start AF MCU
+ ret = ov5640_write_reg(sensor, OV5640_REG_SYS_RESET00, 0x00);
+ if (ret)
+ return ret;
+ // Wait for firmware to be ready
+ for (i = 0; i < 5; i++) {
+ ret = ov5640_read_reg(sensor, OV5640_REG_FW_STATUS, &fw_status);
+ if (fw_status == OV5640_FW_STATUS_S_IDLE) {
+ dev_info(&client->dev, "fw started after %d ms\n", i * 50);
+ return ret;
+ }
+ msleep(50);
+ }
+ dev_err(&client->dev, "uploaded firmware didn't start, got to 0x%x\n", fw_status);
+ return -ETIMEDOUT;
+}
+
+static int ov5640_af_init(struct ov5640_dev *sensor)
+{
+ struct i2c_client *client = sensor->i2c_client;
+ const char *fwname = "ov5640_af.bin";
+ const struct firmware *fw;
+ int ret;
+
+ if (sensor->af_initialized)
+ return 0;
+
+ if (firmware_request_nowarn(&fw, fwname, &client->dev) == 0) {
+ ret = ov5640_copy_fw_to_device(sensor, fw);
+ if (ret == 0)
+ sensor->af_initialized = 1;
+ } else {
+ dev_warn(&client->dev, "%s: no autofocus firmware available (%s)\n",
+ __func__, fwname);
+ ret = -1;
+ }
+ release_firmware(fw);
+
+ if (ret)
+ return ret;
+
+ // Enable AF systems
+ ret = ov5640_mod_reg(sensor, OV5640_REG_SYS_CLOCK_ENABLE00,
+ (BIT(6) | BIT(5)), (BIT(6) | BIT(5)));
+ if (ret)
+ return ret;
+ ret = ov5640_mod_reg(sensor, OV5640_REG_SYS_CLOCK_ENABLE01,
+ BIT(6), BIT(6));
+ if (ret)
+ return ret;
+
+ // Set lens focus driver on
+ ret = ov5640_write_reg(sensor, OV5640_REG_VCM_CONTROL4, 0x3f);
+ if (ret)
+ return ret;
+
+ return ret;
+}
+
static int ov5640_set_power_on(struct ov5640_dev *sensor)
{
struct i2c_client *client = sensor->i2c_client;
@@ -1988,6 +2117,8 @@ static int ov5640_set_power_on(struct ov5640_dev *sensor)
goto xclk_off;
}
+ sensor->af_initialized = 0;
+
ov5640_reset(sensor);
ov5640_power(sensor, true);
@@ -2416,6 +2547,35 @@ static int ov5640_set_framefmt(struct ov5640_dev *sensor,
is_jpeg ? (BIT(5) | BIT(3)) : 0);
}
+static int ov5640_fw_command(struct ov5640_dev *sensor, int command)
+{
+ u8 fw_ack;
+ int i;
+ int ret;
+
+ ret = ov5640_write_reg(sensor, OV5640_REG_FW_CMD_ACK, 0x01);
+ if (ret)
+ return ret;
+
+ ret = ov5640_write_reg(sensor, OV5640_REG_FW_CMD_MAIN, command);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < 100; i++) {
+ ret = ov5640_read_reg(sensor, OV5640_REG_FW_CMD_ACK, &fw_ack);
+ if (ret)
+ return ret;
+
+ if (fw_ack == 0)
+ return ret;
+
+ msleep(50);
+ }
+ return -ETIMEDOUT;
+}
+
+
/*
* Sensor Controls.
*/
@@ -2532,6 +2692,41 @@ static int ov5640_set_ctrl_exposure(struct ov5640_dev *sensor,
return ret;
}
+static int ov5640_set_ctrl_focus(struct ov5640_dev *sensor, int command)
+{
+ struct i2c_client *client = sensor->i2c_client;
+ int ret;
+
+ ret = ov5640_af_init(sensor);
+ if (ret) {
+ dev_err(&client->dev, "%s: no autofocus firmware loaded\n",
+ __func__);
+ return 0;
+ }
+
+ if (command == OV5640_FW_CMD_RELEASE_FOCUS) {
+ dev_dbg(&client->dev, "%s: Releasing autofocus\n",
+ __func__);
+ return ov5640_fw_command(sensor, OV5640_FW_CMD_RELEASE_FOCUS);
+ }
+
+ // Restart zone config
+ ret = ov5640_fw_command(sensor, OV5640_FW_CMD_ZONE_CONFIG);
+ if (ret)
+ return ret;
+
+ // Set default focus zones
+ ret = ov5640_fw_command(sensor, OV5640_FW_CMD_DEFAULT_ZONES);
+ if (ret)
+ return ret;
+
+ dev_dbg(&client->dev, "%s: Triggering autofocus\n",
+ __func__);
+
+ // Start focusing
+ return ov5640_fw_command(sensor, command);
+}
+
static int ov5640_set_ctrl_gain(struct ov5640_dev *sensor, bool auto_gain)
{
struct ov5640_ctrls *ctrls = &sensor->ctrls;
@@ -2638,6 +2833,32 @@ static int ov5640_set_ctrl_vflip(struct ov5640_dev *sensor, int value)
(BIT(2) | BIT(1)) : 0);
}
+static int ov5640_get_af_status(struct ov5640_dev *sensor)
+{
+ u8 fw_status;
+ int ret;
+
+ ret = ov5640_read_reg(sensor, OV5640_REG_FW_STATUS, &fw_status);
+ if (ret)
+ return ret;
+
+ switch (fw_status) {
+ case OV5640_FW_STATUS_S_FIRMWARE:
+ case OV5640_FW_STATUS_S_STARTUP:
+ return V4L2_AUTO_FOCUS_STATUS_FAILED;
+ case OV5640_FW_STATUS_S_IDLE:
+ return V4L2_AUTO_FOCUS_STATUS_IDLE;
+ case OV5640_FW_STATUS_S_FOCUSED:
+ return V4L2_AUTO_FOCUS_STATUS_REACHED;
+ default:
+ return V4L2_AUTO_FOCUS_STATUS_BUSY;
+ }
+}
+
static int ov5640_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
{
struct v4l2_subdev *sd = ctrl_to_sd(ctrl);
@@ -2659,6 +2880,12 @@ static int ov5640_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
return val;
sensor->ctrls.exposure->val = val;
break;
+ case V4L2_CID_FOCUS_AUTO:
+ val = ov5640_get_af_status(sensor);
+ if (val < 0)
+ return val;
+ sensor->ctrls.af_status->val = val;
+ break;
}
return 0;
@@ -2690,6 +2917,18 @@ static int ov5640_s_ctrl(struct v4l2_ctrl *ctrl)
case V4L2_CID_AUTO_WHITE_BALANCE:
ret = ov5640_set_ctrl_white_balance(sensor, ctrl->val);
break;
+ case V4L2_CID_FOCUS_AUTO:
+ if (ctrl->val)
+ ret = ov5640_set_ctrl_focus(sensor, OV5640_FW_CMD_CONTINUOUS_FOCUS);
+ else
+ ret = ov5640_set_ctrl_focus(sensor, OV5640_FW_CMD_RELEASE_FOCUS);
+ break;
+ case V4L2_CID_AUTO_FOCUS_START:
+ ret = ov5640_set_ctrl_focus(sensor, OV5640_FW_CMD_TRIGGER_FOCUS);
+ break;
+ case V4L2_CID_AUTO_FOCUS_STOP:
+ ret = ov5640_set_ctrl_focus(sensor, OV5640_FW_CMD_RELEASE_FOCUS);
+ break;
case V4L2_CID_HUE:
ret = ov5640_set_ctrl_hue(sensor, ctrl->val);
break;
@@ -2762,6 +3001,20 @@ static int ov5640_init_controls(struct ov5640_dev *sensor)
ctrls->gain = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_GAIN,
0, 1023, 1, 0);
+ /* Autofocus */
+ ctrls->focus_auto = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_FOCUS_AUTO,
+ 0, 1, 1, 0);
+ ctrls->af_start = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_AUTO_FOCUS_START,
+ 0, 1, 1, 0);
+ ctrls->af_stop = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_AUTO_FOCUS_STOP,
+ 0, 1, 1, 0);
+ ctrls->af_status = v4l2_ctrl_new_std(hdl, ops,
+ V4L2_CID_AUTO_FOCUS_STATUS, 0,
+ (V4L2_AUTO_FOCUS_STATUS_BUSY |
+ V4L2_AUTO_FOCUS_STATUS_REACHED |
+ V4L2_AUTO_FOCUS_STATUS_FAILED),
+ 0, V4L2_AUTO_FOCUS_STATUS_IDLE);
+
ctrls->saturation = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_SATURATION,
0, 255, 1, 64);
ctrls->hue = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_HUE,
@@ -2795,6 +3048,7 @@ static int ov5640_init_controls(struct ov5640_dev *sensor)
v4l2_ctrl_auto_cluster(3, &ctrls->auto_wb, 0, false);
v4l2_ctrl_auto_cluster(2, &ctrls->auto_gain, 0, true);
v4l2_ctrl_auto_cluster(2, &ctrls->auto_exp, 1, true);
+ v4l2_ctrl_cluster(4, &ctrls->focus_auto);
sensor->sd.ctrl_handler = hdl;
return 0;
--
2.25.4
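
Once the ov5640_af.bin firmware is in place, the new controls can be exercised from user space, for example (a sketch; the subdev node and exact control names depend on the board and media pipeline):

    v4l2-ctl -d /dev/v4l-subdev3 --set-ctrl focus_auto=1
    v4l2-ctl -d /dev/v4l-subdev3 --set-ctrl auto_focus_start=1
    v4l2-ctl -d /dev/v4l-subdev3 --get-ctrl auto_focus_status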

View File

@ -0,0 +1,12 @@
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi
index 1c555456b..05fab5d79 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi
@@ -78,6 +78,7 @@ green {
};
led-2 {
+ linux,default-trigger = "panic";
function = LED_FUNCTION_INDICATOR;
color = <LED_COLOR_ID_RED>;
gpios = <&pio 3 19 GPIO_ACTIVE_HIGH>; /* PD19 */

View File

@ -0,0 +1,21 @@
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
index a72c2ec8c..b3a7bef13 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
@@ -227,7 +227,15 @@ &i2c0_pins {
&i2c1 {
status = "okay";
- /* TODO: add Bochs BMA223 accelerometer here */
+ bma223@18 {
+ compatible = "bosch,bma223", "bosch,bma222e";
+ reg = <0x18>;
+ interrupt-parent = <&pio>;
+ interrupts = <7 5 IRQ_TYPE_LEVEL_HIGH>; /* PH5 */
+ mount-matrix = "0", "-1", "0",
+ "-1", "0", "0",
+ "0", "0", "-1";
+ };
};
&lradc {

View File

@ -0,0 +1,44 @@
From 330d05da1b6e8118c9c4655f0b234cf32a2f1ce4 Mon Sep 17 00:00:00 2001
From: Icenowy Zheng <icenowy@aosc.io>
Date: Sun, 14 Apr 2019 23:46:47 +0800
Subject: [PATCH] arm64: allwinner: a64: pinetab: enable RTL8723CS bluetooth
PineTab has a RTL8723CS Wi-Fi/BT combo chip on board, the bluetooth part
of it communicates with A64 via UART, and the power of it is controlled
with some GPIO at PL bank.
Enable the bluetooth in the device tree.
Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
---
.../boot/dts/allwinner/sun50i-a64-pinetab.dts | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
index 84d6e8cb2b88..e8b823875740 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
@@ -447,6 +447,20 @@ &uart0 {
status = "okay";
};
+&uart1 {
+ pinctrl-names = "default";
+ pinctrl-0 = <&uart1_pins>, <&uart1_rts_cts_pins>;
+ status = "okay";
+
+ bluetooth {
+ compatible = "realtek,rtl8723cs-bt";
+ reset-gpios = <&r_pio 0 4 GPIO_ACTIVE_LOW>; /* PL4 */
+ device-wake-gpios = <&r_pio 0 5 GPIO_ACTIVE_LOW>; /* PL5 */
+ host-wake-gpios = <&r_pio 0 6 GPIO_ACTIVE_HIGH>; /* PL6 */
+ firmware-postfix = "pinebook";
+ };
+};
+
&usb_otg {
dr_mode = "otg";
status = "okay";
--
GitLab

View File

@ -0,0 +1,82 @@
# Copyright 1999-2021 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
EAPI="6"
K_NOUSENAME="yes"
K_NOSETEXTRAVERSION="yes"
K_SECURITY_UNSUPPORTED="1"
ETYPE="sources"
inherit kernel-2
detect_version
KEYWORDS="~alpha ~amd64 ~arm ~arm64 ~hppa ~ia64 ~mips ~ppc ~ppc64 ~s390 ~sparc ~x86"
DEPEND="${RDEPEND}
>=sys-devel/patch-2.7.5"
DESCRIPTION="Full sources for the Linux kernel, with megi's patch for pinephone"
MEGI_PATCH_URI="https://xff.cz/kernels/${PV:0:4}/patches/all.patch"
SRC_URI="${KERNEL_URI} ${MEGI_PATCH_URI} -> all-${PV}.patch"
PATCHES=(
${DISTDIR}/all-${PV}.patch
${FILESDIR}/enable-hdmi-output-pinetab.patch
${FILESDIR}/enable-jack-detection-pinetab.patch
${FILESDIR}/pinetab-bluetooth.patch
${FILESDIR}/pinetab-accelerometer.patch
${FILESDIR}/dts-pinephone-drop-modem-power-node.patch
#${FILESDIR}/dts-headphone-jack-detection.patch
${FILESDIR}/media-ov5640-Implement-autofocus.patch
${FILESDIR}/0011-dts-pinetab-hardcode-mmc-numbers.patch
#${FILESDIR}/0012-pinephone-fix-pogopin-i2c.patch
${FILESDIR}/0107-quirk-kernel-org-bug-210681-firmware_rome_error.patch
${FILESDIR}/0177-leds-gpio-make-max_brightness-configurable.patch
#${FILESDIR}/0178-sun8i-codec-fix-headphone-jack-pin-name.patch
#${FILESDIR}/0179-arm64-dts-allwinner-pinephone-improve-device-tree-5.12.patch
${FILESDIR}/panic-led-5.12.patch
#${FILESDIR}/improve-jack-button-handling-and-mic.patch
${FILESDIR}/PATCH-v3-01-14-include-linux-memcontrol.h-do-not-warn-in-page_memcg_rcu-if-CONFIG_MEMCG.patch
${FILESDIR}/PATCH-v3-02-14-include-linux-nodemask.h-define-next_memory_node-if-CONFIG_NUMA.patch
${FILESDIR}/PATCH-v3-03-14-include-linux-cgroup.h-export-cgroup_mutex.patch
${FILESDIR}/PATCH-v3-04-14-mm-x86-support-the-access-bit-on-non-leaf-PMD-entries.patch
${FILESDIR}/PATCH-v3-05-14-mm-vmscan.c-refactor-shrink_node.patch
${FILESDIR}/PATCH-v3-06-14-mm-workingset.c-refactor-pack_shadow-and-unpack_shadow.patch
${FILESDIR}/PATCH-v3-07-14-mm-multigenerational-lru-groundwork.patch
${FILESDIR}/PATCH-v3-08-14-mm-multigenerational-lru-activation.patch
${FILESDIR}/PATCH-v3-09-14-mm-multigenerational-lru-mm_struct-list.patch
${FILESDIR}/PATCH-v3-10-14-mm-multigenerational-lru-aging.patch
${FILESDIR}/PATCH-v3-11-14-mm-multigenerational-lru-eviction.patch
${FILESDIR}/PATCH-v3-12-14-mm-multigenerational-lru-user-interface.patch
${FILESDIR}/PATCH-v3-13-14-mm-multigenerational-lru-Kconfig.patch
${FILESDIR}/PATCH-v3-14-14-mm-multigenerational-lru-documentation.patch
)
src_prepare() {
default
eapply_user
}
pkg_postinst() {
kernel-2_pkg_postinst
einfo "For more info on this patchset, and how to report problems, see:"
einfo "${HOMEPAGE}"
einfo "To build the kernel use the following command:"
einfo "make Image Image.gz modules"
einfo "make DTC_FLAGS="-@" dtbs"
einfo "make install; make modules_intall; make dtbs_install"
einfo "If you use kernel config coming with this ebuild, don't forget to also copy dracut-pp.conf to /etc/dracut.conf.d/"
einfo "to make sure proper kernel modules are loaded into initramfs"
einfo "if you want to cross compile pinephone kernel on amd64 host, follow the https://wiki.gentoo.org/wiki/Cross_build_environment"
einfo "to setup cross toolchain environment, then create a xmake wrapper like the following, and replace make with xmake in above commands"
einfo "#!/bin/sh"
einfo "exec make ARCH='arm64' CROSS_COMPILE='aarch64-unknown-linux-gnu-' INSTALL_MOD_PATH='${SYSROOT}' '$@'"
}
pkg_postrm() {
kernel-2_pkg_postrm
}