From 22db82dab7b7e017ca61b1582d82a7cfebac7729 Mon Sep 17 00:00:00 2001 From: Gerben Jan Dijkman Date: Mon, 17 Oct 2022 16:04:49 +0200 Subject: [PATCH] Combined both kernels and updated to 5.19.12 --- sys-kernel/pinephone-pro-sources/Manifest | 1 - ...-dts-pinephone-drop-modem-power-node.patch | 175 - ...-dts-pinephone-pro-remove-modem-node.patch | 86 - .../files/1500_XATTR_USER_PREFIX.patch | 67 - ...ink-security-restrictions-by-default.patch | 17 - ..._sparc-address-warray-bound-warnings.patch | 17 - ...nly-if-Secure-Simple-Pairing-enabled.patch | 37 - ...-build-issue-by-selecting-CONFIG_REG.patch | 30 - .../2920_sign-file-patch-for-libressl.patch | 16 - .../3000_Support-printing-firmware-info.patch | 14 - .../files/4567_distro-Gentoo-Kconfig.patch | 341 - ...0_enable-cpu-optimizations-universal.patch | 675 -- ...20_BMQ-and-PDS-io-scheduler-v5.19-r0.patch | 9956 ----------------- .../5021_BMQ-and-PDS-gentoo-defaults.patch | 13 - .../pinephone-pro-sources-5.19.3.ebuild | 70 - sys-kernel/pinephone-sources/Manifest | 2 +- ...k3399-pinephone-pro-add-modem-RI-pin.patch | 0 ...104-PPP-Add-reset-resume-to-usb_wwan.patch | 21 + ...dd-USB_QUIRK_RESET-for-Quectel-EG25G.patch | 25 + ...rk818_charger-use-type-battery-again.patch | 11 + ...hip-i2s-Dont-disable-mclk-on-suspend.patch | 29 + ...usued-softback_lines-cursor-argument.patch | 150 + ...-fbcon-remove-no-op-fbcon_set_origin.patch | 31 + ...rt-fbcon-remove-soft-scrollback-code.patch | 500 + .../pinephone-sources/files/5.19.10-11.patch | 1231 ++ .../pinephone-sources/files/5.19.11-12.patch | 9776 ++++++++++++++++ .../pinephone-sources/files/5.19.8-9.patch | 8234 ++++++++++++++ .../pinephone-sources/files/5.19.9-10.patch | 1723 +++ .../files/Multi-Gen-LRU-Framework.patch | 8901 --------------- .../files/config-ppp | 0 .../files/config-ppp-old | 0 .../files/dracut-ppp.conf | 0 .../pinephone-sources/files/pp-keyboard.patch | 176 - ...build => pinephone-sources-5.19.12.ebuild} | 25 +- 34 files changed, 21745 insertions(+), 20605 deletions(-) delete mode 100644 sys-kernel/pinephone-pro-sources/Manifest delete mode 100644 sys-kernel/pinephone-pro-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/1500_XATTR_USER_PREFIX.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/1700_sparc-address-warray-bound-warnings.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/2920_sign-file-patch-for-libressl.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/3000_Support-printing-firmware-info.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/4567_distro-Gentoo-Kconfig.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/5010_enable-cpu-optimizations-universal.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch delete mode 100644 sys-kernel/pinephone-pro-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch delete mode 100644 sys-kernel/pinephone-pro-sources/pinephone-pro-sources-5.19.3.ebuild rename 
sys-kernel/{pinephone-pro-sources => pinephone-sources}/files/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch (100%) create mode 100644 sys-kernel/pinephone-sources/files/0104-PPP-Add-reset-resume-to-usb_wwan.patch create mode 100644 sys-kernel/pinephone-sources/files/0104-Revert-usb-quirks-Add-USB_QUIRK_RESET-for-Quectel-EG25G.patch create mode 100644 sys-kernel/pinephone-sources/files/0104-rk818_charger-use-type-battery-again.patch create mode 100644 sys-kernel/pinephone-sources/files/0106-sound-rockchip-i2s-Dont-disable-mclk-on-suspend.patch create mode 100644 sys-kernel/pinephone-sources/files/0201-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch create mode 100644 sys-kernel/pinephone-sources/files/0202-revert-fbcon-remove-no-op-fbcon_set_origin.patch create mode 100644 sys-kernel/pinephone-sources/files/0203-revert-fbcon-remove-soft-scrollback-code.patch create mode 100644 sys-kernel/pinephone-sources/files/5.19.10-11.patch create mode 100644 sys-kernel/pinephone-sources/files/5.19.11-12.patch create mode 100644 sys-kernel/pinephone-sources/files/5.19.8-9.patch create mode 100644 sys-kernel/pinephone-sources/files/5.19.9-10.patch delete mode 100644 sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch rename sys-kernel/{pinephone-pro-sources => pinephone-sources}/files/config-ppp (100%) rename sys-kernel/{pinephone-pro-sources => pinephone-sources}/files/config-ppp-old (100%) rename sys-kernel/{pinephone-pro-sources => pinephone-sources}/files/dracut-ppp.conf (100%) delete mode 100644 sys-kernel/pinephone-sources/files/pp-keyboard.patch rename sys-kernel/pinephone-sources/{pinephone-sources-5.19.0.ebuild => pinephone-sources-5.19.12.ebuild} (71%) diff --git a/sys-kernel/pinephone-pro-sources/Manifest b/sys-kernel/pinephone-pro-sources/Manifest deleted file mode 100644 index 285950d..0000000 --- a/sys-kernel/pinephone-pro-sources/Manifest +++ /dev/null @@ -1 +0,0 @@ -DIST orange-pi-5.19-20220822-1337.tar.gz 215018577 BLAKE2B b598aee2fb3aece41e83a9916a62b450ab351ed7cd65c6006ed20f04656d260f619b4786d3dd0efcd19b7b6cbd1cec14dd2233e791d9b9e77368160dcf989c60 SHA512 93a0d29647c732716adce044af19b2ae303e6469ead0a90b364972237cc7a24ca9715e9a1d491c2f08126fe79c72072e58294453758ae80d9bf4fb5220485f1f diff --git a/sys-kernel/pinephone-pro-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch b/sys-kernel/pinephone-pro-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch deleted file mode 100644 index b90eced..0000000 --- a/sys-kernel/pinephone-pro-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch +++ /dev/null @@ -1,175 +0,0 @@ -From 602d05e416ae0d0fba3022fa2c3d195164b406c6 Mon Sep 17 00:00:00 2001 -From: Clayton Craft -Date: Wed, 16 Dec 2020 20:16:14 -0800 -Subject: [PATCH] dts: pinephone: drop modem-power node - ---- - .../allwinner/sun50i-a64-pinephone-1.0.dts | 26 +++--------------- - .../allwinner/sun50i-a64-pinephone-1.1.dts | 27 +++---------------- - .../allwinner/sun50i-a64-pinephone-1.2.dts | 27 +++---------------- - .../dts/allwinner/sun50i-a64-pinephone.dtsi | 12 +++++++++ - 4 files changed, 24 insertions(+), 68 deletions(-) - -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts -index a21c6d78a..7f0cfdafe 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts -@@ -86,28 +86,6 @@ ®_drivevbus { - status = "okay"; - }; - --&uart3 { -- modem { -- 
compatible = "quectel,eg25"; -- char-device-name = "modem-power"; -- -- power-supply = <®_vbat_bb>; /* PL7 */ -- -- enable-gpios = <&pio 7 8 GPIO_ACTIVE_LOW>; /* PH8 */ -- reset-gpios = <&pio 2 4 GPIO_ACTIVE_HIGH>; /* PC4 */ -- pwrkey-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */ -- -- sleep-gpios = <&pio 7 7 GPIO_ACTIVE_HIGH>; /* PH7 */ -- wakeup-gpios = <&pio 1 2 GPIO_ACTIVE_HIGH>; /* PB2-RI */ -- -- cts-gpios = <&pio 3 5 GPIO_ACTIVE_HIGH>; /* PD5-CTS */ -- dtr-gpios = <&r_pio 0 6 GPIO_ACTIVE_HIGH>; /* PL6-DTR */ -- rts-gpios = <&pio 3 4 GPIO_ACTIVE_HIGH>; /* PD4-RTS */ -- -- quectel,qdai = "1,1,0,1,0,0,1,1"; -- }; --}; -- - &usbphy { - usb-role-switch; - -@@ -118,6 +96,10 @@ usb0_drd_sw: endpoint { - }; - }; - -+&ring_indicator { -+ gpios = <&pio 1 2 GPIO_ACTIVE_LOW>; /* PB2 */ -+}; -+ - &sgm3140 { - enable-gpios = <&pio 2 3 GPIO_ACTIVE_HIGH>; /* PC3 */ - flash-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* PD24 */ -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts -index 61ff56b17..5e85ddc12 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts -@@ -109,34 +109,15 @@ ®_drivevbus { - status = "okay"; - }; - -+&ring_indicator { -+ gpios = <&pio 1 2 GPIO_ACTIVE_LOW>; /* PB2 */ -+}; -+ - &sgm3140 { - enable-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* PD24 */ - flash-gpios = <&pio 2 3 GPIO_ACTIVE_HIGH>; /* PC3 */ - }; - --&uart3 { -- modem { -- compatible = "quectel,eg25"; -- char-device-name = "modem-power"; -- -- power-supply = <®_vbat_bb>; /* PL7 */ -- -- enable-gpios = <&pio 7 8 GPIO_ACTIVE_LOW>; /* PH8 */ -- reset-gpios = <&pio 2 4 GPIO_ACTIVE_HIGH>; /* PC4 */ -- pwrkey-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */ -- //status-pwrkey-multiplexed; /* status acts as pwrkey */ -- -- sleep-gpios = <&pio 7 7 GPIO_ACTIVE_HIGH>; /* PH7 */ -- wakeup-gpios = <&pio 1 2 GPIO_ACTIVE_HIGH>; /* PB2-RI */ -- -- dtr-gpios = <&r_pio 0 6 GPIO_ACTIVE_HIGH>; /* PL6-DTR */ -- cts-gpios = <&pio 3 5 GPIO_ACTIVE_HIGH>; /* PD5-CTS */ -- rts-gpios = <&pio 3 4 GPIO_ACTIVE_HIGH>; /* PD4-RTS */ -- -- quectel,qdai = "1,1,0,1,0,0,1,1"; -- }; --}; -- - &usbphy { - usb-role-switch; - -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts -index fe7d567a8..f4b9b0991 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts -@@ -101,34 +101,15 @@ ®_anx1v0 { - enable-active-high; - }; - -+&ring_indicator { -+ gpios = <&r_pio 0 6 GPIO_ACTIVE_LOW>; /* PL6 */ -+}; -+ - &sgm3140 { - enable-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* PD24 */ - flash-gpios = <&pio 2 3 GPIO_ACTIVE_HIGH>; /* PC3 */ - }; - --&uart3 { -- modem { -- compatible = "quectel,eg25"; -- char-device-name = "modem-power"; -- -- power-supply = <®_vbat_bb>; /* PL7 */ -- -- enable-gpios = <&pio 7 8 GPIO_ACTIVE_LOW>; /* PH8 */ -- reset-gpios = <&pio 2 4 GPIO_ACTIVE_HIGH>; /* PC4 */ -- status-gpios = <&pio 7 9 GPIO_ACTIVE_HIGH>; /* PH9 */ -- pwrkey-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */ -- -- host-ready-gpios = <&pio 7 7 GPIO_ACTIVE_HIGH>; /* PH7 */ -- wakeup-gpios = <&r_pio 0 6 GPIO_ACTIVE_HIGH>; /* PL6-RI */ -- -- dtr-gpios = <&pio 1 2 GPIO_ACTIVE_HIGH>; /* PB2-DTR */ -- cts-gpios = <&pio 3 5 GPIO_ACTIVE_HIGH>; /* PD5-CTS */ -- rts-gpios = <&pio 3 4 GPIO_ACTIVE_HIGH>; /* PD4-RTS */ -- -- quectel,qdai = "1,1,0,1,0,0,1,1"; -- }; --}; -- - 
&usbphy { - usb-role-switch; - -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -index 346113382..7b48126d1 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -@@ -192,6 +192,17 @@ ec25_codec: ec25-codec { - sound-name-prefix = "Modem"; - }; - -+ gpio-keys { -+ compatible = "gpio-keys"; -+ -+ ring_indicator: ring-indicator { -+ label = "Ring Indicator"; -+ linux,can-disable; -+ linux,code = ; -+ wakeup-source; -+ }; -+ }; -+ - i2c_csi: i2c-csi { - compatible = "i2c-gpio"; - sda-gpios = <&pio 4 13 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; /* PE13 */ -@@ -264,6 +275,7 @@ reg_usb_5v: usb-5v { - reg_vbat_bb: vbat-bb { - compatible = "regulator-fixed"; - regulator-name = "vbat-bb"; -+ regulator-always-on; - regulator-min-microvolt = <3500000>; - regulator-max-microvolt = <3500000>; - gpio = <&r_pio 0 7 GPIO_ACTIVE_HIGH>; /* PL7 */ --- -2.31.1 - diff --git a/sys-kernel/pinephone-pro-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch b/sys-kernel/pinephone-pro-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch deleted file mode 100644 index 24be3b4..0000000 --- a/sys-kernel/pinephone-pro-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch +++ /dev/null @@ -1,86 +0,0 @@ -From 60d8aedea7c8c390ee744730ab3e565ea84496fb Mon Sep 17 00:00:00 2001 -From: Danct12 -Date: Fri, 10 Dec 2021 23:01:34 +0700 -Subject: [PATCH] arm64: dts: rk3399-pinephone-pro: Remove modem node - -Since we don't use modem-power driver, this can be removed -for eg25-manager. ---- - .../dts/rockchip/rk3399-pinephone-pro.dts | 40 +------------------ - 1 file changed, 2 insertions(+), 38 deletions(-) - -diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts -index 61c990764..13141c643 100644 ---- a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts -+++ b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts -@@ -326,6 +326,7 @@ vcc_4g_5v: vcc-4g-5v { - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - vin-supply = <&vcc5v0_sys>; -+ regulator-always-on; - }; - - vcc_4g: vcc-4g { -@@ -338,6 +339,7 @@ vcc_4g: vcc-4g { - regulator-min-microvolt = <3800000>; - regulator-max-microvolt = <3800000>; - vin-supply = <&vcc_sysin>; -+ regulator-always-on; - }; - - vcc1v8_codec: vcc1v8-codec-regulator { -@@ -1058,31 +1060,6 @@ mipi_in_panel: endpoint { - - &uart3 { - status = "okay"; -- -- modem { -- compatible = "quectel,eg25"; -- char-device-name = "modem-power"; -- -- pinctrl-names = "default"; -- pinctrl-0 = <&modem_control_pins>; -- -- power-supply = <&vcc_4g>; -- vbus-supply = <&vcc_4g_5v>; -- -- enable-gpios = <&gpio0 RK_PB0 GPIO_ACTIVE_HIGH>; // W_DISABLE# -- reset-gpios = <&gpio3 RK_PB0 GPIO_ACTIVE_HIGH>; -- status-gpios = <&gpio3 RK_PA6 GPIO_ACTIVE_HIGH>; -- pwrkey-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; -- -- host-ready-gpios = <&gpio0 RK_PB4 GPIO_ACTIVE_HIGH>; // apready -- wakeup-gpios = <&gpio0 RK_PA1 GPIO_ACTIVE_HIGH>; // ri -- -- dtr-gpios = <&gpio0 RK_PA3 GPIO_ACTIVE_HIGH>; -- cts-gpios = <&gpio3 RK_PC0 GPIO_ACTIVE_HIGH>; -- rts-gpios = <&gpio3 RK_PC1 GPIO_ACTIVE_HIGH>; -- -- quectel,qdai = "3,0,0,4,0,0,1,1"; -- }; - }; - - &pmu_io_domains { -@@ -1153,19 +1130,6 @@ vcc_4g_5v_en: vcc-4g-5v-en-pin { - vcc_4g_en: vcc-4g-en-pin { - rockchip,pins = <4 RK_PC7 RK_FUNC_GPIO &pcfg_pull_none>; - }; -- -- modem_control_pins: 
modem-control-pins { -- rockchip,pins = -- <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_none>, -- <3 RK_PB0 RK_FUNC_GPIO &pcfg_pull_none>, -- <3 RK_PA6 RK_FUNC_GPIO &pcfg_pull_none>, -- <0 RK_PB5 RK_FUNC_GPIO &pcfg_pull_none>, -- <0 RK_PB4 RK_FUNC_GPIO &pcfg_pull_none>, -- <0 RK_PA1 RK_FUNC_GPIO &pcfg_pull_none>, -- <0 RK_PA3 RK_FUNC_GPIO &pcfg_pull_none>, -- <3 RK_PC0 RK_FUNC_GPIO &pcfg_pull_none>, -- <3 RK_PC1 RK_FUNC_GPIO &pcfg_pull_none>; -- }; - }; - - pmic { --- -2.34.1 - diff --git a/sys-kernel/pinephone-pro-sources/files/1500_XATTR_USER_PREFIX.patch b/sys-kernel/pinephone-pro-sources/files/1500_XATTR_USER_PREFIX.patch deleted file mode 100644 index 245dcc2..0000000 --- a/sys-kernel/pinephone-pro-sources/files/1500_XATTR_USER_PREFIX.patch +++ /dev/null @@ -1,67 +0,0 @@ -From: Anthony G. Basile - -This patch adds support for a restricted user-controlled namespace on -tmpfs filesystem used to house PaX flags. The namespace must be of the -form user.pax.* and its value cannot exceed a size of 8 bytes. - -This is needed even on all Gentoo systems so that XATTR_PAX flags -are preserved for users who might build packages using portage on -a tmpfs system with a non-hardened kernel and then switch to a -hardened kernel with XATTR_PAX enabled. - -The namespace is added to any user with Extended Attribute support -enabled for tmpfs. Users who do not enable xattrs will not have -the XATTR_PAX flags preserved. - -diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h -index 1590c49..5eab462 100644 ---- a/include/uapi/linux/xattr.h -+++ b/include/uapi/linux/xattr.h -@@ -73,5 +73,9 @@ - #define XATTR_POSIX_ACL_DEFAULT "posix_acl_default" - #define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT - -+/* User namespace */ -+#define XATTR_PAX_PREFIX XATTR_USER_PREFIX "pax." 
-+#define XATTR_PAX_FLAGS_SUFFIX "flags" -+#define XATTR_NAME_PAX_FLAGS XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX - - #endif /* _UAPI_LINUX_XATTR_H */ ---- a/mm/shmem.c 2020-05-04 15:30:27.042035334 -0400 -+++ b/mm/shmem.c 2020-05-04 15:34:57.013881725 -0400 -@@ -3238,6 +3238,14 @@ static int shmem_xattr_handler_set(const - struct shmem_inode_info *info = SHMEM_I(inode); - - name = xattr_full_name(handler, name); -+ -+ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { -+ if (strcmp(name, XATTR_NAME_PAX_FLAGS)) -+ return -EOPNOTSUPP; -+ if (size > 8) -+ return -EINVAL; -+ } -+ - return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); - } - -@@ -3253,6 +3261,12 @@ static const struct xattr_handler shmem_ - .set = shmem_xattr_handler_set, - }; - -+static const struct xattr_handler shmem_user_xattr_handler = { -+ .prefix = XATTR_USER_PREFIX, -+ .get = shmem_xattr_handler_get, -+ .set = shmem_xattr_handler_set, -+}; -+ - static const struct xattr_handler *shmem_xattr_handlers[] = { - #ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, -@@ -3260,6 +3274,7 @@ static const struct xattr_handler *shmem - #endif - &shmem_security_xattr_handler, - &shmem_trusted_xattr_handler, -+ &shmem_user_xattr_handler, - NULL - }; - diff --git a/sys-kernel/pinephone-pro-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch b/sys-kernel/pinephone-pro-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch deleted file mode 100644 index e8c3015..0000000 --- a/sys-kernel/pinephone-pro-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- a/fs/namei.c 2022-01-23 13:02:27.876558299 -0500 -+++ b/fs/namei.c 2022-03-06 12:47:39.375719693 -0500 -@@ -1020,10 +1020,10 @@ static inline void put_link(struct namei - path_put(&last->link); - } - --static int sysctl_protected_symlinks __read_mostly; --static int sysctl_protected_hardlinks __read_mostly; --static int sysctl_protected_fifos __read_mostly; --static int sysctl_protected_regular __read_mostly; -+static int sysctl_protected_symlinks __read_mostly = 1; -+static int sysctl_protected_hardlinks __read_mostly = 1; -+int sysctl_protected_fifos __read_mostly = 1; -+int sysctl_protected_regular __read_mostly = 1; - - #ifdef CONFIG_SYSCTL - static struct ctl_table namei_sysctls[] = { diff --git a/sys-kernel/pinephone-pro-sources/files/1700_sparc-address-warray-bound-warnings.patch b/sys-kernel/pinephone-pro-sources/files/1700_sparc-address-warray-bound-warnings.patch deleted file mode 100644 index f939355..0000000 --- a/sys-kernel/pinephone-pro-sources/files/1700_sparc-address-warray-bound-warnings.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- a/arch/sparc/mm/init_64.c 2022-05-24 16:48:40.749677491 -0400 -+++ b/arch/sparc/mm/init_64.c 2022-05-24 16:55:15.511356945 -0400 -@@ -3052,11 +3052,11 @@ static inline resource_size_t compute_ke - static void __init kernel_lds_init(void) - { - code_resource.start = compute_kern_paddr(_text); -- code_resource.end = compute_kern_paddr(_etext - 1); -+ code_resource.end = compute_kern_paddr(_etext) - 1; - data_resource.start = compute_kern_paddr(_etext); -- data_resource.end = compute_kern_paddr(_edata - 1); -+ data_resource.end = compute_kern_paddr(_edata) - 1; - bss_resource.start = compute_kern_paddr(__bss_start); -- bss_resource.end = compute_kern_paddr(_end - 1); -+ bss_resource.end = compute_kern_paddr(_end) - 1; - } - - static int __init report_memory(void) diff --git 
a/sys-kernel/pinephone-pro-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch b/sys-kernel/pinephone-pro-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch deleted file mode 100644 index 394ad48..0000000 --- a/sys-kernel/pinephone-pro-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch +++ /dev/null @@ -1,37 +0,0 @@ -The encryption is only mandatory to be enforced when both sides are using -Secure Simple Pairing and this means the key size check makes only sense -in that case. - -On legacy Bluetooth 2.0 and earlier devices like mice the encryption was -optional and thus causing an issue if the key size check is not bound to -using Secure Simple Pairing. - -Fixes: d5bb334a8e17 ("Bluetooth: Align minimum encryption key size for LE and BR/EDR connections") -Signed-off-by: Marcel Holtmann -Cc: stable@vger.kernel.org ---- - net/bluetooth/hci_conn.c | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - -diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c -index 3cf0764d5793..7516cdde3373 100644 ---- a/net/bluetooth/hci_conn.c -+++ b/net/bluetooth/hci_conn.c -@@ -1272,8 +1272,13 @@ int hci_conn_check_link_mode(struct hci_conn *conn) - return 0; - } - -- if (hci_conn_ssp_enabled(conn) && -- !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) -+ /* If Secure Simple Pairing is not enabled, then legacy connection -+ * setup is used and no encryption or key sizes can be enforced. -+ */ -+ if (!hci_conn_ssp_enabled(conn)) -+ return 1; -+ -+ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) - return 0; - - /* The minimum encryption key size needs to be enforced by the --- -2.20.1 diff --git a/sys-kernel/pinephone-pro-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch b/sys-kernel/pinephone-pro-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch deleted file mode 100644 index 4335685..0000000 --- a/sys-kernel/pinephone-pro-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch +++ /dev/null @@ -1,30 +0,0 @@ -From dc328d75a6f37f4ff11a81ae16b1ec88c3197640 Mon Sep 17 00:00:00 2001 -From: Mike Pagano -Date: Mon, 23 Mar 2020 08:20:06 -0400 -Subject: [PATCH 1/1] This driver requires REGMAP_I2C to build. Select it by - default in Kconfig. Reported at gentoo bugzilla: - https://bugs.gentoo.org/710790 -Cc: mpagano@gentoo.org - -Reported-by: Phil Stracchino - -Signed-off-by: Mike Pagano ---- - drivers/hwmon/Kconfig | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig -index 47ac20aee06f..530b4f29ba85 100644 ---- a/drivers/hwmon/Kconfig -+++ b/drivers/hwmon/Kconfig -@@ -1769,6 +1769,7 @@ config SENSORS_TMP421 - config SENSORS_TMP513 - tristate "Texas Instruments TMP513 and compatibles" - depends on I2C -+ select REGMAP_I2C - help - If you say yes here you get support for Texas Instruments TMP512, - and TMP513 temperature and power supply sensor chips. 
--- -2.24.1 - diff --git a/sys-kernel/pinephone-pro-sources/files/2920_sign-file-patch-for-libressl.patch b/sys-kernel/pinephone-pro-sources/files/2920_sign-file-patch-for-libressl.patch deleted file mode 100644 index e6ec017..0000000 --- a/sys-kernel/pinephone-pro-sources/files/2920_sign-file-patch-for-libressl.patch +++ /dev/null @@ -1,16 +0,0 @@ ---- a/scripts/sign-file.c 2020-05-20 18:47:21.282820662 -0400 -+++ b/scripts/sign-file.c 2020-05-20 18:48:37.991081899 -0400 -@@ -41,9 +41,10 @@ - * signing with anything other than SHA1 - so we're stuck with that if such is - * the case. - */ --#if defined(LIBRESSL_VERSION_NUMBER) || \ -- OPENSSL_VERSION_NUMBER < 0x10000000L || \ -- defined(OPENSSL_NO_CMS) -+#if defined(OPENSSL_NO_CMS) || \ -+ ( defined(LIBRESSL_VERSION_NUMBER) \ -+ && (LIBRESSL_VERSION_NUMBER < 0x3010000fL) ) || \ -+ OPENSSL_VERSION_NUMBER < 0x10000000L - #define USE_PKCS7 - #endif - #ifndef USE_PKCS7 diff --git a/sys-kernel/pinephone-pro-sources/files/3000_Support-printing-firmware-info.patch b/sys-kernel/pinephone-pro-sources/files/3000_Support-printing-firmware-info.patch deleted file mode 100644 index a630cfb..0000000 --- a/sys-kernel/pinephone-pro-sources/files/3000_Support-printing-firmware-info.patch +++ /dev/null @@ -1,14 +0,0 @@ ---- a/drivers/base/firmware_loader/main.c 2021-08-24 15:42:07.025482085 -0400 -+++ b/drivers/base/firmware_loader/main.c 2021-08-24 15:44:40.782975313 -0400 -@@ -809,6 +809,11 @@ _request_firmware(const struct firmware - - ret = _request_firmware_prepare(&fw, name, device, buf, size, - offset, opt_flags); -+ -+#ifdef CONFIG_GENTOO_PRINT_FIRMWARE_INFO -+ printk(KERN_NOTICE "Loading firmware: %s\n", name); -+#endif -+ - if (ret <= 0) /* error or already assigned */ - goto out; - diff --git a/sys-kernel/pinephone-pro-sources/files/4567_distro-Gentoo-Kconfig.patch b/sys-kernel/pinephone-pro-sources/files/4567_distro-Gentoo-Kconfig.patch deleted file mode 100644 index 0a38098..0000000 --- a/sys-kernel/pinephone-pro-sources/files/4567_distro-Gentoo-Kconfig.patch +++ /dev/null @@ -1,341 +0,0 @@ ---- a/Kconfig 2022-05-11 13:20:07.110347567 -0400 -+++ b/Kconfig 2022-05-11 13:21:12.127174393 -0400 -@@ -30,3 +30,5 @@ source "lib/Kconfig" - source "lib/Kconfig.debug" - - source "Documentation/Kconfig" -+ -+source "distro/Kconfig" ---- /dev/null 2022-05-10 13:47:17.750578524 -0400 -+++ b/distro/Kconfig 2022-05-11 13:21:20.540529032 -0400 -@@ -0,0 +1,290 @@ -+menu "Gentoo Linux" -+ -+config GENTOO_LINUX -+ bool "Gentoo Linux support" -+ -+ default y -+ -+ select CPU_FREQ_DEFAULT_GOV_SCHEDUTIL -+ -+ help -+ In order to boot Gentoo Linux a minimal set of config settings needs to -+ be enabled in the kernel; to avoid the users from having to enable them -+ manually as part of a Gentoo Linux installation or a new clean config, -+ we enable these config settings by default for convenience. -+ -+ See the settings that become available for more details and fine-tuning. -+ -+config GENTOO_LINUX_UDEV -+ bool "Linux dynamic and persistent device naming (userspace devfs) support" -+ -+ depends on GENTOO_LINUX -+ default y if GENTOO_LINUX -+ -+ select DEVTMPFS -+ select TMPFS -+ select UNIX -+ -+ select MMU -+ select SHMEM -+ -+ help -+ In order to boot Gentoo Linux a minimal set of config settings needs to -+ be enabled in the kernel; to avoid the users from having to enable them -+ manually as part of a Gentoo Linux installation or a new clean config, -+ we enable these config settings by default for convenience. 
-+ -+ Currently this only selects TMPFS, DEVTMPFS and their dependencies. -+ TMPFS is enabled to maintain a tmpfs file system at /dev/shm, /run and -+ /sys/fs/cgroup; DEVTMPFS to maintain a devtmpfs file system at /dev. -+ -+ Some of these are critical files that need to be available early in the -+ boot process; if not available, it causes sysfs and udev to malfunction. -+ -+ To ensure Gentoo Linux boots, it is best to leave this setting enabled; -+ if you run a custom setup, you could consider whether to disable this. -+ -+config GENTOO_LINUX_PORTAGE -+ bool "Select options required by Portage features" -+ -+ depends on GENTOO_LINUX -+ default y if GENTOO_LINUX -+ -+ select CGROUPS -+ select NAMESPACES -+ select IPC_NS -+ select NET_NS -+ select PID_NS -+ select SYSVIPC -+ select USER_NS -+ select UTS_NS -+ -+ help -+ This enables options required by various Portage FEATURES. -+ Currently this selects: -+ -+ CGROUPS (required for FEATURES=cgroup) -+ IPC_NS (required for FEATURES=ipc-sandbox) -+ NET_NS (required for FEATURES=network-sandbox) -+ PID_NS (required for FEATURES=pid-sandbox) -+ SYSVIPC (required by IPC_NS) -+ -+ -+ It is highly recommended that you leave this enabled as these FEATURES -+ are, or will soon be, enabled by default. -+ -+menu "Support for init systems, system and service managers" -+ visible if GENTOO_LINUX -+ -+config GENTOO_LINUX_INIT_SCRIPT -+ bool "OpenRC, runit and other script based systems and managers" -+ -+ default y if GENTOO_LINUX -+ -+ depends on GENTOO_LINUX -+ -+ select BINFMT_SCRIPT -+ select CGROUPS -+ select EPOLL -+ select FILE_LOCKING -+ select INOTIFY_USER -+ select SIGNALFD -+ select TIMERFD -+ -+ help -+ The init system is the first thing that loads after the kernel booted. -+ -+ These config settings allow you to select which init systems to support; -+ instead of having to select all the individual settings all over the -+ place, these settings allows you to select all the settings at once. -+ -+ This particular setting enables all the known requirements for OpenRC, -+ runit and similar script based systems and managers. -+ -+ If you are unsure about this, it is best to leave this setting enabled. -+ -+config GENTOO_LINUX_INIT_SYSTEMD -+ bool "systemd" -+ -+ default n -+ -+ depends on GENTOO_LINUX && GENTOO_LINUX_UDEV -+ -+ select AUTOFS_FS -+ select BLK_DEV_BSG -+ select BPF_SYSCALL -+ select CGROUP_BPF -+ select CGROUPS -+ select CRYPTO_HMAC -+ select CRYPTO_SHA256 -+ select CRYPTO_USER_API_HASH -+ select DEVPTS_MULTIPLE_INSTANCES -+ select DMIID if X86_32 || X86_64 || X86 -+ select EPOLL -+ select FANOTIFY -+ select FHANDLE -+ select FILE_LOCKING -+ select INOTIFY_USER -+ select IPV6 -+ select KCMP -+ select NET -+ select NET_NS -+ select PROC_FS -+ select SECCOMP if HAVE_ARCH_SECCOMP -+ select SECCOMP_FILTER if HAVE_ARCH_SECCOMP_FILTER -+ select SIGNALFD -+ select SYSFS -+ select TIMERFD -+ select TMPFS_POSIX_ACL -+ select TMPFS_XATTR -+ -+ select ANON_INODES -+ select BLOCK -+ select EVENTFD -+ select FSNOTIFY -+ select INET -+ select NLATTR -+ -+ help -+ The init system is the first thing that loads after the kernel booted. -+ -+ These config settings allow you to select which init systems to support; -+ instead of having to select all the individual settings all over the -+ place, these settings allows you to select all the settings at once. -+ -+ This particular setting enables all the known requirements for systemd; -+ it also enables suggested optional settings, as the package suggests to. 
-+ -+endmenu -+ -+menuconfig GENTOO_KERNEL_SELF_PROTECTION -+ bool "Kernel Self Protection Project" -+ depends on GENTOO_LINUX -+ help -+ Recommended Kernel settings based on the suggestions from the Kernel Self Protection Project -+ See: https://kernsec.org/wiki/index.php/Kernel_Self_Protection_Project/Recommended_Settings -+ Note, there may be additional settings for which the CONFIG_ setting is invisible in menuconfig due -+ to unmet dependencies. Search for GENTOO_KERNEL_SELF_PROTECTION_COMMON and search for -+ GENTOO_KERNEL_SELF_PROTECTION_{X86_64, ARM64, X86_32, ARM} for dependency information on your -+ specific architecture. -+ Note 2: Please see the URL above for numeric settings, e.g. CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -+ for X86_64 -+ -+if GENTOO_KERNEL_SELF_PROTECTION -+config GENTOO_KERNEL_SELF_PROTECTION_COMMON -+ bool "Enable Kernel Self Protection Project Recommendations" -+ -+ depends on GENTOO_LINUX && !ACPI_CUSTOM_METHOD && !COMPAT_BRK && !PROC_KCORE && !COMPAT_VDSO && !KEXEC && !HIBERNATION && !LEGACY_PTYS && !X86_X32 && !MODIFY_LDT_SYSCALL && GCC_PLUGINS && !IOMMU_DEFAULT_DMA_LAZY && !IOMMU_DEFAULT_PASSTHROUGH && IOMMU_DEFAULT_DMA_STRICT -+ -+ select BUG -+ select STRICT_KERNEL_RWX -+ select DEBUG_WX -+ select STACKPROTECTOR -+ select STACKPROTECTOR_STRONG -+ select STRICT_DEVMEM if DEVMEM=y -+ select IO_STRICT_DEVMEM if DEVMEM=y -+ select SYN_COOKIES -+ select DEBUG_CREDENTIALS -+ select DEBUG_NOTIFIERS -+ select DEBUG_LIST -+ select DEBUG_SG -+ select HARDENED_USERCOPY if HAVE_HARDENED_USERCOPY_ALLOCATOR=y -+ select KFENCE if HAVE_ARCH_KFENCE && (!SLAB || SLUB) -+ select RANDOMIZE_KSTACK_OFFSET_DEFAULT if HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET && (INIT_STACK_NONE || !CC_IS_CLANG || CLANG_VERSION>=140000) -+ select SCHED_CORE if SCHED_SMT -+ select BUG_ON_DATA_CORRUPTION -+ select SCHED_STACK_END_CHECK -+ select SECCOMP if HAVE_ARCH_SECCOMP -+ select SECCOMP_FILTER if HAVE_ARCH_SECCOMP_FILTER -+ select SECURITY_YAMA -+ select SLAB_FREELIST_RANDOM -+ select SLAB_FREELIST_HARDENED -+ select SHUFFLE_PAGE_ALLOCATOR -+ select SLUB_DEBUG -+ select PAGE_POISONING -+ select PAGE_POISONING_NO_SANITY -+ select PAGE_POISONING_ZERO -+ select INIT_ON_ALLOC_DEFAULT_ON -+ select INIT_ON_FREE_DEFAULT_ON -+ select REFCOUNT_FULL -+ select FORTIFY_SOURCE -+ select SECURITY_DMESG_RESTRICT -+ select PANIC_ON_OOPS -+ select GCC_PLUGIN_LATENT_ENTROPY -+ select GCC_PLUGIN_STRUCTLEAK -+ select GCC_PLUGIN_STRUCTLEAK_BYREF_ALL -+ select GCC_PLUGIN_RANDSTRUCT -+ select GCC_PLUGIN_RANDSTRUCT_PERFORMANCE -+ select ZERO_CALL_USED_REGS if CC_HAS_ZERO_CALL_USED_REGS -+ -+ help -+ Search for GENTOO_KERNEL_SELF_PROTECTION_{X86_64, ARM64, X86_32, ARM} for dependency -+ information on your specific architecture. Note 2: Please see the URL above for -+ numeric settings, e.g. 
CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 for X86_64 -+ -+config GENTOO_KERNEL_SELF_PROTECTION_X86_64 -+ bool "X86_64 KSPP Settings" if GENTOO_KERNEL_SELF_PROTECTION_COMMON -+ -+ depends on !X86_MSR && X86_64 && GENTOO_KERNEL_SELF_PROTECTION -+ default n -+ -+ select RANDOMIZE_BASE -+ select RANDOMIZE_MEMORY -+ select RELOCATABLE -+ select LEGACY_VSYSCALL_NONE -+ select PAGE_TABLE_ISOLATION -+ select GCC_PLUGIN_STACKLEAK -+ select VMAP_STACK -+ -+ -+config GENTOO_KERNEL_SELF_PROTECTION_ARM64 -+ bool "ARM64 KSPP Settings" -+ -+ depends on ARM64 -+ default n -+ -+ select RANDOMIZE_BASE -+ select RELOCATABLE -+ select ARM64_SW_TTBR0_PAN -+ select CONFIG_UNMAP_KERNEL_AT_EL0 -+ select GCC_PLUGIN_STACKLEAK -+ select VMAP_STACK -+ -+config GENTOO_KERNEL_SELF_PROTECTION_X86_32 -+ bool "X86_32 KSPP Settings" -+ -+ depends on !X86_MSR && !MODIFY_LDT_SYSCALL && !M486 && X86_32 -+ default n -+ -+ select HIGHMEM64G -+ select X86_PAE -+ select RANDOMIZE_BASE -+ select RELOCATABLE -+ select PAGE_TABLE_ISOLATION -+ -+config GENTOO_KERNEL_SELF_PROTECTION_ARM -+ bool "ARM KSPP Settings" -+ -+ depends on !OABI_COMPAT && ARM -+ default n -+ -+ select VMSPLIT_3G -+ select STRICT_MEMORY_RWX -+ select CPU_SW_DOMAIN_PAN -+ -+endif -+ -+config GENTOO_PRINT_FIRMWARE_INFO -+ bool "Print firmware information that the kernel attempts to load" -+ -+ depends on GENTOO_LINUX -+ default y -+ -+ help -+ Enable this option to print information about firmware that the kernel -+ is attempting to load. This information can be accessible via the -+ dmesg command-line utility -+ -+ See the settings that become available for more details and fine-tuning. -+ -+endmenu -diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig -index 9e921fc72..f29bc13fa 100644 ---- a/security/selinux/Kconfig -+++ b/security/selinux/Kconfig -@@ -26,6 +26,7 @@ config SECURITY_SELINUX_BOOTPARAM - config SECURITY_SELINUX_DISABLE - bool "NSA SELinux runtime disable" - depends on SECURITY_SELINUX -+ depends on !GENTOO_KERNEL_SELF_PROTECTION - select SECURITY_WRITABLE_HOOKS - default n - help --- -2.31.1 - -From bd3ff0b16792c18c0614c2b95e148943209f460a Mon Sep 17 00:00:00 2001 -From: Georgy Yakovlev -Date: Tue, 8 Jun 2021 13:59:57 -0700 -Subject: [PATCH 2/2] set DEFAULT_MMAP_MIN_ADDR by default - ---- - mm/Kconfig | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/mm/Kconfig b/mm/Kconfig -index 24c045b24..e13fc740c 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -321,6 +321,8 @@ config KSM - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" - depends on MMU -+ default 65536 if ( X86_64 || X86_32 || PPC64 || IA64 ) && GENTOO_KERNEL_SELF_PROTECTION -+ default 32768 if ( ARM64 || ARM ) && GENTOO_KERNEL_SELF_PROTECTION - default 4096 - help - This is the portion of low virtual memory which should be protected --- -2.31.1 -``` diff --git a/sys-kernel/pinephone-pro-sources/files/5010_enable-cpu-optimizations-universal.patch b/sys-kernel/pinephone-pro-sources/files/5010_enable-cpu-optimizations-universal.patch deleted file mode 100644 index b9c03cb..0000000 --- a/sys-kernel/pinephone-pro-sources/files/5010_enable-cpu-optimizations-universal.patch +++ /dev/null @@ -1,675 +0,0 @@ -From b5892719c43f739343c628e3d357471a3bdaa368 Mon Sep 17 00:00:00 2001 -From: graysky -Date: Tue, 15 Mar 2022 05:58:43 -0400 -Subject: [PATCH] more uarches for kernel 5.17+ -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -FEATURES -This patch adds additional CPU options to the Linux kernel 
accessible under: - Processor type and features ---> - Processor family ---> - -With the release of gcc 11.1 and clang 12.0, several generic 64-bit levels are -offered which are good for supported Intel or AMD CPUs: -• x86-64-v2 -• x86-64-v3 -• x86-64-v4 - -Users of glibc 2.33 and above can see which level is supported by current -hardware by running: - /lib/ld-linux-x86-64.so.2 --help | grep supported - -Alternatively, compare the flags from /proc/cpuinfo to this list.[1] - -CPU-specific microarchitectures include: -• AMD Improved K8-family -• AMD K10-family -• AMD Family 10h (Barcelona) -• AMD Family 14h (Bobcat) -• AMD Family 16h (Jaguar) -• AMD Family 15h (Bulldozer) -• AMD Family 15h (Piledriver) -• AMD Family 15h (Steamroller) -• AMD Family 15h (Excavator) -• AMD Family 17h (Zen) -• AMD Family 17h (Zen 2) -• AMD Family 19h (Zen 3)† -• Intel Silvermont low-power processors -• Intel Goldmont low-power processors (Apollo Lake and Denverton) -• Intel Goldmont Plus low-power processors (Gemini Lake) -• Intel 1st Gen Core i3/i5/i7 (Nehalem) -• Intel 1.5 Gen Core i3/i5/i7 (Westmere) -• Intel 2nd Gen Core i3/i5/i7 (Sandybridge) -• Intel 3rd Gen Core i3/i5/i7 (Ivybridge) -• Intel 4th Gen Core i3/i5/i7 (Haswell) -• Intel 5th Gen Core i3/i5/i7 (Broadwell) -• Intel 6th Gen Core i3/i5/i7 (Skylake) -• Intel 6th Gen Core i7/i9 (Skylake X) -• Intel 8th Gen Core i3/i5/i7 (Cannon Lake) -• Intel 10th Gen Core i7/i9 (Ice Lake) -• Intel Xeon (Cascade Lake) -• Intel Xeon (Cooper Lake)* -• Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* -• Intel 3rd Gen 10nm++ Xeon (Sapphire Rapids)‡ -• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)‡ -• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)‡ - -Notes: If not otherwise noted, gcc >=9.1 is required for support. - *Requires gcc >=10.1 or clang >=10.0 - †Required gcc >=10.3 or clang >=12.0 - ‡Required gcc >=11.1 or clang >=12.0 - -It also offers to compile passing the 'native' option which, "selects the CPU -to generate code for at compilation time by determining the processor type of -the compiling machine. Using -march=native enables all instruction subsets -supported by the local machine and will produce code optimized for the local -machine under the constraints of the selected instruction set."[2] - -Users of Intel CPUs should select the 'Intel-Native' option and users of AMD -CPUs should select the 'AMD-Native' option. - -MINOR NOTES RELATING TO INTEL ATOM PROCESSORS -This patch also changes -march=atom to -march=bonnell in accordance with the -gcc v4.9 changes. Upstream is using the deprecated -match=atom flags when I -believe it should use the newer -march=bonnell flag for atom processors.[3] - -It is not recommended to compile on Atom-CPUs with the 'native' option.[4] The -recommendation is to use the 'atom' option instead. - -BENEFITS -Small but real speed increases are measurable using a make endpoint comparing -a generic kernel to one built with one of the respective microarchs. - -See the following experimental evidence supporting this statement: -https://github.com/graysky2/kernel_gcc_patch - -REQUIREMENTS -linux version 5.17+ -gcc version >=9.0 or clang version >=9.0 - -ACKNOWLEDGMENTS -This patch builds on the seminal work by Jeroen.[5] - -REFERENCES -1. https://gitlab.com/x86-psABIs/x86-64-ABI/-/commit/77566eb03bc6a326811cb7e9 -2. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options -3. https://bugzilla.kernel.org/show_bug.cgi?id=77461 -4. https://github.com/graysky2/kernel_gcc_patch/issues/15 -5. 
http://www.linuxforge.net/docs/linux/linux-gcc.php - -Signed-off-by: graysky ---- - arch/x86/Kconfig.cpu | 332 ++++++++++++++++++++++++++++++-- - arch/x86/Makefile | 40 +++- - arch/x86/include/asm/vermagic.h | 66 +++++++ - 3 files changed, 424 insertions(+), 14 deletions(-) - -diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 542377cd419d..22b919cdb6d1 100644 ---- a/arch/x86/Kconfig.cpu -+++ b/arch/x86/Kconfig.cpu -@@ -157,7 +157,7 @@ config MPENTIUM4 - - - config MK6 -- bool "K6/K6-II/K6-III" -+ bool "AMD K6/K6-II/K6-III" - depends on X86_32 - help - Select this for an AMD K6-family processor. Enables use of -@@ -165,7 +165,7 @@ config MK6 - flags to GCC. - - config MK7 -- bool "Athlon/Duron/K7" -+ bool "AMD Athlon/Duron/K7" - depends on X86_32 - help - Select this for an AMD Athlon K7-family processor. Enables use of -@@ -173,12 +173,98 @@ config MK7 - flags to GCC. - - config MK8 -- bool "Opteron/Athlon64/Hammer/K8" -+ bool "AMD Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. - Enables use of some extended instructions, and passes appropriate - optimization flags to GCC. - -+config MK8SSE3 -+ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" -+ help -+ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. -+ Enables use of some extended instructions, and passes appropriate -+ optimization flags to GCC. -+ -+config MK10 -+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" -+ help -+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, -+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. -+ Enables use of some extended instructions, and passes appropriate -+ optimization flags to GCC. -+ -+config MBARCELONA -+ bool "AMD Barcelona" -+ help -+ Select this for AMD Family 10h Barcelona processors. -+ -+ Enables -march=barcelona -+ -+config MBOBCAT -+ bool "AMD Bobcat" -+ help -+ Select this for AMD Family 14h Bobcat processors. -+ -+ Enables -march=btver1 -+ -+config MJAGUAR -+ bool "AMD Jaguar" -+ help -+ Select this for AMD Family 16h Jaguar processors. -+ -+ Enables -march=btver2 -+ -+config MBULLDOZER -+ bool "AMD Bulldozer" -+ help -+ Select this for AMD Family 15h Bulldozer processors. -+ -+ Enables -march=bdver1 -+ -+config MPILEDRIVER -+ bool "AMD Piledriver" -+ help -+ Select this for AMD Family 15h Piledriver processors. -+ -+ Enables -march=bdver2 -+ -+config MSTEAMROLLER -+ bool "AMD Steamroller" -+ help -+ Select this for AMD Family 15h Steamroller processors. -+ -+ Enables -march=bdver3 -+ -+config MEXCAVATOR -+ bool "AMD Excavator" -+ help -+ Select this for AMD Family 15h Excavator processors. -+ -+ Enables -march=bdver4 -+ -+config MZEN -+ bool "AMD Zen" -+ help -+ Select this for AMD Family 17h Zen processors. -+ -+ Enables -march=znver1 -+ -+config MZEN2 -+ bool "AMD Zen 2" -+ help -+ Select this for AMD Family 17h Zen 2 processors. -+ -+ Enables -march=znver2 -+ -+config MZEN3 -+ bool "AMD Zen 3" -+ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ help -+ Select this for AMD Family 19h Zen 3 processors. -+ -+ Enables -march=znver3 -+ - config MCRUSOE - bool "Crusoe" - depends on X86_32 -@@ -270,7 +356,7 @@ config MPSC - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. - - config MCORE2 -- bool "Core 2/newer Xeon" -+ bool "Intel Core 2" - help - - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and -@@ -278,6 +364,8 @@ config MCORE2 - family in /proc/cpuinfo. 
Newer ones have 6 and older ones 15 - (not a typo) - -+ Enables -march=core2 -+ - config MATOM - bool "Intel Atom" - help -@@ -287,6 +375,182 @@ config MATOM - accordingly optimized code. Use a recent GCC with specific Atom - support in order to fully benefit from selecting this option. - -+config MNEHALEM -+ bool "Intel Nehalem" -+ select X86_P6_NOP -+ help -+ -+ Select this for 1st Gen Core processors in the Nehalem family. -+ -+ Enables -march=nehalem -+ -+config MWESTMERE -+ bool "Intel Westmere" -+ select X86_P6_NOP -+ help -+ -+ Select this for the Intel Westmere formerly Nehalem-C family. -+ -+ Enables -march=westmere -+ -+config MSILVERMONT -+ bool "Intel Silvermont" -+ select X86_P6_NOP -+ help -+ -+ Select this for the Intel Silvermont platform. -+ -+ Enables -march=silvermont -+ -+config MGOLDMONT -+ bool "Intel Goldmont" -+ select X86_P6_NOP -+ help -+ -+ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. -+ -+ Enables -march=goldmont -+ -+config MGOLDMONTPLUS -+ bool "Intel Goldmont Plus" -+ select X86_P6_NOP -+ help -+ -+ Select this for the Intel Goldmont Plus platform including Gemini Lake. -+ -+ Enables -march=goldmont-plus -+ -+config MSANDYBRIDGE -+ bool "Intel Sandy Bridge" -+ select X86_P6_NOP -+ help -+ -+ Select this for 2nd Gen Core processors in the Sandy Bridge family. -+ -+ Enables -march=sandybridge -+ -+config MIVYBRIDGE -+ bool "Intel Ivy Bridge" -+ select X86_P6_NOP -+ help -+ -+ Select this for 3rd Gen Core processors in the Ivy Bridge family. -+ -+ Enables -march=ivybridge -+ -+config MHASWELL -+ bool "Intel Haswell" -+ select X86_P6_NOP -+ help -+ -+ Select this for 4th Gen Core processors in the Haswell family. -+ -+ Enables -march=haswell -+ -+config MBROADWELL -+ bool "Intel Broadwell" -+ select X86_P6_NOP -+ help -+ -+ Select this for 5th Gen Core processors in the Broadwell family. -+ -+ Enables -march=broadwell -+ -+config MSKYLAKE -+ bool "Intel Skylake" -+ select X86_P6_NOP -+ help -+ -+ Select this for 6th Gen Core processors in the Skylake family. -+ -+ Enables -march=skylake -+ -+config MSKYLAKEX -+ bool "Intel Skylake X" -+ select X86_P6_NOP -+ help -+ -+ Select this for 6th Gen Core processors in the Skylake X family. -+ -+ Enables -march=skylake-avx512 -+ -+config MCANNONLAKE -+ bool "Intel Cannon Lake" -+ select X86_P6_NOP -+ help -+ -+ Select this for 8th Gen Core processors -+ -+ Enables -march=cannonlake -+ -+config MICELAKE -+ bool "Intel Ice Lake" -+ select X86_P6_NOP -+ help -+ -+ Select this for 10th Gen Core processors in the Ice Lake family. -+ -+ Enables -march=icelake-client -+ -+config MCASCADELAKE -+ bool "Intel Cascade Lake" -+ select X86_P6_NOP -+ help -+ -+ Select this for Xeon processors in the Cascade Lake family. -+ -+ Enables -march=cascadelake -+ -+config MCOOPERLAKE -+ bool "Intel Cooper Lake" -+ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) -+ select X86_P6_NOP -+ help -+ -+ Select this for Xeon processors in the Cooper Lake family. -+ -+ Enables -march=cooperlake -+ -+config MTIGERLAKE -+ bool "Intel Tiger Lake" -+ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) -+ select X86_P6_NOP -+ help -+ -+ Select this for third-generation 10 nm process processors in the Tiger Lake family. 
-+ -+ Enables -march=tigerlake -+ -+config MSAPPHIRERAPIDS -+ bool "Intel Sapphire Rapids" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP -+ help -+ -+ Select this for third-generation 10 nm process processors in the Sapphire Rapids family. -+ -+ Enables -march=sapphirerapids -+ -+config MROCKETLAKE -+ bool "Intel Rocket Lake" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP -+ help -+ -+ Select this for eleventh-generation processors in the Rocket Lake family. -+ -+ Enables -march=rocketlake -+ -+config MALDERLAKE -+ bool "Intel Alder Lake" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP -+ help -+ -+ Select this for twelfth-generation processors in the Alder Lake family. -+ -+ Enables -march=alderlake -+ - config GENERIC_CPU - bool "Generic-x86-64" - depends on X86_64 -@@ -294,6 +558,50 @@ config GENERIC_CPU - Generic x86-64 CPU. - Run equally well on all x86-64 CPUs. - -+config GENERIC_CPU2 -+ bool "Generic-x86-64-v2" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64 CPU. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v2. -+ -+config GENERIC_CPU3 -+ bool "Generic-x86-64-v3" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64-v3 CPU with v3 instructions. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v3. -+ -+config GENERIC_CPU4 -+ bool "Generic-x86-64-v4" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64 CPU with v4 instructions. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v4. -+ -+config MNATIVE_INTEL -+ bool "Intel-Native optimizations autodetected by the compiler" -+ help -+ -+ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects -+ the optimum settings to use based on your processor. Do NOT use this -+ for AMD CPUs. Intel Only! -+ -+ Enables -march=native -+ -+config MNATIVE_AMD -+ bool "AMD-Native optimizations autodetected by the compiler" -+ help -+ -+ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects -+ the optimum settings to use based on your processor. Do NOT use this -+ for Intel CPUs. AMD Only! 
-+ -+ Enables -march=native -+ - endchoice - - config X86_GENERIC -@@ -318,7 +626,7 @@ config X86_INTERNODE_CACHE_SHIFT - config X86_L1_CACHE_SHIFT - int - default "7" if MPENTIUM4 || MPSC -- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU -+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 || GENERIC_CPU4 - default "4" if MELAN || M486SX || M486 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - -@@ -336,11 +644,11 @@ config X86_ALIGNMENT_16 - - config X86_INTEL_USERCOPY - def_bool y -- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL - - config X86_USE_PPRO_CHECKSUM - def_bool y -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD - - # - # P6_NOPs are a relatively minor optimization that require a family >= -@@ -356,26 +664,26 @@ config X86_USE_PPRO_CHECKSUM - config X86_P6_NOP - def_bool y - depends on X86_64 -- depends on (MCORE2 || MPENTIUM4 || MPSC) -+ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL) - - config X86_TSC - def_bool y -- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 -+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || 
MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD) || X86_64 - - config X86_CMPXCHG64 - def_bool y -- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 -+ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD - - # this should be set for all -march=.. options where the compiler - # generates cmov. - config X86_CMOV - def_bool y -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) -+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD) - - config X86_MINIMUM_CPU_FAMILY - int - default "64" if X86_64 -- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) -+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD) - default "5" if X86_32 && X86_CMPXCHG64 - default "4" - -diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index e84cdd409b64..7d3bbf060079 100644 ---- a/arch/x86/Makefile -+++ b/arch/x86/Makefile -@@ -131,8 +131,44 @@ else - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) - cflags-$(CONFIG_MK8) 
+= -march=k8 - cflags-$(CONFIG_MPSC) += -march=nocona -- cflags-$(CONFIG_MCORE2) += -march=core2 -- cflags-$(CONFIG_MATOM) += -march=atom -+ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 -+ cflags-$(CONFIG_MK10) += -march=amdfam10 -+ cflags-$(CONFIG_MBARCELONA) += -march=barcelona -+ cflags-$(CONFIG_MBOBCAT) += -march=btver1 -+ cflags-$(CONFIG_MJAGUAR) += -march=btver2 -+ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 -+ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm -+ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm -+ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm -+ cflags-$(CONFIG_MZEN) += -march=znver1 -+ cflags-$(CONFIG_MZEN2) += -march=znver2 -+ cflags-$(CONFIG_MZEN3) += -march=znver3 -+ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native -+ cflags-$(CONFIG_MNATIVE_AMD) += -march=native -+ cflags-$(CONFIG_MATOM) += -march=bonnell -+ cflags-$(CONFIG_MCORE2) += -march=core2 -+ cflags-$(CONFIG_MNEHALEM) += -march=nehalem -+ cflags-$(CONFIG_MWESTMERE) += -march=westmere -+ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont -+ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont -+ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus -+ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge -+ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge -+ cflags-$(CONFIG_MHASWELL) += -march=haswell -+ cflags-$(CONFIG_MBROADWELL) += -march=broadwell -+ cflags-$(CONFIG_MSKYLAKE) += -march=skylake -+ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 -+ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake -+ cflags-$(CONFIG_MICELAKE) += -march=icelake-client -+ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake -+ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake -+ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake -+ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids -+ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake -+ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake -+ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 -+ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 -+ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic - KBUILD_CFLAGS += $(cflags-y) - -diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h -index 75884d2cdec3..4e6a08d4c7e5 100644 ---- a/arch/x86/include/asm/vermagic.h -+++ b/arch/x86/include/asm/vermagic.h -@@ -17,6 +17,48 @@ - #define MODULE_PROC_FAMILY "586MMX " - #elif defined CONFIG_MCORE2 - #define MODULE_PROC_FAMILY "CORE2 " -+#elif defined CONFIG_MNATIVE_INTEL -+#define MODULE_PROC_FAMILY "NATIVE_INTEL " -+#elif defined CONFIG_MNATIVE_AMD -+#define MODULE_PROC_FAMILY "NATIVE_AMD " -+#elif defined CONFIG_MNEHALEM -+#define MODULE_PROC_FAMILY "NEHALEM " -+#elif defined CONFIG_MWESTMERE -+#define MODULE_PROC_FAMILY "WESTMERE " -+#elif defined CONFIG_MSILVERMONT -+#define MODULE_PROC_FAMILY "SILVERMONT " -+#elif defined CONFIG_MGOLDMONT -+#define MODULE_PROC_FAMILY "GOLDMONT " -+#elif defined CONFIG_MGOLDMONTPLUS -+#define MODULE_PROC_FAMILY "GOLDMONTPLUS " -+#elif defined CONFIG_MSANDYBRIDGE -+#define MODULE_PROC_FAMILY "SANDYBRIDGE " -+#elif defined CONFIG_MIVYBRIDGE -+#define MODULE_PROC_FAMILY "IVYBRIDGE " -+#elif defined CONFIG_MHASWELL -+#define MODULE_PROC_FAMILY "HASWELL " -+#elif defined CONFIG_MBROADWELL -+#define MODULE_PROC_FAMILY "BROADWELL " -+#elif defined CONFIG_MSKYLAKE -+#define MODULE_PROC_FAMILY "SKYLAKE " -+#elif defined CONFIG_MSKYLAKEX -+#define MODULE_PROC_FAMILY "SKYLAKEX " -+#elif defined CONFIG_MCANNONLAKE -+#define MODULE_PROC_FAMILY "CANNONLAKE " -+#elif defined 
CONFIG_MICELAKE -+#define MODULE_PROC_FAMILY "ICELAKE " -+#elif defined CONFIG_MCASCADELAKE -+#define MODULE_PROC_FAMILY "CASCADELAKE " -+#elif defined CONFIG_MCOOPERLAKE -+#define MODULE_PROC_FAMILY "COOPERLAKE " -+#elif defined CONFIG_MTIGERLAKE -+#define MODULE_PROC_FAMILY "TIGERLAKE " -+#elif defined CONFIG_MSAPPHIRERAPIDS -+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " -+#elif defined CONFIG_ROCKETLAKE -+#define MODULE_PROC_FAMILY "ROCKETLAKE " -+#elif defined CONFIG_MALDERLAKE -+#define MODULE_PROC_FAMILY "ALDERLAKE " - #elif defined CONFIG_MATOM - #define MODULE_PROC_FAMILY "ATOM " - #elif defined CONFIG_M686 -@@ -35,6 +77,30 @@ - #define MODULE_PROC_FAMILY "K7 " - #elif defined CONFIG_MK8 - #define MODULE_PROC_FAMILY "K8 " -+#elif defined CONFIG_MK8SSE3 -+#define MODULE_PROC_FAMILY "K8SSE3 " -+#elif defined CONFIG_MK10 -+#define MODULE_PROC_FAMILY "K10 " -+#elif defined CONFIG_MBARCELONA -+#define MODULE_PROC_FAMILY "BARCELONA " -+#elif defined CONFIG_MBOBCAT -+#define MODULE_PROC_FAMILY "BOBCAT " -+#elif defined CONFIG_MBULLDOZER -+#define MODULE_PROC_FAMILY "BULLDOZER " -+#elif defined CONFIG_MPILEDRIVER -+#define MODULE_PROC_FAMILY "PILEDRIVER " -+#elif defined CONFIG_MSTEAMROLLER -+#define MODULE_PROC_FAMILY "STEAMROLLER " -+#elif defined CONFIG_MJAGUAR -+#define MODULE_PROC_FAMILY "JAGUAR " -+#elif defined CONFIG_MEXCAVATOR -+#define MODULE_PROC_FAMILY "EXCAVATOR " -+#elif defined CONFIG_MZEN -+#define MODULE_PROC_FAMILY "ZEN " -+#elif defined CONFIG_MZEN2 -+#define MODULE_PROC_FAMILY "ZEN2 " -+#elif defined CONFIG_MZEN3 -+#define MODULE_PROC_FAMILY "ZEN3 " - #elif defined CONFIG_MELAN - #define MODULE_PROC_FAMILY "ELAN " - #elif defined CONFIG_MCRUSOE --- -2.35.1 - diff --git a/sys-kernel/pinephone-pro-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch b/sys-kernel/pinephone-pro-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch deleted file mode 100644 index 610cfe8..0000000 --- a/sys-kernel/pinephone-pro-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch +++ /dev/null @@ -1,9956 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index cc3ea8febc62..ab4c5a35b999 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -5299,6 +5299,12 @@ - sa1100ir [NET] - See drivers/net/irda/sa1100_ir.c. - -+ sched_timeslice= -+ [KNL] Time slice in ms for Project C BMQ/PDS scheduler. -+ Format: integer 2, 4 -+ Default: 4 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_verbose [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index ddccd1077462..e24781970a3d 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1524,3 +1524,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ/PDS CPU scheduler only. This determines what type of yield calls -+to sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. 
-diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. -+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. 
The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. -diff --git a/fs/proc/base.c b/fs/proc/base.c -index 8dfa36a99c74..46397c606e01 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/sched.h b/include/linux/sched.h -index c46f3a63b758..7c65e6317d97 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -751,8 +751,14 @@ struct task_struct { - unsigned int ptrace; - - #ifdef CONFIG_SMP -- int on_cpu; - struct __call_single_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) -+ int on_cpu; -+#endif -+ -+#ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -766,6 +772,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ - #endif - int on_rq; - -@@ -774,6 +781,20 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+ int sq_idx; -+ struct list_head sq_node; -+#ifdef CONFIG_SCHED_BMQ -+ int boost_prio; -+#endif /* CONFIG_SCHED_BMQ */ -+#ifdef CONFIG_SCHED_PDS -+ u64 deadline; -+#endif /* CONFIG_SCHED_PDS */ -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ - struct sched_entity se; - struct sched_rt_entity rt; - struct sched_dl_entity dl; -@@ -784,6 +805,7 @@ struct task_struct { - unsigned long 
core_cookie; - unsigned int core_occupation; - #endif -+#endif /* !CONFIG_SCHED_ALT */ - - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; -@@ -1517,6 +1539,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_ALT -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 7c83d4d5a971..fa30f98cb2be 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,24 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_ALT -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define __tsk_deadline(p) ((((u64) ((p)->prio))<<56) | (p)->deadline) -+#endif -+ -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -21,6 +40,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_ALT */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index ab83d85e1183..6af9ae681116 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -18,6 +18,32 @@ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+#ifdef CONFIG_SCHED_ALT -+ -+/* Undefine MAX_PRIO and DEFAULT_PRIO */ -+#undef MAX_PRIO -+#undef DEFAULT_PRIO -+ -+/* +/- priority levels from the base priority */ -+#ifdef CONFIG_SCHED_BMQ -+#define MAX_PRIORITY_ADJ (7) -+ -+#define MIN_NORMAL_PRIO (MAX_RT_PRIO) -+#define MAX_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH) -+#define DEFAULT_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH / 2) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define MAX_PRIORITY_ADJ (0) -+ -+#define MIN_NORMAL_PRIO (128) -+#define NORMAL_PRIO_NUM (64) -+#define MAX_PRIO (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM) -+#define DEFAULT_PRIO (MAX_PRIO - NICE_WIDTH / 2) -+#endif -+ -+#endif /* CONFIG_SCHED_ALT */ -+ - /* - * Convert user-nice values [ -20 ... 0 ... 
19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..0a7565d0d3cf 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_ALT - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h -index 56cffe42abbc..e020fc572b22 100644 ---- a/include/linux/sched/topology.h -+++ b/include/linux/sched/topology.h -@@ -233,7 +233,8 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) - - #endif /* !CONFIG_SMP */ - --#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && \ -+ !defined(CONFIG_SCHED_ALT) - extern void rebuild_sched_domains_energy(void); - #else - static inline void rebuild_sched_domains_energy(void) -diff --git a/init/Kconfig b/init/Kconfig -index c7900e8975f1..d2b593e3807d 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -812,6 +812,7 @@ menu "Scheduler features" - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_ALT - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -858,6 +859,35 @@ config UCLAMP_BUCKETS_COUNT - - If in doubt, use the default value. - -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default y -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+config SCHED_PDS -+ bool "PDS CPU scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler. -+ -+endchoice -+ -+endif -+ - endmenu - - # -@@ -911,6 +941,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION && !PREEMPT_RT -+ depends on !SCHED_ALT - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -1003,6 +1034,7 @@ config FAIR_GROUP_SCHED - depends on CGROUP_SCHED - default CGROUP_SCHED - -+if !SCHED_ALT - config CFS_BANDWIDTH - bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" - depends on FAIR_GROUP_SCHED -@@ -1025,6 +1057,7 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. 
- -+endif #!SCHED_ALT - endif #CGROUP_SCHED - - config UCLAMP_TASK_GROUP -@@ -1268,6 +1301,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_ALT - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index 73cc8f03511a..2d0bad762895 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -75,9 +75,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .user_cpus_ptr = NULL, -@@ -88,6 +94,17 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+ .sq_node = LIST_HEAD_INIT(init_task.sq_node), -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .sq_idx = 15, -+#endif -+#ifdef CONFIG_SCHED_PDS -+ .deadline = 0, -+#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -95,6 +112,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..41654679b1b2 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -117,7 +117,7 @@ config PREEMPT_DYNAMIC - - config SCHED_CORE - bool "Core Scheduling for SMT" -- depends on SCHED_SMT -+ depends on SCHED_SMT && !SCHED_ALT - help - This option permits Core Scheduling, a means of coordinated task - selection across SMT siblings. When enabled -- see -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 71a418858a5e..7e3016873db1 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -704,7 +704,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
-@@ -1100,7 +1100,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 164ed9ef77a3..c974a84b056f 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -150,7 +150,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 64c938ce36fe..a353f7ef5392 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -124,7 +124,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -145,7 +145,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index 7779ee8abc2a..5b9893cdfb1b 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -300,21 +300,25 @@ static __always_inline void - waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) - { - waiter->prio = __waiter_prio(task); -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - } - - /* - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = __tsk_deadline(p) } - - static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline < right->deadline); -+#else - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -323,16 +327,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; -+#endif - } - - static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline == right->deadline); -+#else - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. 
-@@ -341,8 +351,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; -+#endif - } - - static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 976092b7bd45..31d587c16ec1 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -28,7 +28,12 @@ endif - # These compilation units have roughly the same size and complexity - so their - # build parallelizes well and finishes roughly at once: - # -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o -+obj-$(CONFIG_SCHED_DEBUG) += alt_debug.o -+else - obj-y += core.o - obj-y += fair.o -+endif - obj-y += build_policy.o - obj-y += build_utility.o -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c -new file mode 100644 -index 000000000000..d0ab41c4d9ad ---- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,7807 @@ -+/* -+ * kernel/sched/alt_core.c -+ * -+ * Core alternative kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#define CREATE_TRACE_POINTS -+#include -+#undef CREATE_TRACE_POINTS -+ -+#include "sched.h" -+ -+#include "pelt.h" -+ -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+/* -+ * Export tracepoints that act as a bare tracehook (ie: have no trace event -+ * associated with them) to allow external modules to probe them. -+ */ -+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); -+ -+#ifdef CONFIG_SCHED_DEBUG -+#define sched_feat(x) (1) -+/* -+ * Print a warning if need_resched is set for the given duration (if -+ * LATENCY_WARN is enabled). -+ * -+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown -+ * per boot. 
-+ */ -+__read_mostly int sysctl_resched_latency_warn_ms = 100; -+__read_mostly int sysctl_resched_latency_warn_once = 1; -+#else -+#define sched_feat(x) (0) -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+#define ALT_SCHED_VERSION "v5.19-r0" -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 << 20); -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq, int idx); -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds.h" -+#endif -+ -+static int __init sched_timeslice(char *str) -+{ -+ int timeslice_ms; -+ -+ get_option(&str, ×lice_ms); -+ if (2 != timeslice_ms) -+ timeslice_ms = 4; -+ sched_timeslice_ns = timeslice_ms << 20; -+ sched_timeslice_imp(timeslice_ms); -+ -+ return 0; -+} -+early_param("sched_timeslice", sched_timeslice); -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 << 10) -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task. (default) -+ * 2: Set rq skip task. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_topo_end_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). 
-+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+#endif -+static cpumask_t sched_rq_watermark[SCHED_QUEUE_BITS] ____cacheline_aligned_in_smp; -+ -+/* sched_queue related functions */ -+static inline void sched_queue_init(struct sched_queue *q) -+{ -+ int i; -+ -+ bitmap_zero(q->bitmap, SCHED_QUEUE_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+/* -+ * Init idle task and put into queue structure of rq -+ * IMPORTANT: may be called multiple times for a single cpu -+ */ -+static inline void sched_queue_init_idle(struct sched_queue *q, -+ struct task_struct *idle) -+{ -+ idle->sq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->sq_idx]); -+ list_add(&idle->sq_node, &q->heads[idle->sq_idx]); -+} -+ -+/* water mark related functions */ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = last_wm; i > watermark; i--) -+ cpumask_clear_cpu(cpu, sched_rq_watermark + SCHED_QUEUE_BITS - i); -+#ifdef CONFIG_SCHED_SMT -+ if (static_branch_likely(&sched_smt_present) && -+ IDLE_TASK_SCHED_PRIO == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = watermark; i > last_wm; i--) -+ cpumask_set_cpu(cpu, sched_rq_watermark + SCHED_QUEUE_BITS - i); -+#ifdef CONFIG_SCHED_SMT -+ if (static_branch_likely(&sched_smt_present) && -+ IDLE_TASK_SCHED_PRIO == watermark) { -+ cpumask_t tmp; -+ -+ cpumask_and(&tmp, cpu_smt_mask(cpu), sched_rq_watermark); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+ } -+#endif -+} -+ -+/* -+ * This routine assume that the idle task always in queue -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS); -+ const struct list_head *head = &rq->queue.heads[sched_prio2idx(idx, rq)]; -+ -+ return list_first_entry(head, struct task_struct, sq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->sq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->sq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_QUEUE_BITS, -+ sched_idx2prio(idx, rq) + 1); -+ head = &rq->queue.heads[sched_prio2idx(idx, rq)]; -+ -+ return list_first_entry(head, struct task_struct, sq_node); -+ } -+ -+ return list_next_entry(p, sq_node); -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Serialization rules: -+ * -+ * Lock order: -+ * -+ * p->pi_lock -+ * 
rq->lock -+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) -+ * -+ * rq1->lock -+ * rq2->lock where: rq1 < rq2 -+ * -+ * Regular state: -+ * -+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the -+ * local CPU's rq->lock, it optionally removes the task from the runqueue and -+ * always looks at the local rq data structures to find the most eligible task -+ * to run next. -+ * -+ * Task enqueue is also under rq->lock, possibly taken from another CPU. -+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to -+ * the local CPU to avoid bouncing the runqueue state around [ see -+ * ttwu_queue_wakelist() ] -+ * -+ * Task wakeup, specifically wakeups that involve migration, are horribly -+ * complicated to avoid having to take two rq->locks. -+ * -+ * Special state: -+ * -+ * System-calls and anything external will use task_rq_lock() which acquires -+ * both p->pi_lock and rq->lock. As a consequence the state they change is -+ * stable while holding either lock: -+ * -+ * - sched_setaffinity()/ -+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed -+ * - set_user_nice(): p->se.load, p->*prio -+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, -+ * p->se.load, p->rt_priority, -+ * p->dl.dl_{runtime, deadline, period, flags, bw, density} -+ * - sched_setnuma(): p->numa_preferred_nid -+ * - sched_move_task()/ -+ * cpu_cgroup_fork(): p->sched_task_group -+ * - uclamp_update_active() p->uclamp* -+ * -+ * p->state <- TASK_*: -+ * -+ * is changed locklessly using set_current_state(), __set_current_state() or -+ * set_special_state(), see their respective comments, or by -+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against -+ * concurrent self. -+ * -+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: -+ * -+ * is set by activate_task() and cleared by deactivate_task(), under -+ * rq->lock. Non-zero indicates the task is runnable, the special -+ * ON_RQ_MIGRATING state is used for migration without holding both -+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). -+ * -+ * p->on_cpu <- { 0, 1 }: -+ * -+ * is set by prepare_task() and cleared by finish_task() such that it will be -+ * set before p is scheduled-in and cleared after p is scheduled-out, both -+ * under rq->lock. Non-zero indicates the task is running on its CPU. -+ * -+ * [ The astute reader will observe that it is possible for two tasks on one -+ * CPU to have ->on_cpu = 1 at the same time. ] -+ * -+ * task_cpu(p): is changed by set_task_cpu(), the rules are: -+ * -+ * - Don't call set_task_cpu() on a blocked task: -+ * -+ * We don't care what CPU we're not running on, this simplifies hotplug, -+ * the CPU assignment of blocked tasks isn't required to be valid. -+ * -+ * - for try_to_wake_up(), called under p->pi_lock: -+ * -+ * This allows try_to_wake_up() to only take one rq->lock, see its comment. 
-+ * -+ * - for migration called under rq->lock: -+ * [ see task_on_rq_migrating() in task_rq_lock() ] -+ * -+ * o move_queued_task() -+ * o detach_task() -+ * -+ * - for migration called under double_rq_lock(): -+ * -+ * o __migrate_swap_task() -+ * o push_rt_task() / pull_rt_task() -+ * o push_dl_task() / pull_dl_task() -+ * o dl_task_offline_migration() -+ * -+ */ -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+void raw_spin_rq_lock_nested(struct rq *rq, int subclass) -+{ -+ raw_spinlock_t *lock; -+ -+ /* Matches synchronize_rcu() in __sched_core_enable() */ -+ preempt_disable(); -+ -+ for (;;) { -+ lock = __rq_lockp(rq); -+ raw_spin_lock_nested(lock, subclass); -+ if (likely(lock == __rq_lockp(rq))) { -+ /* preempt_count *MUST* be > 1 */ -+ preempt_enable_no_resched(); -+ return; -+ } -+ raw_spin_unlock(lock); -+ } -+} -+ -+void raw_spin_rq_unlock(struct rq *rq) -+{ -+ raw_spin_unlock(rq_lockp(rq)); -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_time_edge(rq); -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * RQ Load update routine -+ */ -+#define RQ_LOAD_HISTORY_BITS (sizeof(s32) * 8ULL) -+#define RQ_UTIL_SHIFT (8) -+#define RQ_LOAD_HISTORY_TO_UTIL(l) (((l) >> (RQ_LOAD_HISTORY_BITS - 1 - RQ_UTIL_SHIFT)) & 0xff) -+ -+#define LOAD_BLOCK(t) ((t) >> 17) -+#define LOAD_HALF_BLOCK(t) ((t) >> 16) -+#define BLOCK_MASK(t) ((t) & ((0x01 << 18) - 1)) -+#define LOAD_BLOCK_BIT(b) (1UL << (RQ_LOAD_HISTORY_BITS - 1 - (b))) -+#define CURRENT_LOAD_BIT LOAD_BLOCK_BIT(0) -+ -+static inline void rq_load_update(struct rq *rq) -+{ -+ u64 time = rq->clock; -+ u64 delta = min(LOAD_BLOCK(time) - LOAD_BLOCK(rq->load_stamp), -+ RQ_LOAD_HISTORY_BITS - 1); -+ u64 prev = !!(rq->load_history & CURRENT_LOAD_BIT); -+ u64 curr = !!rq->nr_running; -+ -+ if (delta) { -+ rq->load_history = rq->load_history >> delta; -+ -+ if (delta < RQ_UTIL_SHIFT) { -+ rq->load_block += (~BLOCK_MASK(rq->load_stamp)) * prev; -+ if (!!LOAD_HALF_BLOCK(rq->load_block) ^ curr) -+ rq->load_history ^= LOAD_BLOCK_BIT(delta); -+ } -+ -+ rq->load_block = BLOCK_MASK(time) * prev; -+ } else { -+ rq->load_block += (time - rq->load_stamp) * prev; -+ } -+ if (prev ^ curr) -+ rq->load_history ^= CURRENT_LOAD_BIT; -+ rq->load_stamp = time; -+} -+ -+unsigned long rq_load_util(struct rq *rq, unsigned long max) -+{ -+ return RQ_LOAD_HISTORY_TO_UTIL(rq->load_history) * (max >> RQ_UTIL_SHIFT); -+} -+ -+#ifdef CONFIG_SMP -+unsigned long sched_cpu_util(int cpu, unsigned long max) -+{ -+ return rq_load_util(cpu_rq(cpu), max); -+} -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_FREQ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. -+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. 
-+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+#ifdef CONFIG_SMP -+ rq_load_update(rq); -+#endif -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+#ifdef CONFIG_SMP -+ rq_load_update(rq); -+#endif -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. -+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+bool sched_task_on_rq(struct task_struct *p) -+{ -+ return task_on_rq_queued(p); -+} -+ -+unsigned long get_wchan(struct task_struct *p) -+{ -+ unsigned long ip = 0; -+ unsigned int state; -+ -+ if (!p || p == current) -+ return 0; -+ -+ /* Only get wchan if task is blocked and we can keep it that way. */ -+ raw_spin_lock_irq(&p->pi_lock); -+ state = READ_ONCE(p->__state); -+ smp_rmb(); /* see try_to_wake_up() */ -+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) -+ ip = __get_wchan(p); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ return ip; -+} -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeue(rq, p); \ -+ \ -+ list_del(&p->sq_node); \ -+ if (list_empty(&rq->queue.heads[p->sq_idx])) \ -+ clear_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_enqueue(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->sq_idx = task_sched_prio_idx(p, rq); \ -+ list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \ -+ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); -+ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_DEQUEUE_TASK(p, rq, flags); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq, int 
idx) -+{ -+ lockdep_assert_held(&rq->lock); -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ list_del(&p->sq_node); -+ list_add_tail(&p->sq_node, &rq->queue.heads[idx]); -+ if (idx != p->sq_idx) { -+ if (list_empty(&rq->queue.heads[p->sq_idx])) -+ clear_bit(sched_idx2prio(p->sq_idx, rq), -+ rq->queue.bitmap); -+ p->sq_idx = idx; -+ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); -+ update_sched_rq_watermark(rq); -+ } -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * it's already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. 
-+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) {} -+ -+void select_nohz_load_balancer(int stop_tick) {} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ const struct cpumask *hk_mask; -+ -+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); -+ -+ for (mask = per_cpu(sched_cpu_topo_masks, cpu) + 1; -+ mask < per_cpu(sched_cpu_topo_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, hk_mask) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+static inline void wake_up_idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(rq->idle)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+ -+static void nohz_csd_func(void *info) -+{ -+ struct rq *rq = info; -+ int cpu = cpu_of(rq); -+ unsigned int flags; -+ -+ /* -+ * Release the rq::nohz_csd. -+ */ -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ WARN_ON(!(flags & NOHZ_KICK_MASK)); -+ -+ rq->idle_balance = idle_cpu(cpu); -+ if (rq->idle_balance && !need_resched()) { -+ rq->nohz_idle_balance = flags; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ } -+} -+ -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * Alt schedule FW doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time = rq->hrtick_time; -+ -+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ -+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int __normal_prio(int policy, int rt_prio, int static_prio) -+{ -+ return rt_policy(policy) ? (MAX_RT_PRIO - 1 - rt_prio) : -+ static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the expected normal priority: i.e. priority -+ * without taking RT-inheritance into account. Might be -+ * boosted by interactivity modifiers. Changes upon fork, -+ * setprio syscalls, and whenever the interactivity -+ * estimator recalculates. -+ */ -+static inline int normal_prio(struct task_struct *p) -+{ -+ return __normal_prio(p->policy, p->rt_priority, p->static_prio); -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. 
-+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. -+ */ -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT * p->in_iowait); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+} -+ -+static inline bool is_migration_disabled(struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return p->migration_disabled; -+#else -+ return false; -+#endif -+} -+ -+#define SCA_CHECK 0x01 -+#define SCA_USER 0x08 -+ -+#ifdef CONFIG_SMP -+ -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ unsigned int state = READ_ONCE(p->__state); -+ -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. -+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+ -+ WARN_ON_ONCE(is_migration_disabled(p)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+#define MDF_FORCE_ENABLED 0x80 -+ -+static void -+__do_set_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ /* -+ * This here violates the locking rules for affinity, since we're only -+ * supposed to change these variables while holding both rq->lock and -+ * p->pi_lock. -+ * -+ * HOWEVER, it magically works, because ttwu() is the only code that -+ * accesses these variables under p->pi_lock and only does so after -+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() -+ * before finish_task(). -+ * -+ * XXX do further audits, this smells like something putrid. 
-+ */ -+ SCHED_WARN_ON(!p->on_cpu); -+ p->cpus_ptr = new_mask; -+} -+ -+void migrate_disable(void) -+{ -+ struct task_struct *p = current; -+ int cpu; -+ -+ if (p->migration_disabled) { -+ p->migration_disabled++; -+ return; -+ } -+ -+ preempt_disable(); -+ cpu = smp_processor_id(); -+ if (cpumask_test_cpu(cpu, &p->cpus_mask)) { -+ cpu_rq(cpu)->nr_pinned++; -+ p->migration_disabled = 1; -+ p->migration_flags &= ~MDF_FORCE_ENABLED; -+ -+ /* -+ * Violates locking rules! see comment in __do_set_cpus_ptr(). -+ */ -+ if (p->cpus_ptr == &p->cpus_mask) -+ __do_set_cpus_ptr(p, cpumask_of(cpu)); -+ } -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(migrate_disable); -+ -+void migrate_enable(void) -+{ -+ struct task_struct *p = current; -+ -+ if (0 == p->migration_disabled) -+ return; -+ -+ if (p->migration_disabled > 1) { -+ p->migration_disabled--; -+ return; -+ } -+ -+ if (WARN_ON_ONCE(!p->migration_disabled)) -+ return; -+ -+ /* -+ * Ensure stop_task runs either before or after this, and that -+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). -+ */ -+ preempt_disable(); -+ /* -+ * Assumption: current should be running on allowed cpu -+ */ -+ WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &p->cpus_mask)); -+ if (p->cpus_ptr != &p->cpus_mask) -+ __do_set_cpus_ptr(p, &p->cpus_mask); -+ /* -+ * Mustn't clear migration_disabled() until cpus_ptr points back at the -+ * regular cpus_mask, otherwise things that race (eg. -+ * select_fallback_rq) get confused. -+ */ -+ barrier(); -+ p->migration_disabled = 0; -+ this_rq()->nr_pinned--; -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(migrate_enable); -+ -+static inline bool rq_has_pinned_tasks(struct rq *rq) -+{ -+ return rq->nr_pinned; -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ /* When not in the task's cpumask, no point in looking further. */ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ /* migrate_disabled() must be allowed to finish. */ -+ if (is_migration_disabled(p)) -+ return cpu_online(cpu); -+ -+ /* Non kernel threads are not allowed during either online or offline. */ -+ if (!(p->flags & PF_KTHREAD)) -+ return cpu_active(cpu) && task_cpu_possible(cpu, p); -+ -+ /* KTHREAD_IS_PER_CPU is always allowed. */ -+ if (kthread_is_per_cpu(p)) -+ return cpu_online(cpu); -+ -+ /* Regular kernel threads don't get to stay during offline. */ -+ if (cpu_dying(cpu)) -+ return false; -+ -+ /* But are allowed during online. */ -+ return cpu_online(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. 
-+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ update_sched_rq_watermark(rq); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ sched_task_sanity_check(p, rq); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ unsigned long flags; -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_save(flags); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ flush_smp_call_function_queue(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+static void -+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ lockdep_assert_held(&p->pi_lock); -+ set_cpus_allowed_common(p, new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+} -+ -+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, -+ int node) -+{ -+ if (!src->user_cpus_ptr) -+ return 0; -+ -+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); -+ if (!dst->user_cpus_ptr) -+ return -ENOMEM; -+ -+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); -+ return 0; -+} -+ -+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) -+{ -+ struct cpumask *user_mask = NULL; -+ -+ swap(p->user_cpus_ptr, user_mask); -+ -+ return user_mask; -+} -+ -+void release_user_cpus_ptr(struct task_struct *p) -+{ -+ kfree(clear_user_cpus_ptr(p)); -+} -+ -+#endif -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. -+ */ -+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. 
-+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || READ_ONCE(p->__state) == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. -+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. 
*/ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (is_cpu_allowed(p, dest_cpu)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (cpuset_cpus_allowed_fallback(p)) { -+ state = possible; -+ break; -+ } -+ fallthrough; -+ case possible: -+ /* -+ * XXX When called from select_task_rq() we only -+ * hold p->pi_lock and again violate locking order. -+ * -+ * More yuck to audit. -+ */ -+ do_set_cpus_allowed(p, task_cpu_possible_mask(p)); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. -+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_active_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, sched_rq_watermark) || -+ cpumask_and(&tmp, &chk_mask, -+ sched_rq_watermark + SCHED_QUEUE_BITS - 1 - task_sched_prio(p))) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ static struct lock_class_key stop_pi_lock; -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ -+ /* -+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to -+ * adjust the effective priority of a task. As a result, -+ * rt_mutex_setprio() can trigger (RT) balancing operations, -+ * which can then trigger wakeups of the stop thread to push -+ * around the current task. -+ * -+ * The stop task itself will never be part of the PI-chain, it -+ * never blocks, therefore that ->pi_lock recursion is safe. -+ * Tell lockdep about this by placing the stop->pi_lock in its -+ * own class. -+ */ -+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+static int affine_move_task(struct rq *rq, struct task_struct *p, int dest_cpu, -+ raw_spinlock_t *lock, unsigned long irq_flags) -+{ -+ /* Can the task run on the task's current CPU? 
If so, we're done */ -+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { -+ if (p->migration_disabled) { -+ if (likely(p->cpus_ptr != &p->cpus_mask)) -+ __do_set_cpus_ptr(p, &p->cpus_mask); -+ p->migration_disabled = 0; -+ p->migration_flags |= MDF_FORCE_ENABLED; -+ /* When p is migrate_disabled, rq->lock should be held */ -+ rq->nr_pinned--; -+ } -+ -+ if (task_running(p) || READ_ONCE(p->__state) == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. -+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ } -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ return 0; -+} -+ -+static int __set_cpus_allowed_ptr_locked(struct task_struct *p, -+ const struct cpumask *new_mask, -+ u32 flags, -+ struct rq *rq, -+ raw_spinlock_t *lock, -+ unsigned long irq_flags) -+{ -+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ bool kthread = p->flags & PF_KTHREAD; -+ struct cpumask *user_mask = NULL; -+ int dest_cpu; -+ int ret = 0; -+ -+ if (kthread || is_migration_disabled(p)) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs, -+ * however, during cpu-hot-unplug, even these might get pushed -+ * away if not KTHREAD_IS_PER_CPU. -+ * -+ * Specifically, migration_disabled() tasks must not fail the -+ * cpumask_any_and_distribute() pick below, esp. so on -+ * SCA_MIGRATE_ENABLE, otherwise we'll not call -+ * set_cpus_allowed_common() and actually reset p->cpus_ptr. -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (flags & SCA_USER) -+ user_mask = clear_user_cpus_ptr(p); -+ -+ ret = affine_move_task(rq, p, dest_cpu, lock, irq_flags); -+ -+ kfree(user_mask); -+ -+ return ret; -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ -+ return ret; -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. 
-+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, u32 flags) -+{ -+ unsigned long irq_flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags); -+ rq = __task_access_lock(p, &lock); -+ -+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, lock, irq_flags); -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, 0); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+/* -+ * Change a given task's CPU affinity to the intersection of its current -+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask -+ * and pointing @p->user_cpus_ptr to a copy of the old mask. -+ * If the resulting mask is empty, leave the affinity unchanged and return -+ * -EINVAL. -+ */ -+static int restrict_cpus_allowed_ptr(struct task_struct *p, -+ struct cpumask *new_mask, -+ const struct cpumask *subset_mask) -+{ -+ struct cpumask *user_mask = NULL; -+ unsigned long irq_flags; -+ raw_spinlock_t *lock; -+ struct rq *rq; -+ int err; -+ -+ if (!p->user_cpus_ptr) { -+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL); -+ if (!user_mask) -+ return -ENOMEM; -+ } -+ -+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { -+ err = -EINVAL; -+ goto err_unlock; -+ } -+ -+ /* -+ * We're about to butcher the task affinity, so keep track of what -+ * the user asked for in case we're able to restore it later on. -+ */ -+ if (user_mask) { -+ cpumask_copy(user_mask, p->cpus_ptr); -+ p->user_cpus_ptr = user_mask; -+ } -+ -+ /*return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);*/ -+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, lock, irq_flags); -+ -+err_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ kfree(user_mask); -+ return err; -+} -+ -+/* -+ * Restrict the CPU affinity of task @p so that it is a subset of -+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the -+ * old affinity mask. If the resulting mask is empty, we warn and walk -+ * up the cpuset hierarchy until we find a suitable mask. -+ */ -+void force_compatible_cpus_allowed_ptr(struct task_struct *p) -+{ -+ cpumask_var_t new_mask; -+ const struct cpumask *override_mask = task_cpu_possible_mask(p); -+ -+ alloc_cpumask_var(&new_mask, GFP_KERNEL); -+ -+ /* -+ * __migrate_task() can fail silently in the face of concurrent -+ * offlining of the chosen destination CPU, so take the hotplug -+ * lock to ensure that the migration succeeds. -+ */ -+ cpus_read_lock(); -+ if (!cpumask_available(new_mask)) -+ goto out_set_mask; -+ -+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) -+ goto out_free_mask; -+ -+ /* -+ * We failed to find a valid subset of the affinity mask for the -+ * task, so override it based on its cpuset hierarchy. 
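For orientation on what the affinity machinery above ultimately services: userspace reaches set_cpus_allowed_ptr()/__set_cpus_allowed_ptr() through the sched_setaffinity(2) syscall path. A minimal userspace sketch follows (illustrative only, not part of the patch hunk; assumes Linux with glibc and _GNU_SOURCE):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    cpu_set_t set;

    /* Pin the calling thread to CPU 0; the kernel side of this request
     * ends up in the __set_cpus_allowed_ptr() machinery above. */
    CPU_ZERO(&set);
    CPU_SET(0, &set);
    if (sched_setaffinity(0, sizeof(set), &set)) {
        perror("sched_setaffinity");
        return EXIT_FAILURE;
    }

    /* Read the mask back to confirm the new affinity. */
    CPU_ZERO(&set);
    if (sched_getaffinity(0, sizeof(set), &set)) {
        perror("sched_getaffinity");
        return EXIT_FAILURE;
    }
    printf("now runnable on %d CPU(s)\n", CPU_COUNT(&set));
    return 0;
}

After the call, CPU_COUNT() reports 1, mirroring the narrowed cpus_mask the kernel code above installs.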
-+ */ -+ cpuset_cpus_allowed(p, new_mask); -+ override_mask = new_mask; -+ -+out_set_mask: -+ if (printk_ratelimit()) { -+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", -+ task_pid_nr(p), p->comm, -+ cpumask_pr_args(override_mask)); -+ } -+ -+ WARN_ON(set_cpus_allowed_ptr(p, override_mask)); -+out_free_mask: -+ cpus_read_unlock(); -+ free_cpumask_var(new_mask); -+} -+ -+static int -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); -+ -+/* -+ * Restore the affinity of a task @p which was previously restricted by a -+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) -+ * @p->user_cpus_ptr. -+ * -+ * It is the caller's responsibility to serialise this with any calls to -+ * force_compatible_cpus_allowed_ptr(@p). -+ */ -+void relax_compatible_cpus_allowed_ptr(struct task_struct *p) -+{ -+ struct cpumask *user_mask = p->user_cpus_ptr; -+ unsigned long flags; -+ -+ /* -+ * Try to restore the old affinity mask. If this fails, then -+ * we free the mask explicitly to avoid it being inherited across -+ * a subsequent fork(). -+ */ -+ if (!user_mask || !__sched_setaffinity(p, user_mask)) -+ return; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ user_mask = clear_user_cpus_ptr(p); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ kfree(user_mask); -+} -+ -+#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, u32 flags) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ -+static inline bool rq_has_pinned_tasks(struct rq *rq) -+{ -+ return false; -+} -+ -+#endif /* !CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) { -+ __schedstat_inc(rq->ttwu_local); -+ __schedstat_inc(p->stats.nr_wakeups_local); -+ } else { -+ /** Alt schedule FW ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+ __schedstat_inc(p->stats.nr_wakeups); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ check_preempt_curr(rq); -+ WRITE_ONCE(p->__state, TASK_RUNNING); -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ if ( -+#ifdef CONFIG_SMP -+ !(wake_flags & WF_MIGRATED) && -+#endif -+ p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+/* -+ * Consider @p being inside a wait loop: -+ * -+ * for (;;) { -+ * set_current_state(TASK_UNINTERRUPTIBLE); -+ * -+ * if (CONDITION) -+ * break; -+ * -+ * schedule(); -+ * } -+ * __set_current_state(TASK_RUNNING); -+ * -+ * between set_current_state() and schedule(). In this case @p is still -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in -+ * an atomic manner. -+ * -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq -+ * then schedule() must still happen and p->state can be changed to -+ * TASK_RUNNING. 
Otherwise we lost the race, schedule() has happened, and we -+ * need to do a full wakeup with enqueue. -+ * -+ * Returns: %true when the wakeup is done, -+ * %false otherwise. -+ */ -+static int ttwu_runnable(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ /* check_preempt_curr() may use rq clock */ -+ update_rq_clock(rq); -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * Do not complicate things with the async wake_list while the CPU is -+ * in hotplug state. -+ */ -+ if (!cpu_active(cpu)) -+ return false; -+ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. -+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. 
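The wait-loop / wakeup contract spelled out in the comment above (set_current_state(), re-check CONDITION, schedule(), with the waker serializing against the state change) has a familiar userspace analogue in the mutex-plus-condvar pattern, which avoids the same lost-wakeup race. A minimal sketch, assuming POSIX threads; it is an analogy only, not the kernel mechanism:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int condition;   /* the "CONDITION" from the comment above */

static void *waiter(void *arg)
{
    pthread_mutex_lock(&lock);
    while (!condition)              /* re-check after every wakeup */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
    puts("waiter: condition observed");
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, waiter, NULL);

    /* Waker: publish the condition and the wakeup under the same lock,
     * so the waiter can never go to sleep after missing the store. */
    pthread_mutex_lock(&lock);
    condition = 1;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);

    pthread_join(t, NULL);
    return 0;
}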
-+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ resched_curr(rq); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ if (this_cpu == that_cpu) -+ return true; -+ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#else /* !CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Invoked from try_to_wake_up() to check whether the task can be woken up. -+ * -+ * The caller holds p::pi_lock if p != current or has preemption -+ * disabled when p == current. -+ * -+ * The rules of PREEMPT_RT saved_state: -+ * -+ * The related locking code always holds p::pi_lock when updating -+ * p::saved_state, which means the code is fully serialized in both cases. -+ * -+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other -+ * bits set. This allows to distinguish all wakeup scenarios. -+ */ -+static __always_inline -+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) -+{ -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { -+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && -+ state != TASK_RTLOCK_WAIT); -+ } -+ -+ if (READ_ONCE(p->__state) & state) { -+ *success = 1; -+ return true; -+ } -+ -+#ifdef CONFIG_PREEMPT_RT -+ /* -+ * Saved state preserves the task state across blocking on -+ * an RT lock. If the state matches, set p::saved_state to -+ * TASK_RUNNING, but do not wake the task because it waits -+ * for a lock wakeup. Also indicate success because from -+ * the regular waker's point of view this has succeeded. -+ * -+ * After acquiring the lock the task will restore p::__state -+ * from p::saved_state which ensures that the regular -+ * wakeup is not lost. The restore will also set -+ * p::saved_state to TASK_RUNNING so any further tests will -+ * not result in false positives vs. @success -+ */ -+ if (p->saved_state & state) { -+ p->saved_state = TASK_RUNNING; -+ *success = 1; -+ } -+#endif -+ return false; -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. 
-+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task() -+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Conceptually does: -+ * -+ * If (@state & @p->state) @p->state = TASK_RUNNING. -+ * -+ * If the task was not queued/runnable, also place it back on a runqueue. -+ * -+ * This function is atomic against schedule() which would dequeue the task. -+ * -+ * It issues a full memory barrier before accessing @p->state, see the comment -+ * with set_current_state(). -+ * -+ * Uses p->pi_lock to serialize against concurrent wake-ups. -+ * -+ * Relies on p->pi_lock stabilizing: -+ * - p->sched_class -+ * - p->cpus_ptr -+ * - p->sched_task_group -+ * in order to do migration, see its use of select_task_rq()/set_task_cpu(). -+ * -+ * Tries really hard to only take one task_rq(p)->lock for performance. -+ * Takes rq->lock in: -+ * - ttwu_runnable() -- old rq, unavoidable, see comment there; -+ * - ttwu_queue() -- new rq, for enqueue of the task; -+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. 
-+ * -+ * As a consequence we race really badly with just about everything. See the -+ * many memory barriers and their comments for details. -+ * -+ * Return: %true if @p->state changes (an actual wakeup was done), -+ * %false otherwise. -+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!ttwu_state_match(p, state, &success)) -+ goto out; -+ -+ trace_sched_waking(p); -+ WRITE_ONCE(p->__state, TASK_RUNNING); -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with smp_store_mb() -+ * in set_current_state() that the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!ttwu_state_match(p, state, &success)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). -+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). 
-+ */ -+ WRITE_ONCE(p->__state, TASK_WAKING); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. -+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until it's done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ sched_task_ttwu(p); -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * task_call_func - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked, can be @current. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * Fix the task in it's current state by avoiding wakeups and or rq operations -+ * and call @func(@arg) on it. This function can use ->on_rq and task_curr() -+ * to work out what the state is, if required. Given that @func can be invoked -+ * with a runqueue lock held, it had better be quite lightweight. -+ * -+ * Returns: -+ * Whatever @func returns -+ */ -+int task_call_func(struct task_struct *p, task_call_f func, void *arg) -+{ -+ struct rq *rq = NULL; -+ unsigned int state; -+ struct rq_flags rf; -+ int ret; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags); -+ -+ state = READ_ONCE(p->__state); -+ -+ /* -+ * Ensure we load p->on_rq after p->__state, otherwise it would be -+ * possible to, falsely, observe p->on_rq == 0. -+ * -+ * See try_to_wake_up() for a longer comment. -+ */ -+ smp_rmb(); -+ -+ /* -+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when -+ * the task is blocked. Make sure to check @state since ttwu() can drop -+ * locks at the end, see ttwu_queue_wakelist(). 
-+ */ -+ if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq) -+ rq = __task_rq_lock(p, &rf); -+ -+ /* -+ * At this point the task is pinned; either: -+ * - blocked and we're holding off wakeups (pi->lock) -+ * - woken, and we're holding off enqueue (rq->lock) -+ * - queued, and we're holding off schedule (rq->lock) -+ * - running, and we're holding off de-schedule (rq->lock) -+ * -+ * The called function (@func) can use: task_curr(), p->on_rq and -+ * p->__state to differentiate between these states. -+ */ -+ ret = func(p, arg); -+ -+ if (rq) -+ __task_rq_unlock(rq, &rf); -+ -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ * -+ * __sched_fork() is basic setup used by init_idle() too: -+ */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; -+ -+#ifdef CONFIG_SCHEDSTATS -+ /* Even if schedstat is disabled, there should not be garbage */ -+ memset(&p->stats, 0, sizeof(p->stats)); -+#endif -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+} -+ -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ __sched_fork(clone_flags, p); -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->__state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = p->static_prio; -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ /* -+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly -+ * required yet, but lockdep gets upset if rules are violated. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. -+ */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ resched_curr(rq); -+ } -+ sched_task_fork(p, rq); -+ raw_spin_unlock(&rq->lock); -+ -+ rseq_migrate(p); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, smp_processor_id()); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+void sched_post_fork(struct task_struct *p) -+{ -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ if (!strcmp(str, "enable")) { -+ set_schedstats(true); -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ set_schedstats(false); -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+#ifdef CONFIG_PROC_SYSCTL -+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, -+ size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+ -+static struct ctl_table sched_core_sysctls[] = { -+ { -+ .procname = "sched_schedstats", -+ .data = NULL, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = sysctl_schedstats, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+ {} -+}; -+static int __init sched_core_sysctl_init(void) -+{ -+ register_sysctl_init("kernel", sched_core_sysctls); -+ return 0; -+} -+late_initcall(sched_core_sysctl_init); -+#endif /* CONFIG_PROC_SYSCTL */ -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ WRITE_ONCE(p->__state, TASK_RUNNING); -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ rseq_migrate(p); -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. -+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ * -+ * See the ttwu() WF_ON_CPU case and its ordering comment. -+ */ -+ WRITE_ONCE(next->on_cpu, 1); -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * This must be the very last reference to @prev from this CPU. After -+ * p->on_cpu is cleared, the task can be moved to a different CPU. We -+ * must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
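The finish_task() / try_to_wake_up() handshake referred to above, smp_store_release(&prev->on_cpu, 0) paired with smp_cond_load_acquire(&p->on_cpu, !VAL), is a plain release/acquire ordering. A standalone C11 sketch of the same pattern follows (illustrative only, using stdatomic in place of the kernel primitives; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;   /* stands in for p->on_cpu */
static int task_state;          /* data published before the release */

static void *prev_cpu(void *arg)
{
    task_state = 42;    /* everything written before the release store ...   */
    /* ... is visible to whoever acquire-loads on_cpu == 0, just as with
     * smp_store_release(&prev->on_cpu, 0) in finish_task(). */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);
    return NULL;
}

static void *waker(void *arg)
{
    /* Spin until the release, like smp_cond_load_acquire(&p->on_cpu, !VAL). */
    while (atomic_load_explicit(&on_cpu, memory_order_acquire))
        ;
    printf("waker sees task_state=%d\n", task_state);  /* always prints 42 */
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&b, NULL, waker, NULL);
    pthread_create(&a, NULL, prev_cpu, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}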
-+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+static void do_balance_callbacks(struct rq *rq, struct callback_head *head) -+{ -+ void (*func)(struct rq *rq); -+ struct callback_head *next; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ while (head) { -+ func = (void (*)(struct rq *))head->func; -+ next = head->next; -+ head->next = NULL; -+ head = next; -+ -+ func(rq); -+ } -+} -+ -+static void balance_push(struct rq *rq); -+ -+/* -+ * balance_push_callback is a right abuse of the callback interface and plays -+ * by significantly different rules. -+ * -+ * Where the normal balance_callback's purpose is to be ran in the same context -+ * that queued it (only later, when it's safe to drop rq->lock again), -+ * balance_push_callback is specifically targeted at __schedule(). -+ * -+ * This abuse is tolerated because it places all the unlikely/odd cases behind -+ * a single test, namely: rq->balance_callback == NULL. -+ */ -+struct callback_head balance_push_callback = { -+ .next = NULL, -+ .func = (void (*)(struct callback_head *))balance_push, -+}; -+ -+static inline struct callback_head * -+__splice_balance_callbacks(struct rq *rq, bool split) -+{ -+ struct callback_head *head = rq->balance_callback; -+ -+ if (likely(!head)) -+ return NULL; -+ -+ lockdep_assert_rq_held(rq); -+ /* -+ * Must not take balance_push_callback off the list when -+ * splice_balance_callbacks() and balance_callbacks() are not -+ * in the same rq->lock section. -+ * -+ * In that case it would be possible for __schedule() to interleave -+ * and observe the list empty. -+ */ -+ if (split && head == &balance_push_callback) -+ head = NULL; -+ else -+ rq->balance_callback = NULL; -+ -+ return head; -+} -+ -+static inline struct callback_head *splice_balance_callbacks(struct rq *rq) -+{ -+ return __splice_balance_callbacks(rq, true); -+} -+ -+static void __balance_callbacks(struct rq *rq) -+{ -+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); -+} -+ -+static inline void balance_callbacks(struct rq *rq, struct callback_head *head) -+{ -+ unsigned long flags; -+ -+ if (unlikely(head)) { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ do_balance_callbacks(rq, head); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+} -+ -+#else -+ -+static inline void __balance_callbacks(struct rq *rq) -+{ -+} -+ -+static inline struct callback_head *splice_balance_callbacks(struct rq *rq) -+{ -+ return NULL; -+} -+ -+static inline void balance_callbacks(struct rq *rq, struct callback_head *head) -+{ -+} -+ -+#endif -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ __balance_callbacks(rq); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/* -+ * NOP if the arch has not defined these: -+ */ -+ -+#ifndef prepare_arch_switch -+# define 
prepare_arch_switch(next) do { } while (0) -+#endif -+ -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+static inline void kmap_local_sched_out(void) -+{ -+#ifdef CONFIG_KMAP_LOCAL -+ if (unlikely(current->kmap_ctrl.idx)) -+ __kmap_local_sched_out(); -+#endif -+} -+ -+static inline void kmap_local_sched_in(void) -+{ -+#ifdef CONFIG_KMAP_LOCAL -+ if (unlikely(current->kmap_ctrl.idx)) -+ __kmap_local_sched_in(); -+#endif -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ kmap_local_sched_out(); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ unsigned int prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = READ_ONCE(prev->__state); -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ tick_nohz_task_switch(); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ /* -+ * kmap_local_sched_out() is invoked with rq::lock held and -+ * interrupts disabled. There is no requirement for that, but the -+ * sched out code does not have an interrupt enabled section. -+ * Restoring the maps on sched in does not require interrupts being -+ * disabled either. -+ */ -+ kmap_local_sched_in(); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop_sched(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). 
*/ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned int nr_running(void) -+{ -+ unsigned int i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned int nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how it's mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. -+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned int nr_iowait(void) -+{ -+ unsigned int i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. 
-+ */ -+void sched_exec(void) -+{ -+} -+ -+#endif -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ cgroup_account_cputime(p, ns); -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. 
-+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+static u64 cpu_resched_latency(struct rq *rq) -+{ -+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); -+ u64 resched_latency, now = rq_clock(rq); -+ static bool warned_once; -+ -+ if (sysctl_resched_latency_warn_once && warned_once) -+ return 0; -+ -+ if (!need_resched() || !latency_warn_ms) -+ return 0; -+ -+ if (system_state == SYSTEM_BOOTING) -+ return 0; -+ -+ if (!rq->last_seen_need_resched_ns) { -+ rq->last_seen_need_resched_ns = now; -+ rq->ticks_without_resched = 0; -+ return 0; -+ } -+ -+ rq->ticks_without_resched++; -+ resched_latency = now - rq->last_seen_need_resched_ns; -+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) -+ return 0; -+ -+ warned_once = true; -+ -+ return resched_latency; -+} -+ -+static int __init setup_resched_latency_warn_ms(char *str) -+{ -+ long val; -+ -+ if ((kstrtol(str, 0, &val))) { -+ pr_warn("Unable to set resched_latency_warn_ms\n"); -+ return 1; -+ } -+ -+ sysctl_resched_latency_warn_ms = val; -+ return 1; -+} -+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -+#else -+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ u64 resched_latency; -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ if (sched_feat(LATENCY_WARN)) -+ resched_latency = cpu_resched_latency(rq); -+ calc_global_load_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ if (sched_feat(LATENCY_WARN) && resched_latency) -+ resched_latency_warn(cpu, resched_latency); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int sg_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask) && -+ !is_migration_disabled(p)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(&tmp, per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq= cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ !is_migration_disabled(curr) && (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, sg_balance_cpu_stop, curr, -+ &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance - slibing 
group balance check for run queue @rq -+ */ -+static inline void sg_balance(struct rq *rq) -+{ -+ cpumask_t chk; -+ int cpu = cpu_of(rq); -+ -+ /* exit when cpu is offline */ -+ if (unlikely(!rq->online)) -+ return; -+ -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, sched_rq_watermark) && -+ cpumask_andnot(&chk, &chk, &sched_rq_pending_mask)) { -+ int i; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk) && -+ sg_balance_trigger(i)) -+ return; -+ } -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ scheduler_task_tick(rq); -+ -+ calc_load_nohz_remote(rq); -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ SCHED_WARN_ON(ct_state() == CONTEXT_USER); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * Compile time debug macro -+ * #define ALT_SCHED_DEBUG -+ */ -+ -+#ifdef ALT_SCHED_DEBUG -+void alt_sched_debug(void) -+{ -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", -+ sched_rq_pending_mask.bits[0], -+ sched_rq_watermark[0].bits[0], -+ sched_sg_idle_mask.bits[0]); -+} -+#else -+inline void alt_sched_debug(void) {} -+#endif -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32U) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0); -+ set_task_cpu(p, dest_cpu); -+ sched_task_sanity_check(p, dest_rq); -+ 
__SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *topo_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ topo_mask = per_cpu(sched_cpu_topo_masks, cpu) + 1; -+ end_mask = per_cpu(sched_cpu_topo_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, topo_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+ -+ rq->nr_running += nr_migrated; -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+ -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; -+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++topo_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. -+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) -+ time_slice_expired(p, rq); -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } -+ -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, -+ * next);*/ -+ return next; -+} -+ -+/* -+ * Constants for the sched_mode argument of __schedule(). -+ * -+ * The mode argument allows RT enabled kernels to differentiate a -+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that -+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to -+ * optimize the AND operation out and just check for zero. -+ */ -+#define SM_NONE 0x0 -+#define SM_PREEMPT 0x1 -+#define SM_RTLOCK_WAIT 0x2 -+ -+#ifndef CONFIG_PREEMPT_RT -+# define SM_MASK_PREEMPT (~0U) -+#else -+# define SM_MASK_PREEMPT SM_PREEMPT -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. 
TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(unsigned int sched_mode) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ struct rq *rq; -+ int cpu; -+ int deactivated = 0; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, !!sched_mode); -+ -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(!!sched_mode); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that we form a control dependency vs deactivate_task() below. -+ */ -+ prev_state = READ_ONCE(prev->__state); -+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { -+ if (signal_pending_state(prev_state, prev)) { -+ WRITE_ONCE(prev->__state, TASK_RUNNING); -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. 
-+ */ -+ sched_task_deactivate(prev, rq); -+ deactivate_task(prev, rq); -+ deactivated = 1; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+#ifdef CONFIG_SCHED_DEBUG -+ rq->last_seen_need_resched_ns = 0; -+#endif -+ -+ if (likely(prev != next)) { -+ if (deactivated) -+ update_sched_rq_watermark(rq); -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; -+ -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+ } else { -+ __balance_callbacks(rq); -+ raw_spin_unlock_irq(&rq->lock); -+ } -+ -+#ifdef CONFIG_SCHED_SMT -+ sg_balance(rq); -+#endif -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ -+ __schedule(SM_NONE); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ unsigned int task_flags; -+ -+ if (task_is_running(tsk)) -+ return; -+ -+ task_flags = tsk->flags; -+ /* -+ * If a worker goes to sleep, notify and ask workqueue whether it -+ * wants to wake up a task to maintain concurrency. -+ */ -+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (task_flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. -+ */ -+ blk_flush_plug(tsk->plug, true); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(SM_NONE); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. 
-+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->__state); -+ do { -+ __schedule(SM_NONE); -+ } while (need_resched()); -+} -+ -+#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+#ifdef CONFIG_PREEMPT_RT -+void __sched notrace schedule_rtlock(void) -+{ -+ do { -+ preempt_disable(); -+ __schedule(SM_RTLOCK_WAIT); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+} -+NOKPROBE_SYMBOL(schedule_rtlock); -+#endif -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(SM_PREEMPT); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#ifndef preempt_schedule_dynamic_enabled -+#define preempt_schedule_dynamic_enabled preempt_schedule -+#define preempt_schedule_dynamic_disabled NULL -+#endif -+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); -+EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); -+void __sched notrace dynamic_preempt_schedule(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) -+ return; -+ preempt_schedule(); -+} -+NOKPROBE_SYMBOL(dynamic_preempt_schedule); -+EXPORT_SYMBOL(dynamic_preempt_schedule); -+#endif -+#endif -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(SM_PREEMPT); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#ifndef preempt_schedule_notrace_dynamic_enabled -+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -+#define preempt_schedule_notrace_dynamic_disabled NULL -+#endif -+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); -+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); -+void __sched notrace dynamic_preempt_schedule_notrace(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) -+ return; -+ preempt_schedule_notrace(); -+} -+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); -+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -+#endif -+#endif -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(SM_PREEMPT); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct task_struct *p, struct rq *rq) -+{ -+ int idx; -+ -+ /* Trigger resched if task sched_prio has been modified. */ -+ if (task_on_rq_queued(p) && (idx = task_sched_prio_idx(p, rq)) != p->sq_idx) { -+ requeue_task(p, rq, idx); -+ check_preempt_curr(rq); -+ } -+} -+ -+static void __setscheduler_prio(struct task_struct *p, int prio) -+{ -+ p->prio = prio; -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. 
-+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guarantees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ -+ __setscheduler_prio(p, prio); -+ -+ check_task_changed(p, rq); -+out_unlock: -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ -+ __balance_callbacks(rq); -+ __task_access_unlock(p, lock); -+ -+ preempt_enable(); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it won't have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ -+ check_task_changed(p, rq); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. 
-+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * -+ * sched policy return value kernel prio user prio/nice -+ * -+ * (BMQ)normal, batch, idle[0 ... 53] [100 ... 139] 0/[-20 ... 19]/[-7 ... 7] -+ * (PDS)normal, batch, idle[0 ... 39] 100 0/[-20 ... 19] -+ * fifo, rr [-1 ... -100] [99 ... 0] [0 ... 99] -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO) ? p->prio - MAX_RT_PRIO : -+ task_sched_prio_normal(p, task_rq(p)); -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. 
-+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int oldpolicy = -1, policy = attr->sched_policy; -+ int retval, newprio; -+ struct callback_head *head; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. -+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ newprio = __normal_prio(policy, attr->sched_priority, NICE_TO_PRIO(attr->sched_nice)); -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. -+ */ -+ newprio = rt_effective_prio(p, newprio); -+ } -+ -+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { -+ __setscheduler_params(p, attr); -+ __setscheduler_prio(p, newprio); -+ } -+ -+ check_task_changed(p, rq); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ head = splice_balance_callbacks(rq); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ /* Run balance callbacks after we've adjusted the PI chain: */ -+ balance_callbacks(rq, head); -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Use sched_set_fifo(), read its comment. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr_nocheck); -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+ -+/* -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -+ * incapable of resource management, which is the one thing an OS really should -+ * be doing. -+ * -+ * This is of course the reason it is limited to privileged users only. -+ * -+ * Worse still; it is fundamentally impossible to compose static priority -+ * workloads. You cannot take two correctly working static prio workloads -+ * and smash them together and still expect them to work. -+ * -+ * For this reason 'all' FIFO tasks the kernel creates are basically at: -+ * -+ * MAX_RT_PRIO / 2 -+ * -+ * The administrator _MUST_ configure the system, the kernel simply doesn't -+ * know enough information to make a sensible choice. -+ */ -+void sched_set_fifo(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo); -+ -+/* -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. -+ */ -+void sched_set_fifo_low(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = 1 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo_low); -+ -+void sched_set_normal(struct task_struct *p, int nice) -+{ -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ .sched_nice = nice, -+ }; -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_normal); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
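As a rough illustration of the intended in-kernel usage, a driver's kthread would call sched_set_fifo() on itself instead of picking a priority; a minimal module sketch under that assumption (names such as demo_thread_fn are hypothetical):

	// SPDX-License-Identifier: GPL-2.0
	#include <linux/module.h>
	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/delay.h>
	#include <linux/err.h>

	static struct task_struct *demo_task;

	static int demo_thread_fn(void *data)
	{
		/* Runs at MAX_RT_PRIO / 2, as documented above; no magic numbers here. */
		sched_set_fifo(current);

		while (!kthread_should_stop())
			msleep(100);
		return 0;
	}

	static int __init demo_init(void)
	{
		demo_task = kthread_run(demo_thread_fn, NULL, "demo-fifo");
		return PTR_ERR_OR_ZERO(demo_task);
	}

	static void __exit demo_exit(void)
	{
		kthread_stop(demo_task);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");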
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setattr(p, &attr); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. -+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. 
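glibc has historically not shipped wrappers for sched_setattr()/sched_getattr(), so userspace normally goes through syscall(2); a minimal sketch that lays out the attribute structure locally to match SCHED_ATTR_SIZE_VER0 (48 bytes):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	/* Field layout of the VER0 part of the UAPI struct sched_attr. */
	struct my_sched_attr {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct my_sched_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.sched_policy = SCHED_FIFO;
		attr.sched_priority = 10;

		/* sched_setattr(pid, uattr, flags); flags must be 0 */
		if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
			fprintf(stderr, "sched_setattr: %s\n", strerror(errno));
			return 1;
		}

		/* sched_getattr(pid, uattr, usize, flags) */
		memset(&attr, 0, sizeof(attr));
		if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) == -1) {
			fprintf(stderr, "sched_getattr: %s\n", strerror(errno));
			return 1;
		}
		printf("policy=%u priority=%u\n", attr.sched_policy, attr.sched_priority);
		return 0;
	}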
-+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ kattr.sched_flags &= SCHED_FLAG_ALL; -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+static int -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) -+{ -+ int retval; -+ cpumask_var_t cpus_allowed, new_mask; -+ -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); -+ if (retval) -+ goto out_free_new_mask; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ -+out_free_new_mask: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ retval = -EPERM; -+ goto out_put_task; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_put_task; -+ -+ retval = __sched_setaffinity(p, in_mask); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
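A minimal userspace sketch of the matching libc interface, using the cpu_set_t helpers from <sched.h> (pinning to CPU 0 is an arbitrary example):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);	/* pin the calling thread to CPU 0 */

		if (sched_setaffinity(0, sizeof(set), &set) == -1) {
			fprintf(stderr, "sched_setaffinity: %s\n", strerror(errno));
			return 1;
		}

		CPU_ZERO(&set);
		if (sched_getaffinity(0, sizeof(set), &set) == 0)
			printf("allowed mask now has %d CPU(s)\n", CPU_COUNT(&set));
		return 0;
	}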
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) -+ do_sched_yield_type_1(current, rq); -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ preempt_disable(); -+ raw_spin_unlock_irq(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. If there are no -+ * other threads running on this CPU then this function will return. -+ * -+ * Return: 0. -+ */ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) -+int __sched __cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ /* -+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick -+ * whether the current CPU is in an RCU read-side critical section, -+ * so the tick can report quiescent states even for CPUs looping -+ * in kernel context. In contrast, in non-preemptible kernels, -+ * RCU readers leave no in-memory hints, which means that CPU-bound -+ * processes executing in kernel context might never report an -+ * RCU quiescent state. Therefore, the following code causes -+ * cond_resched() to report a quiescent state, but only when RCU -+ * is in urgent need of one. 
-+ */ -+#ifndef CONFIG_PREEMPT_RCU -+ rcu_all_qs(); -+#endif -+ return 0; -+} -+EXPORT_SYMBOL(__cond_resched); -+#endif -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#define cond_resched_dynamic_enabled __cond_resched -+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) -+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); -+EXPORT_STATIC_CALL_TRAMP(cond_resched); -+ -+#define might_resched_dynamic_enabled __cond_resched -+#define might_resched_dynamic_disabled ((void *)&__static_call_return0) -+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); -+EXPORT_STATIC_CALL_TRAMP(might_resched); -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); -+int __sched dynamic_cond_resched(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_cond_resched)) -+ return 0; -+ return __cond_resched(); -+} -+EXPORT_SYMBOL(dynamic_cond_resched); -+ -+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); -+int __sched dynamic_might_resched(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_might_resched)) -+ return 0; -+ return __cond_resched(); -+} -+EXPORT_SYMBOL(dynamic_might_resched); -+#endif -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (!_cond_resched()) -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+int __cond_resched_rwlock_read(rwlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held_read(lock); -+ -+ if (rwlock_needbreak(lock) || resched) { -+ read_unlock(lock); -+ if (!_cond_resched()) -+ cpu_relax(); -+ ret = 1; -+ read_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_rwlock_read); -+ -+int __cond_resched_rwlock_write(rwlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held_write(lock); -+ -+ if (rwlock_needbreak(lock) || resched) { -+ write_unlock(lock); -+ if (!_cond_resched()) -+ cpu_relax(); -+ ret = 1; -+ write_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_rwlock_write); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+ -+#ifdef CONFIG_GENERIC_ENTRY -+#include -+#endif -+ -+/* -+ * SC:cond_resched -+ * SC:might_resched -+ * SC:preempt_schedule -+ * SC:preempt_schedule_notrace -+ * SC:irqentry_exit_cond_resched -+ * -+ * -+ * NONE: -+ * cond_resched <- __cond_resched -+ * might_resched <- RET0 -+ * preempt_schedule <- NOP -+ * preempt_schedule_notrace <- NOP -+ * irqentry_exit_cond_resched <- NOP -+ * -+ * VOLUNTARY: -+ * cond_resched <- __cond_resched -+ * might_resched <- __cond_resched -+ * preempt_schedule <- NOP -+ * preempt_schedule_notrace <- NOP -+ * irqentry_exit_cond_resched <- NOP -+ * -+ * FULL: -+ * cond_resched <- RET0 -+ * might_resched <- RET0 -+ * preempt_schedule <- preempt_schedule -+ * preempt_schedule_notrace <- preempt_schedule_notrace -+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched -+ */ -+ -+enum { -+ 
preempt_dynamic_undefined = -1, -+ preempt_dynamic_none, -+ preempt_dynamic_voluntary, -+ preempt_dynamic_full, -+}; -+ -+int preempt_dynamic_mode = preempt_dynamic_undefined; -+ -+int sched_dynamic_mode(const char *str) -+{ -+ if (!strcmp(str, "none")) -+ return preempt_dynamic_none; -+ -+ if (!strcmp(str, "voluntary")) -+ return preempt_dynamic_voluntary; -+ -+ if (!strcmp(str, "full")) -+ return preempt_dynamic_full; -+ -+ return -EINVAL; -+} -+ -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) -+#else -+#error "Unsupported PREEMPT_DYNAMIC mechanism" -+#endif -+ -+void sched_dynamic_update(int mode) -+{ -+ /* -+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in -+ * the ZERO state, which is invalid. -+ */ -+ preempt_dynamic_enable(cond_resched); -+ preempt_dynamic_enable(might_resched); -+ preempt_dynamic_enable(preempt_schedule); -+ preempt_dynamic_enable(preempt_schedule_notrace); -+ preempt_dynamic_enable(irqentry_exit_cond_resched); -+ -+ switch (mode) { -+ case preempt_dynamic_none: -+ preempt_dynamic_enable(cond_resched); -+ preempt_dynamic_disable(might_resched); -+ preempt_dynamic_disable(preempt_schedule); -+ preempt_dynamic_disable(preempt_schedule_notrace); -+ preempt_dynamic_disable(irqentry_exit_cond_resched); -+ pr_info("Dynamic Preempt: none\n"); -+ break; -+ -+ case preempt_dynamic_voluntary: -+ preempt_dynamic_enable(cond_resched); -+ preempt_dynamic_enable(might_resched); -+ preempt_dynamic_disable(preempt_schedule); -+ preempt_dynamic_disable(preempt_schedule_notrace); -+ preempt_dynamic_disable(irqentry_exit_cond_resched); -+ pr_info("Dynamic Preempt: voluntary\n"); -+ break; -+ -+ case preempt_dynamic_full: -+ preempt_dynamic_disable(cond_resched); -+ preempt_dynamic_disable(might_resched); -+ preempt_dynamic_enable(preempt_schedule); -+ preempt_dynamic_enable(preempt_schedule_notrace); -+ preempt_dynamic_enable(irqentry_exit_cond_resched); -+ pr_info("Dynamic Preempt: full\n"); -+ break; -+ } -+ -+ preempt_dynamic_mode = mode; -+} -+ -+static int __init setup_preempt_mode(char *str) -+{ -+ int mode = sched_dynamic_mode(str); -+ if (mode < 0) { -+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); -+ return 0; -+ } -+ -+ sched_dynamic_update(mode); -+ return 1; -+} -+__setup("preempt=", setup_preempt_mode); -+ -+static void __init preempt_dynamic_init(void) -+{ -+ if (preempt_dynamic_mode == preempt_dynamic_undefined) { -+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { -+ sched_dynamic_update(preempt_dynamic_none); -+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { -+ sched_dynamic_update(preempt_dynamic_voluntary); -+ } else { -+ /* Default static call setting, nothing to do */ -+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); -+ preempt_dynamic_mode = preempt_dynamic_full; -+ pr_info("Dynamic Preempt: full\n"); -+ } -+ } -+} -+ -+#define PREEMPT_MODEL_ACCESSOR(mode) \ -+ bool preempt_model_##mode(void) \ -+ { \ -+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ -+ return preempt_dynamic_mode == preempt_dynamic_##mode; \ -+ } \ -+ EXPORT_SYMBOL_GPL(preempt_model_##mode) -+ -+PREEMPT_MODEL_ACCESSOR(none); -+PREEMPT_MODEL_ACCESSOR(voluntary); -+PREEMPT_MODEL_ACCESSOR(full); -+ 
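For reference, the PREEMPT_MODEL_ACCESSOR(none) instance above expands to roughly the following; the other two accessors differ only in the mode they compare against:

	bool preempt_model_none(void)
	{
		WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined);
		return preempt_dynamic_mode == preempt_dynamic_none;
	}
	EXPORT_SYMBOL_GPL(preempt_model_none);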
-+#else /* !CONFIG_PREEMPT_DYNAMIC */ -+ -+static inline void preempt_dynamic_init(void) { } -+ -+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, it's already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In Alt schedule FW, yield_to is not supported. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_flush_plug(current->plug, true); -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_RT_PRIO - 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
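Both limits are normally queried from userspace through the POSIX wrappers; a small sketch using the policy constants from <sched.h> (on this scheduler SCHED_FIFO reports 1..99 and SCHED_OTHER 0..0):

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		printf("SCHED_FIFO  priority range: %d..%d\n",
		       sched_get_priority_min(SCHED_FIFO),
		       sched_get_priority_max(SCHED_FIFO));
		printf("SCHED_OTHER priority range: %d..%d\n",
		       sched_get_priority_min(SCHED_OTHER),
		       sched_get_priority_max(SCHED_OTHER));
		return 0;
	}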
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ alt_sched_debug(); -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(sched_timeslice_ns); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. -+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); -+ -+ if (task_is_running(p)) -+ pr_cont(" running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", -+ free, task_pid_nr(p), ppid, -+ read_task_thread_flags(p)); -+ -+ print_worker_info(KERN_INFO, p); -+ print_stop_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ unsigned int state = READ_ONCE(p->__state); -+ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). -+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned int state_filter) -+{ -+ struct task_struct *g, *p; -+ -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. 
-+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: CPU the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void __init init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ __sched_fork(0, idle); -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->__state = TASK_RUNNING; -+ /* -+ * PF_KTHREAD should already be set at this point; regardless, make it -+ * look like a proper per-CPU kthread. -+ */ -+ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; -+ kthread_set_per_cpu(idle, cpu); -+ -+ sched_queue_init_idle(&rq->queue, idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+static int __balance_push_cpu_stop(void *arg) -+{ -+ struct task_struct *p = arg; -+ struct rq *rq = this_rq(); -+ struct rq_flags rf; -+ int cpu; -+ -+ raw_spin_lock_irq(&p->pi_lock); -+ rq_lock(rq, &rf); -+ -+ update_rq_clock(rq); -+ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) { -+ cpu = select_fallback_rq(rq->cpu, p); -+ rq = __migrate_task(rq, p, cpu); -+ } -+ -+ rq_unlock(rq, &rf); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ put_task_struct(p); -+ -+ return 0; -+} -+ -+static DEFINE_PER_CPU(struct cpu_stop_work, push_work); -+ -+/* -+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only -+ * effective when the hotplug motion is down. -+ */ -+static void balance_push(struct rq *rq) -+{ -+ struct task_struct *push_task = rq->curr; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ /* -+ * Ensure the thing is persistent until balance_push_set(.on = false); -+ */ -+ rq->balance_callback = &balance_push_callback; -+ -+ /* -+ * Only active while going offline and when invoked on the outgoing -+ * CPU. -+ */ -+ if (!cpu_dying(rq->cpu) || rq != this_rq()) -+ return; -+ -+ /* -+ * Both the cpu-hotplug and stop task are in this case and are -+ * required to complete the hotplug process. -+ */ -+ if (kthread_is_per_cpu(push_task) || -+ is_migration_disabled(push_task)) { -+ -+ /* -+ * If this is the idle task on the outgoing CPU try to wake -+ * up the hotplug control thread which might wait for the -+ * last task to vanish. The rcuwait_active() check is -+ * accurate here because the waiter is pinned on this CPU -+ * and can't obviously be running in parallel. -+ * -+ * On RT kernels this also has to check whether there are -+ * pinned and scheduled out tasks on the runqueue. They -+ * need to leave the migrate disabled section first. -+ */ -+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && -+ rcuwait_active(&rq->hotplug_wait)) { -+ raw_spin_unlock(&rq->lock); -+ rcuwait_wake_up(&rq->hotplug_wait); -+ raw_spin_lock(&rq->lock); -+ } -+ return; -+ } -+ -+ get_task_struct(push_task); -+ /* -+ * Temporarily drop rq->lock such that we can wake-up the stop task. -+ * Both preemption and IRQs are still disabled. -+ */ -+ raw_spin_unlock(&rq->lock); -+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, -+ this_cpu_ptr(&push_work)); -+ /* -+ * At this point need_resched() is true and we'll take the loop in -+ * schedule(). The next pick is obviously going to be the stop task -+ * which kthread_is_per_cpu() and will push this task away. -+ */ -+ raw_spin_lock(&rq->lock); -+} -+ -+static void balance_push_set(int cpu, bool on) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (on) { -+ WARN_ON_ONCE(rq->balance_callback); -+ rq->balance_callback = &balance_push_callback; -+ } else if (rq->balance_callback == &balance_push_callback) { -+ rq->balance_callback = NULL; -+ } -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+/* -+ * Invoked from a CPUs hotplug control thread after the CPU has been marked -+ * inactive. All tasks which are not per CPU kernel threads are either -+ * pushed off this CPU now via balance_push() or placed on a different CPU -+ * during wakeup. Wait until the CPU is quiescent. 
-+ */ -+static void balance_hotplug_wait(void) -+{ -+ struct rq *rq = this_rq(); -+ -+ rcuwait_wait_event(&rq->hotplug_wait, -+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq), -+ TASK_UNINTERRUPTIBLE); -+} -+ -+#else -+ -+static void balance_push(struct rq *rq) -+{ -+} -+ -+static void balance_push_set(int cpu, bool on) -+{ -+} -+ -+static inline void balance_hotplug_wait(void) -+{ -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* -+ * Clear the balance_push callback and prepare to schedule -+ * regular tasks. -+ */ -+ balance_push_set(cpu, false); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ -+ /* -+ * From this point forward, this CPU will refuse to run any task that -+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively -+ * push those tasks away until this gets cleared, see -+ * sched_cpu_dying(). -+ */ -+ balance_push_set(cpu, true); -+ -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Specifically, we rely on ttwu to no longer target this CPU, see -+ * ttwu_queue_cond() and is_cpu_allowed(). 
-+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ update_rq_clock(rq); -+ set_rq_offline(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ balance_push_set(cpu, false); -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+ -+/* -+ * Invoked immediately before the stopper thread is invoked to bring the -+ * CPU down completely. At this point all per CPU kthreads except the -+ * hotplug thread (current) and the stopper thread (inactive) have been -+ * either parked or have been unbound from the outgoing CPU. Ensure that -+ * any of those which might be on the way out are gone. -+ * -+ * If after this point a bound task is being woken on this CPU then the -+ * responsible hotplug callback has failed to do it's job. -+ * sched_cpu_dying() will catch it with the appropriate fireworks. -+ */ -+int sched_cpu_wait_empty(unsigned int cpu) -+{ -+ balance_hotplug_wait(); -+ return 0; -+} -+ -+/* -+ * Since this CPU is going 'away' for a while, fold any nr_active delta we -+ * might have. Called from the CPU stopper task after ensuring that the -+ * stopper is the last running task on the CPU, so nr_active count is -+ * stable. We need to take the teardown thread which is calling this into -+ * account, so we hand in adjust = 1 to the load calculation. -+ * -+ * Also see the comment "Global load-average calculations". 
-+ */ -+static void calc_load_migrate(struct rq *rq) -+{ -+ long delta = calc_load_fold_active(rq, 1); -+ -+ if (delta) -+ atomic_long_add(delta, &calc_load_tasks); -+} -+ -+static void dump_rq_tasks(struct rq *rq, const char *loglvl) -+{ -+ struct task_struct *g, *p; -+ int cpu = cpu_of(rq); -+ -+ lockdep_assert_held(&rq->lock); -+ -+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); -+ for_each_process_thread(g, p) { -+ if (task_cpu(p) != cpu) -+ continue; -+ -+ if (!task_on_rq_queued(p)) -+ continue; -+ -+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); -+ } -+} -+ -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { -+ WARN(true, "Dying CPU not properly vacated!"); -+ dump_rq_tasks(rq, KERN_WARNING); -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ calc_load_migrate(rq); -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ /* init topo masks */ -+ tmp = per_cpu(sched_cpu_topo_masks, cpu); -+ -+ cpumask_copy(tmp, cpumask_of(cpu)); -+ tmp++; -+ cpumask_copy(tmp, cpu_possible_mask); -+ per_cpu(sched_cpu_llc_mask, cpu) = tmp; -+ per_cpu(sched_cpu_topo_end_mask, cpu) = ++tmp; -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ -+ } -+} -+ -+#define TOPOLOGY_CPUMASK(name, mask, last)\ -+ if (cpumask_and(topo, topo, mask)) { \ -+ cpumask_copy(topo, mask); \ -+ printk(KERN_INFO "sched: cpu#%02d topo: 0x%08lx - "#name, \ -+ cpu, (topo++)->bits[0]); \ -+ } \ -+ if (!last) \ -+ cpumask_complement(topo, mask) -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *topo; -+ -+ for_each_online_cpu(cpu) { -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ topo = per_cpu(sched_cpu_topo_masks, cpu) + 1; -+ -+ cpumask_complement(topo, cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); -+#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = topo; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); -+ -+ per_cpu(sched_cpu_topo_end_mask, cpu) = topo; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ per_cpu(sched_cpu_topo_masks, cpu))); -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) -+ BUG(); -+ current->flags &= ~PF_NO_SETAFFINITY; -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct 
cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ unsigned long shares; -+#endif -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_QUEUE_BITS; i++) -+ cpumask_copy(sched_rq_watermark + i, cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ sched_queue_init(&rq->queue); -+ rq->watermark = IDLE_TASK_SCHED_PRIO; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); -+#endif -+ rq->balance_callback = &balance_push_callback; -+#ifdef CONFIG_HOTPLUG_CPU -+ rcuwait_init(&rq->hotplug_wait); -+#endif -+#endif /* CONFIG_SMP */ -+ rq->nr_switches = 0; -+ -+ hrtick_rq_init(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * The idle task doesn't need the kthread struct to function, but it -+ * is dressed up as a per-CPU kthread and thus needs to play the part -+ * if we want to avoid special-casing it in code that deals with per-CPU -+ * kthreads. -+ */ -+ WARN_ON(!set_kthread_struct(current)); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". -+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ balance_push_set(smp_processor_id(), false); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ psi_init(); -+ -+ preempt_dynamic_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ -+void __might_sleep(const char *file, int line) -+{ -+ unsigned int state = get_current_state(); -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. 
-+ */ -+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%x set at [<%p>] %pS\n", state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ __might_resched(file, line, 0); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) -+{ -+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) -+ return; -+ -+ if (preempt_count() == preempt_offset) -+ return; -+ -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, ip); -+} -+ -+static inline bool resched_offsets_ok(unsigned int offsets) -+{ -+ unsigned int nested = preempt_count(); -+ -+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; -+ -+ return nested == offsets; -+} -+ -+void __might_resched(const char *file, int line, unsigned int offsets) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((resched_offsets_ok(offsets) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(), -+ offsets & MIGHT_RESCHED_PREEMPT_MASK); -+ -+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { -+ pr_err("RCU nest depth: %d, expected: %u\n", -+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); -+ } -+ -+ if (task_stack_end_corrupted(current)) -+ pr_emerg("Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ -+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, -+ preempt_disable_ip); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(__might_resched); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+ -+#ifdef CONFIG_SMP -+void __cant_migrate(const char *file, int line) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (is_migration_disabled(current)) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > 0) -+ return; -+ -+ if (current->migration_flags & MDF_FORCE_ENABLED) -+ return; -+ -+ if 
(time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); -+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), is_migration_disabled(current), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_migrate); -+#endif -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ schedstat_set(p->stats.wait_start, 0); -+ schedstat_set(p->stats.sleep_start, 0); -+ schedstat_set(p->stats.block_start, 0); -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+static void sched_unregister_group(struct task_group *tg) -+{ -+ /* -+ * We have to wait for yet another RCU grace period to expire, as -+ * print_cfs_stats() might run concurrently. 
-+ */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_unregister_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs: */ -+ sched_unregister_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete: */ -+ call_rcu(&tg->rcu, sched_unregister_group_rcu); -+} -+ -+void sched_release_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_release_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. -+ */ -+ sched_unregister_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+#ifdef CONFIG_FAIR_GROUP_SCHED -+static DEFINE_MUTEX(shares_mutex); -+ -+int sched_group_set_shares(struct task_group *tg, unsigned long shares) -+{ -+ /* -+ * We can't change the weight of the root cgroup. 
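On the legacy hierarchy this weight is exposed as the cpu.shares file; a minimal userspace sketch, assuming a cgroup-v1 cpu controller mounted at /sys/fs/cgroup/cpu and an already-created group named "demo" (both are assumptions of this example):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/fs/cgroup/cpu/demo/cpu.shares", "w");

		if (!f) {
			perror("cpu.shares");
			return 1;
		}
		fprintf(f, "512\n");	/* half the default weight of 1024 */
		fclose(f);
		return 0;
	}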
-+ */ -+ if (&root_task_group == tg) -+ return -EINVAL; -+ -+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); -+ -+ mutex_lock(&shares_mutex); -+ if (tg->shares == shares) -+ goto done; -+ -+ tg->shares = shares; -+done: -+ mutex_unlock(&shares_mutex); -+ return 0; -+} -+ -+static int cpu_shares_write_u64(struct cgroup_subsys_state *css, -+ struct cftype *cftype, u64 shareval) -+{ -+ if (shareval > scale_load_down(ULONG_MAX)) -+ shareval = MAX_SHARES; -+ return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+} -+ -+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, -+ struct cftype *cft) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ return (u64) scale_load_down(tg->shares); -+} -+#endif -+ -+static struct cftype cpu_legacy_files[] = { -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ { -+ .name = "shares", -+ .read_u64 = cpu_shares_read_u64, -+ .write_u64 = cpu_shares_write_u64, -+ }, -+#endif -+ { } /* Terminate */ -+}; -+ -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c -new file mode 100644 -index 000000000000..1212a031700e ---- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the alt scheduler debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..a181bf9ce57d ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,645 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H -+ -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "../workqueue_internal.h" -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_BMQ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+/* bits: RT(0-99), reserved(100-127), NORMAL_PRIO_NUM, cpu idle task */ -+#define SCHED_BITS (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM + 1) -+#endif /* CONFIG_SCHED_PDS */ -+ -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+extern void resched_latency_warn(int cpu, u64 latency); -+#else -+# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -+static inline void resched_latency_warn(int cpu, u64 latency) {} -+#endif -+ -+/* -+ * Increase resolution of nice-level calculations for 64-bit architectures. -+ * The extra resolution improves shares distribution and load balancing of -+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup -+ * hierarchies, especially on larger systems. This is not a user-visible change -+ * and does not change the user-interface for setting shares/weights. -+ * -+ * We increase resolution only if we have enough bits to allow this increased -+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit -+ * are pretty high and the returns do not justify the increased costs. -+ * -+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to -+ * increase coverage and consistency always enable it on 64-bit platforms. -+ */ -+#ifdef CONFIG_64BIT -+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) -+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -+# define scale_load_down(w) \ -+({ \ -+ unsigned long __w = (w); \ -+ if (__w) \ -+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ -+ __w; \ -+}) -+#else -+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) -+# define scale_load(w) (w) -+# define scale_load_down(w) (w) -+#endif -+ -+#ifdef CONFIG_FAIR_GROUP_SCHED -+#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD -+ -+/* -+ * A weight of 0 or 1 can cause arithmetics problems. -+ * A weight of a cfs_rq is the sum of weights of which entities -+ * are queued on this cfs_rq, so a weight of a entity should not be -+ * too large, so as the shares value of a task group. -+ * (The default weight is 1024 - so there's no practical -+ * limitation from this.) 
-+ */ -+#define MIN_SHARES (1UL << 1) -+#define MAX_SHARES (1UL << 18) -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_rq */ -+ -+#define SCHED_QUEUE_BITS (SCHED_BITS - 1) -+ -+struct sched_queue { -+ DECLARE_BITMAP(bitmap, SCHED_QUEUE_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+ struct sched_queue queue; -+#ifdef CONFIG_SCHED_PDS -+ u64 time_edge; -+#endif -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_SCHED_DEBUG -+ u64 last_seen_need_resched_ns; -+ int ticks_without_resched; -+#endif -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ unsigned int ttwu_pending; -+ unsigned char nohz_idle_balance; -+ unsigned char idle_balance; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+ struct callback_head *balance_callback; -+#ifdef CONFIG_HOTPLUG_CPU -+ struct rcuwait hotplug_wait; -+#endif -+ unsigned int nr_pinned; -+ -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* For genenal cpu load util */ -+ s32 load_history; -+ u64 load_block; -+ u64 load_stamp; -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned int nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+ ktime_t hrtick_time; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#ifdef CONFIG_SMP -+ call_single_data_t nohz_csd; -+#endif -+ atomic_t nohz_flags; -+#endif /* CONFIG_NO_HZ_COMMON */ -+}; -+ -+extern unsigned long rq_load_util(struct rq *rq, unsigned long max); -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern bool sched_smp_initialized; -+ -+enum { -+ ITSELF_LEVEL_SPACE_HOLDER, -+#ifdef CONFIG_SCHED_SMT -+ SMT_LEVEL_SPACE_HOLDER, -+#endif -+ COREGROUP_LEVEL_SPACE_HOLDER, -+ CORE_LEVEL_SPACE_HOLDER, -+ OTHER_LEVEL_SPACE_HOLDER, -+ NR_CPU_AFFINITY_LEVELS -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks); -+DECLARE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+ -+static inline int -+__best_mask_cpu(const cpumask_t *cpumask, const cpumask_t *mask) -+{ -+ int cpu; -+ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ -+ return cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *mask) -+{ -+ return __best_mask_cpu(mask, per_cpu(sched_cpu_topo_masks, cpu)); -+} -+ -+extern void flush_smp_call_function_queue(void); -+ -+#else /* !CONFIG_SMP */ -+static inline void flush_smp_call_function_queue(void) { } -+#endif -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ 
need to support these APIs for compatibility with mainline -+ * scheduler code. -+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_lock(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(&rq->lock); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline void -+rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline raw_spinlock_t *__rq_lockp(struct rq *rq) -+{ -+ return &rq->lock; -+} -+ -+static inline raw_spinlock_t *rq_lockp(struct rq *rq) -+{ -+ return __rq_lockp(rq); -+} -+ -+static inline void lockdep_assert_rq_held(struct rq *rq) -+{ -+ lockdep_assert_held(__rq_lockp(rq)); -+} -+ -+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); -+extern void raw_spin_rq_unlock(struct rq *rq); -+ -+static inline void raw_spin_rq_lock(struct rq *rq) -+{ -+ raw_spin_rq_lock_nested(rq, 0); -+} -+ -+static inline void raw_spin_rq_lock_irq(struct rq *rq) -+{ -+ local_irq_disable(); -+ raw_spin_rq_lock(rq); -+} -+ -+static inline void raw_spin_rq_unlock_irq(struct rq *rq) -+{ -+ raw_spin_rq_unlock(rq); -+ local_irq_enable(); -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern int task_running_nice(struct task_struct *p); -+ -+extern struct static_key_false sched_schedstats; -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#define NOHZ_BALANCE_KICK_BIT 0 -+#define NOHZ_STATS_KICK_BIT 1 -+ -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -+ -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) -+ -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -+ -+/* TODO: needed? 
-+extern void nohz_balance_exit_idle(struct rq *rq); -+#else -+static inline void nohz_balance_exit_idle(struct rq *rq) { } -+*/ -+#endif -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. -+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. 
-+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+extern void swake_up_all_locked(struct swait_queue_head *q); -+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+extern int preempt_dynamic_mode; -+extern int sched_dynamic_mode(const char *str); -+extern void sched_dynamic_update(int mode); -+#endif -+ -+static inline void nohz_run_idle_balance(int cpu) { } -+ -+static inline -+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, -+ struct task_struct *p) -+{ -+ return util; -+} -+ -+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } -+ -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..66b77291b9d0 ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,110 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+/* -+ * BMQ only routines -+ */ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+/* -+ * Common interfaces -+ */ -+static inline void sched_timeslice_imp(const int timeslice_ms) {} -+ -+static inline int -+task_sched_prio_normal(const struct task_struct *p, const struct rq *rq) -+{ -+ return p->prio + p->boost_prio - MAX_RT_PRIO; -+} -+ -+static inline int task_sched_prio(const struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; -+} -+ -+static inline int -+task_sched_prio_idx(const struct task_struct *p, const struct rq *rq) -+{ -+ return task_sched_prio(p); -+} -+ -+static inline int sched_prio2idx(int prio, struct rq *rq) -+{ -+ return prio; -+} -+ -+static inline int sched_idx2prio(int idx, struct rq *rq) -+{ -+ return idx; -+} -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq, task_sched_prio_idx(p, rq)); -+ } -+} -+ -+static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) {} -+ -+inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+#ifdef CONFIG_SMP -+static inline void sched_task_ttwu(struct task_struct *p) -+{ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+} -+#endif -+ -+static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) -+{ -+ if (rq_switch_time(rq) < boost_threshold(p)) -+ boost_task(p); -+} -+ -+static inline void update_rq_time_edge(struct rq *rq) {} -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab3773f..71a25540d65e 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -42,13 +42,19 @@ - - #include "idle.c" - -+#ifndef CONFIG_SCHED_ALT - #include "rt.c" -+#endif - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - # include "cpudeadline.c" -+#endif - # include "pelt.c" - #endif - - #include "cputime.c" --#include "deadline.c" - -+#ifndef CONFIG_SCHED_ALT -+#include "deadline.c" -+#endif -diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c -index 99bdd96f454f..23f80a86d2d7 100644 ---- a/kernel/sched/build_utility.c -+++ b/kernel/sched/build_utility.c -@@ -85,7 +85,9 @@ - - #ifdef CONFIG_SMP - # include "cpupri.c" -+#ifndef CONFIG_SCHED_ALT - # include "stop_task.c" -+#endif - # include "topology.c" - #endif - -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 3dbf351d12d5..b2590f961139 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -160,9 +160,14 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) - unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); - - sg_cpu->max = max; -+#ifndef CONFIG_SCHED_ALT - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max, - FREQUENCY_UTIL, NULL); -+#else -+ sg_cpu->bw_dl = 0; -+ sg_cpu->util = rq_load_util(rq, max); -+#endif /* CONFIG_SCHED_ALT */ - } - - /** -@@ -306,8 +311,10 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_cpu->sg_policy->limits_changed = true; -+#endif - } - - static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, -@@ -607,6 +614,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = 
sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -839,7 +847,9 @@ cpufreq_governor_init(schedutil_gov); - #ifdef CONFIG_ENERGY_MODEL - static void rebuild_sd_workfn(struct work_struct *work) - { -+#ifndef CONFIG_SCHED_ALT - rebuild_sched_domains_energy(); -+#endif /* CONFIG_SCHED_ALT */ - } - static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 78a233d43757..b3bbc87d4352 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. */ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - task_group_account_field(p, CPUTIME_NICE, cputime); - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -611,7 +611,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - if (task_cputime(p, &cputime.utime, &cputime.stime)) -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index bb3d63bdf4ae..4e1680785704 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -7,6 +7,7 @@ - * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - */ - -+#ifndef CONFIG_SCHED_ALT - /* - * This allows printing both to /proc/sched_debug and - * to the console -@@ -215,6 +216,7 @@ static const struct file_operations sched_scaling_fops = { - }; - - #endif /* SMP */ -+#endif /* !CONFIG_SCHED_ALT */ - - #ifdef CONFIG_PREEMPT_DYNAMIC - -@@ -278,6 +280,7 @@ static const struct file_operations sched_dynamic_fops = { - - #endif /* CONFIG_PREEMPT_DYNAMIC */ - -+#ifndef CONFIG_SCHED_ALT - __read_mostly bool sched_debug_verbose; - - static const struct seq_operations sched_debug_sops; -@@ -293,6 +296,7 @@ static const struct file_operations sched_debug_fops = { - .llseek = seq_lseek, - .release = seq_release, - }; -+#endif /* !CONFIG_SCHED_ALT */ - - static struct dentry *debugfs_sched; - -@@ -302,12 +306,15 @@ static __init int sched_init_debug(void) - - debugfs_sched = debugfs_create_dir("sched", NULL); - -+#ifndef CONFIG_SCHED_ALT - debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); - debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); -+#endif /* !CONFIG_SCHED_ALT */ - #ifdef CONFIG_PREEMPT_DYNAMIC - debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); 
- #endif - -+#ifndef CONFIG_SCHED_ALT - debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); - debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); - debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); -@@ -336,11 +343,13 @@ static __init int sched_init_debug(void) - #endif - - debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); -+#endif /* !CONFIG_SCHED_ALT */ - - return 0; - } - late_initcall(sched_init_debug); - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_SMP - - static cpumask_var_t sd_sysctl_cpus; -@@ -1067,6 +1076,7 @@ void proc_sched_set_task(struct task_struct *p) - memset(&p->stats, 0, sizeof(p->stats)); - #endif - } -+#endif /* !CONFIG_SCHED_ALT */ - - void resched_latency_warn(int cpu, u64 latency) - { -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 328cccbee444..aef991facc79 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -400,6 +400,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. - */ -@@ -521,3 +522,4 @@ DEFINE_SCHED_CLASS(idle) = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h -new file mode 100644 -index 000000000000..56a649d02e49 ---- /dev/null -+++ b/kernel/sched/pds.h -@@ -0,0 +1,127 @@ -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+static int sched_timeslice_shift = 22; -+ -+#define NORMAL_PRIO_MOD(x) ((x) & (NORMAL_PRIO_NUM - 1)) -+ -+/* -+ * Common interfaces -+ */ -+static inline void sched_timeslice_imp(const int timeslice_ms) -+{ -+ if (2 == timeslice_ms) -+ sched_timeslice_shift = 21; -+} -+ -+static inline int -+task_sched_prio_normal(const struct task_struct *p, const struct rq *rq) -+{ -+ s64 delta = p->deadline - rq->time_edge + NORMAL_PRIO_NUM - NICE_WIDTH; -+ -+ if (WARN_ONCE(delta > NORMAL_PRIO_NUM - 1, -+ "pds: task_sched_prio_normal() delta %lld\n", delta)) -+ return NORMAL_PRIO_NUM - 1; -+ -+ return (delta < 0) ? 0 : delta; -+} -+ -+static inline int task_sched_prio(const struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO) ? p->prio : -+ MIN_NORMAL_PRIO + task_sched_prio_normal(p, task_rq(p)); -+} -+ -+static inline int -+task_sched_prio_idx(const struct task_struct *p, const struct rq *rq) -+{ -+ return (p->prio < MAX_RT_PRIO) ? p->prio : MIN_NORMAL_PRIO + -+ NORMAL_PRIO_MOD(task_sched_prio_normal(p, rq) + rq->time_edge); -+} -+ -+static inline int sched_prio2idx(int prio, struct rq *rq) -+{ -+ return (IDLE_TASK_SCHED_PRIO == prio || prio < MAX_RT_PRIO) ? prio : -+ MIN_NORMAL_PRIO + NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) + -+ rq->time_edge); -+} -+ -+static inline int sched_idx2prio(int idx, struct rq *rq) -+{ -+ return (idx < MAX_RT_PRIO) ? 
idx : MIN_NORMAL_PRIO + -+ NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM - -+ NORMAL_PRIO_MOD(rq->time_edge)); -+} -+ -+static inline void sched_renew_deadline(struct task_struct *p, const struct rq *rq) -+{ -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = (rq->clock >> sched_timeslice_shift) + -+ p->static_prio - (MAX_PRIO - NICE_WIDTH); -+} -+ -+int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio > DEFAULT_PRIO); -+} -+ -+static inline void update_rq_time_edge(struct rq *rq) -+{ -+ struct list_head head; -+ u64 old = rq->time_edge; -+ u64 now = rq->clock >> sched_timeslice_shift; -+ u64 prio, delta; -+ -+ if (now == old) -+ return; -+ -+ delta = min_t(u64, NORMAL_PRIO_NUM, now - old); -+ INIT_LIST_HEAD(&head); -+ -+ for_each_set_bit(prio, &rq->queue.bitmap[2], delta) -+ list_splice_tail_init(rq->queue.heads + MIN_NORMAL_PRIO + -+ NORMAL_PRIO_MOD(prio + old), &head); -+ -+ rq->queue.bitmap[2] = (NORMAL_PRIO_NUM == delta) ? 0UL : -+ rq->queue.bitmap[2] >> delta; -+ rq->time_edge = now; -+ if (!list_empty(&head)) { -+ u64 idx = MIN_NORMAL_PRIO + NORMAL_PRIO_MOD(now); -+ struct task_struct *p; -+ -+ list_for_each_entry(p, &head, sq_node) -+ p->sq_idx = idx; -+ -+ list_splice(&head, rq->queue.heads + idx); -+ rq->queue.bitmap[2] |= 1UL; -+ } -+} -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ sched_renew_deadline(p, rq); -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq, task_sched_prio_idx(p, rq)); -+} -+ -+static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) -+{ -+ u64 max_dl = rq->time_edge + NICE_WIDTH - 1; -+ if (unlikely(p->deadline > max_dl)) -+ p->deadline = max_dl; -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ sched_renew_deadline(p, rq); -+} -+ -+static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ time_slice_expired(p, rq); -+} -+ -+#ifdef CONFIG_SMP -+static inline void sched_task_ttwu(struct task_struct *p) {} -+#endif -+static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index 0f310768260c..bd38bf738fe9 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -266,6 +266,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * sched_entity: - * -@@ -383,8 +384,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - /* - * thermal: - * -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index 4ff2ed4f8fa1..226eeed61318 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,13 +1,15 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_ALT - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - int update_thermal_load_avg(u64 now, struct rq *rq, u64 
capacity); - - static inline u64 thermal_load_avg(struct rq *rq) -@@ -44,6 +46,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) - return PELT_MIN_DIVIDER + avg->period_contrib; - } - -+#ifndef CONFIG_SCHED_ALT - static inline void cfs_se_util_change(struct sched_avg *avg) - { - unsigned int enqueued; -@@ -155,9 +158,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_ALT */ - - #else - -+#ifndef CONFIG_SCHED_ALT - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -175,6 +180,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 47b89a0fc6e5..de2641a32c22 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -5,6 +5,10 @@ - #ifndef _KERNEL_SCHED_SCHED_H - #define _KERNEL_SCHED_SCHED_H - -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" -+#else -+ - #include - #include - #include -@@ -3116,4 +3120,9 @@ extern int sched_dynamic_mode(const char *str); - extern void sched_dynamic_update(int mode); - #endif - -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 857f837f52cb..5486c63e4790 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -125,8 +125,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -143,6 +145,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -171,6 +174,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h -index baa839c1ba96..15238be0581b 100644 ---- a/kernel/sched/stats.h -+++ b/kernel/sched/stats.h -@@ -89,6 +89,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt - - #endif /* CONFIG_SCHEDSTATS */ - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity_stats { - struct sched_entity se; -@@ -105,6 +106,7 @@ __schedstats_from_se(struct sched_entity *se) - #endif - return &task_of(se)->stats; - } -+#endif /* CONFIG_SCHED_ALT */ - - #ifdef CONFIG_PSI - /* -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 05b6c2ad90b9..480ef393b3c9 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -3,6 +3,7 @@ - * Scheduler topology setup/handling methods - */ - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1413,8 +1414,10 @@ static void asym_cpu_capacity_scan(void) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1647,6 +1650,7 @@ sd_init(struct sched_domain_topology_level 
*tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. -@@ -1683,6 +1687,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology_saved = NULL; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2638,3 +2643,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 35d034219513..23719c728677 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -86,6 +86,10 @@ - - /* Constants used for minimum and maximum */ - -+#ifdef CONFIG_SCHED_ALT -+extern int sched_yield_type; -+#endif -+ - #ifdef CONFIG_PERF_EVENTS - static const int six_hundred_forty_kb = 640 * 1024; - #endif -@@ -1590,6 +1594,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing", -@@ -1601,6 +1606,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_FOUR, - }, - #endif /* CONFIG_NUMA_BALANCING */ -+#endif /* !CONFIG_SCHED_ALT */ - { - .procname = "panic", - .data = &panic_timeout, -@@ -1902,6 +1908,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_ALT -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_TWO, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 0ea8702eb516..a27a0f3a654d 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -2088,8 +2088,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - hrtimer_init_sleeper_on_stack(&t, clockid, mode); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index cb925e8ef9a8..67d823510f5c 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -223,7 +223,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -866,6 +866,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_ALT - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -873,6 +874,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int 
signo, bool rt, bool hard) - { -@@ -900,8 +902,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -915,7 +919,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1151,8 +1155,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index a2d301f58ced..2ccdede8585c 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1143,10 +1143,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ/PDS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/sys-kernel/pinephone-pro-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch b/sys-kernel/pinephone-pro-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch deleted file mode 100644 index 6b2049d..0000000 --- a/sys-kernel/pinephone-pro-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- a/init/Kconfig 2022-07-07 13:22:00.698439887 -0400 -+++ b/init/Kconfig 2022-07-07 13:23:45.152333576 -0400 -@@ -874,8 +874,9 @@ config UCLAMP_BUCKETS_COUNT - If in doubt, use the default value. 
- - menuconfig SCHED_ALT -+ depends on X86_64 - bool "Alternative CPU Schedulers" -- default y -+ default n - help - This feature enable alternative CPU scheduler" - diff --git a/sys-kernel/pinephone-pro-sources/pinephone-pro-sources-5.19.3.ebuild b/sys-kernel/pinephone-pro-sources/pinephone-pro-sources-5.19.3.ebuild deleted file mode 100644 index f6def35..0000000 --- a/sys-kernel/pinephone-pro-sources/pinephone-pro-sources-5.19.3.ebuild +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 1999-2021 Gentoo Authors -# Distributed under the terms of the GNU General Public License v2 - -EAPI="8" -K_NOUSENAME="yes" -K_NOSETEXTRAVERSION="yes" -K_SECURITY_UNSUPPORTED="1" -K_GENPATCHES_VER="1" -ETYPE="sources" -inherit kernel-2 -detect_version - -KEYWORDS="~arm64" - -DEPEND="${RDEPEND} - >=sys-devel/patch-2.7.5" - -DESCRIPTION="Full sources for the Linux kernel, with megi's patch for pinephone and gentoo patchset" - -MEGI_TAG="orange-pi-5.19-20220822-1337" -SRC_URI="https://github.com/megous/linux/archive/${MEGI_TAG}.tar.gz" - -PATCHES=( - #Gentoo Patches - ${FILESDIR}/1500_XATTR_USER_PREFIX.patch - ${FILESDIR}/1510_fs-enable-link-security-restrictions-by-default.patch - ${FILESDIR}/1700_sparc-address-warray-bound-warnings.patch - ${FILESDIR}/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch - ${FILESDIR}/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch - ${FILESDIR}/2920_sign-file-patch-for-libressl.patch - ${FILESDIR}/3000_Support-printing-firmware-info.patch - ${FILESDIR}/4567_distro-Gentoo-Kconfig.patch - ${FILESDIR}/5010_enable-cpu-optimizations-universal.patch - ${FILESDIR}/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch - ${FILESDIR}/5021_BMQ-and-PDS-gentoo-defaults.patch - - #PinePhone Patches - ${FILESDIR}/0101-arm64-dts-pinephone-drop-modem-power-node.patch - ${FILESDIR}/0102-arm64-dts-pinephone-pro-remove-modem-node.patch - ${FILESDIR}/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch -) - -S="${WORKDIR}/linux-${MEGI_TAG}" - -src_unpack() { - default -} - -src_prepare() { - default - eapply_user -} - -pkg_postinst() { - kernel-2_pkg_postinst - einfo "To build and install the kernel use the following commands:" - einfo "# make Image modules" - einfo "# make DTC_FLAGS="-@" dtbs" - einfo "# cp arch/arm64/boot/Image /boot" - einfo "# make INSTALL_MOD_PATH=/ modules_intall" - einfo "# make INSTALL_DTBS_PATH=/boot/dtbs dtbs_install" - einfo "You will need to create and initramfs afterwards." 
- einfo "If you use dracut you can run:" - einfo "# dracut -m \"rootfs-block base\" --host-only --kver 5.19.2-pinehone-gentoo-arm64" - einfo "Change 5.19.2-pinehone-gentoo-arm64 to your kernel version installed in /lib/modules" -} - -pkg_postrm() { - kernel-2_pkg_postrm -} diff --git a/sys-kernel/pinephone-sources/Manifest b/sys-kernel/pinephone-sources/Manifest index 3a33b67..91e1408 100644 --- a/sys-kernel/pinephone-sources/Manifest +++ b/sys-kernel/pinephone-sources/Manifest @@ -1 +1 @@ -DIST orange-pi-5.19-20220802-0940.tar.gz 214990340 BLAKE2B 9bbadd06a8d160d716838d709f7ca6adb6143cb2205337940fb2d4607f0b806400cc77fb4abd36856844536b0a4ced92737658fc7af60d10f141a21116d66eed SHA512 04d46f6065a138d3b206937fada3990f823a1937c14812bada6512d04ebf1c7634cdea0a57611066bd2b4951a38c8e354b187bffe2ca738f2fe2a3f50d922dc2 +DIST orange-pi-5.19-20220909-1622.tar.gz 215047997 BLAKE2B 8d9b57d5e4c52e08caf97749912ba14eff7b328eb8fa6e00ba5a7f3bf47b4064c1272162602fdbda9852eea6f7473033c01b491ef09ca6a9aa3ee0f1375145ac SHA512 c2d085522c0332d6b95dde22af92c7c2a8941f94714d9d2c83249d4ddd921fe0a85226b8a09715ca37dfe0874315dd97d0d4c5511f8fe315cb29a9fef99a1109 diff --git a/sys-kernel/pinephone-pro-sources/files/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch b/sys-kernel/pinephone-sources/files/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch similarity index 100% rename from sys-kernel/pinephone-pro-sources/files/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch rename to sys-kernel/pinephone-sources/files/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch diff --git a/sys-kernel/pinephone-sources/files/0104-PPP-Add-reset-resume-to-usb_wwan.patch b/sys-kernel/pinephone-sources/files/0104-PPP-Add-reset-resume-to-usb_wwan.patch new file mode 100644 index 0000000..be8499f --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0104-PPP-Add-reset-resume-to-usb_wwan.patch @@ -0,0 +1,21 @@ +From 94ee175a91b2c132ca3068ee04cb2766c9f47cd7 Mon Sep 17 00:00:00 2001 +From: Hendrik Borghorst +Date: Fri, 10 Jun 2022 15:36:29 +0200 +Subject: [PATCH] PPP: Add reset resume to usb_wwan + +--- + drivers/usb/serial/option.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index e60425bbf53764..08fb844c534bf6 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -2176,6 +2176,7 @@ static struct usb_serial_driver option_1port_device = { + #ifdef CONFIG_PM + .suspend = usb_wwan_suspend, + .resume = usb_wwan_resume, ++ .reset_resume = usb_wwan_resume, + #endif + }; + diff --git a/sys-kernel/pinephone-sources/files/0104-Revert-usb-quirks-Add-USB_QUIRK_RESET-for-Quectel-EG25G.patch b/sys-kernel/pinephone-sources/files/0104-Revert-usb-quirks-Add-USB_QUIRK_RESET-for-Quectel-EG25G.patch new file mode 100644 index 0000000..daf2895 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0104-Revert-usb-quirks-Add-USB_QUIRK_RESET-for-Quectel-EG25G.patch @@ -0,0 +1,25 @@ +From f57b0185c93bc94c3fedbcbb274d3e032972301a Mon Sep 17 00:00:00 2001 +From: Hendrik Borghorst +Date: Fri, 10 Jun 2022 09:19:39 +0200 +Subject: [PATCH] Revert "usb: quirks: Add USB_QUIRK_RESET for Quectel EG25G + Modem" + +This reverts commit 62867934f6251349e1352a345f827ba8de514a36. 
+--- + drivers/usb/core/quirks.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c +index 17c3d472fb1304..f99a65a64588fe 100644 +--- a/drivers/usb/core/quirks.c ++++ b/drivers/usb/core/quirks.c +@@ -519,9 +519,6 @@ static const struct usb_device_id usb_quirk_list[] = { + /* INTEL VALUE SSD */ + { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, + +- /* Quectel EG25G Modem */ +- { USB_DEVICE(0x2c7c, 0x0125), .driver_info = USB_QUIRK_RESET }, +- + { } /* terminating entry must be last */ + }; + diff --git a/sys-kernel/pinephone-sources/files/0104-rk818_charger-use-type-battery-again.patch b/sys-kernel/pinephone-sources/files/0104-rk818_charger-use-type-battery-again.patch new file mode 100644 index 0000000..74ed979 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0104-rk818_charger-use-type-battery-again.patch @@ -0,0 +1,11 @@ +--- a/drivers/power/supply/rk818_charger.c 2022-01-28 17:51:57.000000000 +0100 ++++ b/drivers/power/supply/rk818_charger.c 2022-02-02 15:06:51.303222817 +0100 +@@ -522,7 +522,7 @@ static enum power_supply_property rk818_ + */ + static const struct power_supply_desc rk818_charger_desc = { + .name = "rk818-charger", +- .type = POWER_SUPPLY_TYPE_MAINS, ++ .type = POWER_SUPPLY_TYPE_BATTERY, + .properties = rk818_charger_props, + .num_properties = ARRAY_SIZE(rk818_charger_props), + .property_is_writeable = rk818_charger_prop_writeable, diff --git a/sys-kernel/pinephone-sources/files/0106-sound-rockchip-i2s-Dont-disable-mclk-on-suspend.patch b/sys-kernel/pinephone-sources/files/0106-sound-rockchip-i2s-Dont-disable-mclk-on-suspend.patch new file mode 100644 index 0000000..04cc463 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0106-sound-rockchip-i2s-Dont-disable-mclk-on-suspend.patch @@ -0,0 +1,29 @@ +From 5f41055235786657509233557a3ca2950c401ec5 Mon Sep 17 00:00:00 2001 +From: marcin +Date: Wed, 15 Jun 2022 03:46:13 +0200 +Subject: [PATCH] sound/rockchip/i2s: Don't disable mclk on suspend + +This is a workaround to fix an issue with high-pitch sound after +suspend. 
+ +This patch is actually authored by Biktorgj +--- + sound/soc/rockchip/rockchip_i2s.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sound/soc/rockchip/rockchip_i2s.c b/sound/soc/rockchip/rockchip_i2s.c +index 4ce5d2579387..1f9d4e5e36b2 100644 +--- a/sound/soc/rockchip/rockchip_i2s.c ++++ b/sound/soc/rockchip/rockchip_i2s.c +@@ -61,7 +61,7 @@ static int i2s_runtime_suspend(struct device *dev) + struct rk_i2s_dev *i2s = dev_get_drvdata(dev); + + regcache_cache_only(i2s->regmap, true); +- clk_disable_unprepare(i2s->mclk); ++ //clk_disable_unprepare(i2s->mclk); + + return 0; + } +-- +GitLab + diff --git a/sys-kernel/pinephone-sources/files/0201-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch b/sys-kernel/pinephone-sources/files/0201-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch new file mode 100644 index 0000000..e7d4da5 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0201-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch @@ -0,0 +1,150 @@ +--- b/drivers/video/fbdev/core/bitblit.c ++++ a/drivers/video/fbdev/core/bitblit.c +@@ -234,7 +234,7 @@ + } + + static void bit_cursor(struct vc_data *vc, struct fb_info *info, int mode, ++ int softback_lines, int fg, int bg) +- int fg, int bg) + { + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; +@@ -247,6 +247,15 @@ + + cursor.set = 0; + ++ if (softback_lines) { ++ if (y + softback_lines >= vc->vc_rows) { ++ mode = CM_ERASE; ++ ops->cursor_flash = 0; ++ return; ++ } else ++ y += softback_lines; ++ } ++ + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height)); +--- b/drivers/video/fbdev/core/fbcon.c ++++ a/drivers/video/fbdev/core/fbcon.c +@@ -394,7 +394,7 @@ + c = scr_readw((u16 *) vc->vc_pos); + mode = (!ops->cursor_flash || ops->cursor_state.enable) ? + CM_ERASE : CM_DRAW; ++ ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), +- ops->cursor(vc, info, mode, get_color(vc, info, c, 1), + get_color(vc, info, c, 0)); + console_unlock(); + } +@@ -1345,7 +1345,7 @@ + + ops->cursor_flash = (mode == CM_ERASE) ? 
0 : 1; + ++ ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), +- ops->cursor(vc, info, mode, get_color(vc, info, c, 1), + get_color(vc, info, c, 0)); + } + +--- b/drivers/video/fbdev/core/fbcon.h ++++ a/drivers/video/fbdev/core/fbcon.h +@@ -62,7 +62,7 @@ + void (*clear_margins)(struct vc_data *vc, struct fb_info *info, + int color, int bottom_only); + void (*cursor)(struct vc_data *vc, struct fb_info *info, int mode, ++ int softback_lines, int fg, int bg); +- int fg, int bg); + int (*update_start)(struct fb_info *info); + int (*rotate_font)(struct fb_info *info, struct vc_data *vc); + struct fb_var_screeninfo var; /* copy of the current fb_var_screeninfo */ +--- b/drivers/video/fbdev/core/fbcon_ccw.c ++++ a/drivers/video/fbdev/core/fbcon_ccw.c +@@ -219,7 +219,7 @@ + } + + static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, ++ int softback_lines, int fg, int bg) +- int fg, int bg) + { + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; +@@ -236,6 +236,15 @@ + + cursor.set = 0; + ++ if (softback_lines) { ++ if (y + softback_lines >= vc->vc_rows) { ++ mode = CM_ERASE; ++ ops->cursor_flash = 0; ++ return; ++ } else ++ y += softback_lines; ++ } ++ + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); +--- b/drivers/video/fbdev/core/fbcon_cw.c ++++ a/drivers/video/fbdev/core/fbcon_cw.c +@@ -202,7 +202,7 @@ + } + + static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, ++ int softback_lines, int fg, int bg) +- int fg, int bg) + { + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; +@@ -219,6 +219,15 @@ + + cursor.set = 0; + ++ if (softback_lines) { ++ if (y + softback_lines >= vc->vc_rows) { ++ mode = CM_ERASE; ++ ops->cursor_flash = 0; ++ return; ++ } else ++ y += softback_lines; ++ } ++ + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); +--- b/drivers/video/fbdev/core/fbcon_ud.c ++++ a/drivers/video/fbdev/core/fbcon_ud.c +@@ -249,7 +249,7 @@ + } + + static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, ++ int softback_lines, int fg, int bg) +- int fg, int bg) + { + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; +@@ -267,6 +267,15 @@ + + cursor.set = 0; + ++ if (softback_lines) { ++ if (y + softback_lines >= vc->vc_rows) { ++ mode = CM_ERASE; ++ ops->cursor_flash = 0; ++ return; ++ } else ++ y += softback_lines; ++ } ++ + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.height)); +--- b/drivers/video/fbdev/core/tileblit.c ++++ a/drivers/video/fbdev/core/tileblit.c +@@ -80,7 +80,7 @@ + } + + static void tile_cursor(struct vc_data *vc, struct fb_info *info, int mode, ++ int softback_lines, int fg, int bg) +- int fg, int bg) + { + struct fb_tilecursor cursor; + int use_sw = (vc->vc_cursor_type & 0x10); diff --git a/sys-kernel/pinephone-sources/files/0202-revert-fbcon-remove-no-op-fbcon_set_origin.patch b/sys-kernel/pinephone-sources/files/0202-revert-fbcon-remove-no-op-fbcon_set_origin.patch new file mode 100644 index 0000000..6491c54 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0202-revert-fbcon-remove-no-op-fbcon_set_origin.patch @@ -0,0 +1,31 @@ +--- b/drivers/video/fbdev/core/fbcon.c ++++ a/drivers/video/fbdev/core/fbcon.c +@@ -163,6 +163,8 @@ + + #define advance_row(p, delta) (unsigned short 
*)((unsigned long)(p) + (delta) * vc->vc_size_row) + ++static int fbcon_set_origin(struct vc_data *); ++ + static int fbcon_cursor_noblink; + + #define divides(a, b) ((!(a) || (b)%(a)) ? 0 : 1) +@@ -2633,6 +2635,11 @@ + } + } + ++static int fbcon_set_origin(struct vc_data *vc) ++{ ++ return 0; ++} ++ + void fbcon_suspended(struct fb_info *info) + { + struct vc_data *vc = NULL; +@@ -3103,6 +3110,7 @@ + .con_font_default = fbcon_set_def_font, + .con_font_copy = fbcon_copy_font, + .con_set_palette = fbcon_set_palette, ++ .con_set_origin = fbcon_set_origin, + .con_invert_region = fbcon_invert_region, + .con_screen_pos = fbcon_screen_pos, + .con_getxy = fbcon_getxy, diff --git a/sys-kernel/pinephone-sources/files/0203-revert-fbcon-remove-soft-scrollback-code.patch b/sys-kernel/pinephone-sources/files/0203-revert-fbcon-remove-soft-scrollback-code.patch new file mode 100644 index 0000000..a3950bb --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0203-revert-fbcon-remove-soft-scrollback-code.patch @@ -0,0 +1,500 @@ +--- b/drivers/video/fbdev/core/fbcon.c ++++ a/drivers/video/fbdev/core/fbcon.c +@@ -124,6 +124,12 @@ static int logo_lines; + /* logo_shown is an index to vc_cons when >= 0; otherwise follows FBCON_LOGO + enums. */ + static int logo_shown = FBCON_LOGO_CANSHOW; ++/* Software scrollback */ ++static int fbcon_softback_size = 32768; ++static unsigned long softback_buf, softback_curr; ++static unsigned long softback_in; ++static unsigned long softback_top, softback_end; ++static int softback_lines; + /* console mappings */ + static unsigned int first_fb_vc; + static unsigned int last_fb_vc = MAX_NR_CONSOLES - 1; +@@ -163,6 +169,8 @@ static int margin_color; + + static const struct consw fb_con; + ++#define CM_SOFTBACK (8) ++ + #define advance_row(p, delta) (unsigned short *)((unsigned long)(p) + (delta) * vc->vc_size_row) + + static int fbcon_set_origin(struct vc_data *); +@@ -347,6 +355,18 @@ static int get_color(struct vc_data *vc, + return color; + } + ++static void fbcon_update_softback(struct vc_data *vc) ++{ ++ int l = fbcon_softback_size / vc->vc_size_row; ++ ++ if (l > 5) ++ softback_end = softback_buf + l * vc->vc_size_row; ++ else ++ /* Smaller scrollback makes no sense, and 0 would screw ++ the operation totally */ ++ softback_top = 0; ++} ++ + static void fb_flashcursor(struct work_struct *work) + { + struct fbcon_ops *ops = container_of(work, struct fbcon_ops, cursor_work.work); +@@ -379,7 +399,7 @@ static void fb_flashcursor(struct work_s + c = scr_readw((u16 *) vc->vc_pos); + mode = (!ops->cursor_flash || ops->cursor_state.enable) ? 
+ CM_ERASE : CM_DRAW; +- ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), ++ ops->cursor(vc, info, mode, softback_lines, get_color(vc, info, c, 1), + get_color(vc, info, c, 0)); + console_unlock(); + +@@ -419,7 +439,13 @@ static int __init fb_console_setup(char + } + + if (!strncmp(options, "scrollback:", 11)) { +- pr_warn("Ignoring scrollback size option\n"); ++ options += 11; ++ if (*options) { ++ fbcon_softback_size = simple_strtoul(options, &options, 0); ++ if (*options == 'k' || *options == 'K') { ++ fbcon_softback_size *= 1024; ++ } ++ } + continue; + } + +@@ -959,6 +985,31 @@ static const char *fbcon_startup(void) + + set_blitting_type(vc, info); + ++ if (info->fix.type != FB_TYPE_TEXT) { ++ if (fbcon_softback_size) { ++ if (!softback_buf) { ++ softback_buf = ++ (unsigned long) ++ kvmalloc(fbcon_softback_size, ++ GFP_KERNEL); ++ if (!softback_buf) { ++ fbcon_softback_size = 0; ++ softback_top = 0; ++ } ++ } ++ } else { ++ if (softback_buf) { ++ kvfree((void *) softback_buf); ++ softback_buf = 0; ++ softback_top = 0; ++ } ++ } ++ if (softback_buf) ++ softback_in = softback_top = softback_curr = ++ softback_buf; ++ softback_lines = 0; ++ } ++ + /* Setup default font */ + if (!p->fontdata && !vc->vc_font.data) { + if (!fontname[0] || !(font = find_font(fontname))) +@@ -1129,6 +1180,9 @@ static void fbcon_init(struct vc_data *v + if (logo) + fbcon_prepare_logo(vc, info, cols, rows, new_cols, new_rows); + ++ if (vc == svc && softback_buf) ++ fbcon_update_softback(vc); ++ + if (ops->rotate_font && ops->rotate_font(info, vc)) { + ops->rotate = FB_ROTATE_UR; + set_blitting_type(vc, info); +@@ -1152,6 +1206,9 @@ static void fbcon_release_all(void) + struct fb_info *info; + int i, j, mapped; + ++ kvfree((void *)softback_buf); ++ softback_buf = 0UL; ++ + fbcon_for_each_registered_fb(i) { + mapped = 0; + info = fbcon_registered_fb[i]; +@@ -1312,6 +1369,7 @@ static void fbcon_cursor(struct vc_data + { + struct fb_info *info = fbcon_info_from_console(vc->vc_num); + struct fbcon_ops *ops = info->fbcon_par; ++ int y; + int c = scr_readw((u16 *) vc->vc_pos); + + ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); +@@ -1325,11 +1383,19 @@ static void fbcon_cursor(struct vc_data + fbcon_add_cursor_work(info); + + ops->cursor_flash = (mode == CM_ERASE) ? 
0 : 1; ++ if (mode & CM_SOFTBACK) { ++ mode &= ~CM_SOFTBACK; ++ y = softback_lines; ++ } else { ++ if (softback_lines) ++ fbcon_set_origin(vc); ++ y = 0; ++ } + + if (!ops->cursor) + return; + +- ops->cursor(vc, info, mode, 0, get_color(vc, info, c, 1), ++ ops->cursor(vc, info, mode, y, get_color(vc, info, c, 1), + get_color(vc, info, c, 0)); + } + +@@ -1399,6 +1465,8 @@ static void fbcon_set_disp(struct fb_inf + + if (con_is_visible(vc)) { + update_screen(vc); ++ if (softback_buf) ++ fbcon_update_softback(vc); + } + } + +@@ -1536,6 +1604,99 @@ static __inline__ void ypan_down_redraw( + scrollback_current = 0; + } + ++static void fbcon_redraw_softback(struct vc_data *vc, struct fbcon_display *p, ++ long delta) ++{ ++ int count = vc->vc_rows; ++ unsigned short *d, *s; ++ unsigned long n; ++ int line = 0; ++ ++ d = (u16 *) softback_curr; ++ if (d == (u16 *) softback_in) ++ d = (u16 *) vc->vc_origin; ++ n = softback_curr + delta * vc->vc_size_row; ++ softback_lines -= delta; ++ if (delta < 0) { ++ if (softback_curr < softback_top && n < softback_buf) { ++ n += softback_end - softback_buf; ++ if (n < softback_top) { ++ softback_lines -= ++ (softback_top - n) / vc->vc_size_row; ++ n = softback_top; ++ } ++ } else if (softback_curr >= softback_top ++ && n < softback_top) { ++ softback_lines -= ++ (softback_top - n) / vc->vc_size_row; ++ n = softback_top; ++ } ++ } else { ++ if (softback_curr > softback_in && n >= softback_end) { ++ n += softback_buf - softback_end; ++ if (n > softback_in) { ++ n = softback_in; ++ softback_lines = 0; ++ } ++ } else if (softback_curr <= softback_in && n > softback_in) { ++ n = softback_in; ++ softback_lines = 0; ++ } ++ } ++ if (n == softback_curr) ++ return; ++ softback_curr = n; ++ s = (u16 *) softback_curr; ++ if (s == (u16 *) softback_in) ++ s = (u16 *) vc->vc_origin; ++ while (count--) { ++ unsigned short *start; ++ unsigned short *le; ++ unsigned short c; ++ int x = 0; ++ unsigned short attr = 1; ++ ++ start = s; ++ le = advance_row(s, 1); ++ do { ++ c = scr_readw(s); ++ if (attr != (c & 0xff00)) { ++ attr = c & 0xff00; ++ if (s > start) { ++ fbcon_putcs(vc, start, s - start, ++ line, x); ++ x += s - start; ++ start = s; ++ } ++ } ++ if (c == scr_readw(d)) { ++ if (s > start) { ++ fbcon_putcs(vc, start, s - start, ++ line, x); ++ x += s - start + 1; ++ start = s + 1; ++ } else { ++ x++; ++ start++; ++ } ++ } ++ s++; ++ d++; ++ } while (s < le); ++ if (s > start) ++ fbcon_putcs(vc, start, s - start, line, x); ++ line++; ++ if (d == (u16 *) softback_end) ++ d = (u16 *) softback_buf; ++ if (d == (u16 *) softback_in) ++ d = (u16 *) vc->vc_origin; ++ if (s == (u16 *) softback_end) ++ s = (u16 *) softback_buf; ++ if (s == (u16 *) softback_in) ++ s = (u16 *) vc->vc_origin; ++ } ++} ++ + static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int dy) + { +@@ -1740,6 +1901,31 @@ static void fbcon_bmove(struct vc_data * + p->vrows - p->yscroll); + } + ++static inline void fbcon_softback_note(struct vc_data *vc, int t, ++ int count) ++{ ++ unsigned short *p; ++ ++ if (vc->vc_num != fg_console) ++ return; ++ p = (unsigned short *) (vc->vc_origin + t * vc->vc_size_row); ++ ++ while (count) { ++ scr_memcpyw((u16 *) softback_in, p, vc->vc_size_row); ++ count--; ++ p = advance_row(p, 1); ++ softback_in += vc->vc_size_row; ++ if (softback_in == softback_end) ++ softback_in = softback_buf; ++ if (softback_in == softback_top) { ++ softback_top += vc->vc_size_row; ++ if (softback_top == softback_end) ++ softback_top = softback_buf; ++ } 
++ } ++ softback_curr = softback_in; ++} ++ + static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, + enum con_scroll dir, unsigned int count) + { +@@ -1762,6 +1948,8 @@ static bool fbcon_scroll(struct vc_data + case SM_UP: + if (count > vc->vc_rows) /* Maximum realistic size */ + count = vc->vc_rows; ++ if (softback_top) ++ fbcon_softback_note(vc, t, count); + switch (fb_scrollmode(p)) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, t, b - t - count, +@@ -2076,6 +2264,14 @@ static int fbcon_switch(struct vc_data * + info = fbcon_info_from_console(vc->vc_num); + ops = info->fbcon_par; + ++ if (softback_top) { ++ if (softback_lines) ++ fbcon_set_origin(vc); ++ softback_top = softback_curr = softback_in = softback_buf; ++ softback_lines = 0; ++ fbcon_update_softback(vc); ++ } ++ + if (logo_shown >= 0) { + struct vc_data *conp2 = vc_cons[logo_shown].d; + +@@ -2406,6 +2602,9 @@ static int fbcon_do_set_font(struct vc_d + int resize; + char *old_data = NULL; + ++ if (con_is_visible(vc) && softback_lines) ++ fbcon_set_origin(vc); ++ + resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); + if (p->userfont) + old_data = vc->vc_font.data; +@@ -2436,6 +2635,8 @@ static int fbcon_do_set_font(struct vc_d + ret = vc_resize(vc, cols, rows); + if (ret) + goto err_out; ++ if (con_is_visible(vc) && softback_buf) ++ fbcon_update_softback(vc); + } else if (con_is_visible(vc) + && vc->vc_mode == KD_TEXT) { + fbcon_clear_margins(vc, 0); +@@ -2582,7 +2783,19 @@ static void fbcon_set_palette(struct vc_ + + static u16 *fbcon_screen_pos(const struct vc_data *vc, int offset) + { +- return (u16 *) (vc->vc_origin + offset); ++ unsigned long p; ++ int line; ++ ++ if (vc->vc_num != fg_console || !softback_lines) ++ return (u16 *) (vc->vc_origin + offset); ++ line = offset / vc->vc_size_row; ++ if (line >= softback_lines) ++ return (u16 *) (vc->vc_origin + offset - ++ softback_lines * vc->vc_size_row); ++ p = softback_curr + offset; ++ if (p >= softback_end) ++ p += softback_buf - softback_end; ++ return (u16 *) p; + } + + static unsigned long fbcon_getxy(struct vc_data *vc, unsigned long pos, +@@ -2596,7 +2809,22 @@ static unsigned long fbcon_getxy(struct + + x = offset % vc->vc_cols; + y = offset / vc->vc_cols; ++ if (vc->vc_num == fg_console) ++ y += softback_lines; ++ ret = pos + (vc->vc_cols - x) * 2; ++ } else if (vc->vc_num == fg_console && softback_lines) { ++ unsigned long offset = pos - softback_curr; ++ ++ if (pos < softback_curr) ++ offset += softback_end - softback_buf; ++ offset /= 2; ++ x = offset % vc->vc_cols; ++ y = offset / vc->vc_cols; + ret = pos + (vc->vc_cols - x) * 2; ++ if (ret == softback_end) ++ ret = softback_buf; ++ if (ret == softback_in) ++ ret = vc->vc_origin; + } else { + /* Should not happen */ + x = y = 0; +@@ -2624,11 +2852,106 @@ static void fbcon_invert_region(struct v + a = ((a) & 0x88ff) | (((a) & 0x7000) >> 4) | + (((a) & 0x0700) << 4); + scr_writew(a, p++); ++ if (p == (u16 *) softback_end) ++ p = (u16 *) softback_buf; ++ if (p == (u16 *) softback_in) ++ p = (u16 *) vc->vc_origin; ++ } ++} ++ ++static void fbcon_scrolldelta(struct vc_data *vc, int lines) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[fg_console]]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fbcon_display *disp = &fb_display[fg_console]; ++ int offset, limit, scrollback_old; ++ ++ if (softback_top) { ++ if (vc->vc_num != fg_console) ++ return; ++ if (vc->vc_mode != KD_TEXT || !lines) ++ return; ++ if (logo_shown >= 0) { ++ struct vc_data *conp2 = 
vc_cons[logo_shown].d; ++ ++ if (conp2->vc_top == logo_lines ++ && conp2->vc_bottom == conp2->vc_rows) ++ conp2->vc_top = 0; ++ if (logo_shown == vc->vc_num) { ++ unsigned long p, q; ++ int i; ++ ++ p = softback_in; ++ q = vc->vc_origin + ++ logo_lines * vc->vc_size_row; ++ for (i = 0; i < logo_lines; i++) { ++ if (p == softback_top) ++ break; ++ if (p == softback_buf) ++ p = softback_end; ++ p -= vc->vc_size_row; ++ q -= vc->vc_size_row; ++ scr_memcpyw((u16 *) q, (u16 *) p, ++ vc->vc_size_row); ++ } ++ softback_in = softback_curr = p; ++ update_region(vc, vc->vc_origin, ++ logo_lines * vc->vc_cols); ++ } ++ logo_shown = FBCON_LOGO_CANSHOW; ++ } ++ fbcon_cursor(vc, CM_ERASE | CM_SOFTBACK); ++ fbcon_redraw_softback(vc, disp, lines); ++ fbcon_cursor(vc, CM_DRAW | CM_SOFTBACK); ++ return; + } ++ ++ if (!scrollback_phys_max) ++ return; ++ ++ scrollback_old = scrollback_current; ++ scrollback_current -= lines; ++ if (scrollback_current < 0) ++ scrollback_current = 0; ++ else if (scrollback_current > scrollback_max) ++ scrollback_current = scrollback_max; ++ if (scrollback_current == scrollback_old) ++ return; ++ ++ if (fbcon_is_inactive(vc, info)) ++ return; ++ ++ fbcon_cursor(vc, CM_ERASE); ++ ++ offset = disp->yscroll - scrollback_current; ++ limit = disp->vrows; ++ switch (disp->scrollmode) { ++ case SCROLL_WRAP_MOVE: ++ info->var.vmode |= FB_VMODE_YWRAP; ++ break; ++ case SCROLL_PAN_MOVE: ++ case SCROLL_PAN_REDRAW: ++ limit -= vc->vc_rows; ++ info->var.vmode &= ~FB_VMODE_YWRAP; ++ break; ++ } ++ if (offset < 0) ++ offset += limit; ++ else if (offset >= limit) ++ offset -= limit; ++ ++ ops->var.xoffset = 0; ++ ops->var.yoffset = offset * vc->vc_font.height; ++ ops->update_start(info); ++ ++ if (!scrollback_current) ++ fbcon_cursor(vc, CM_DRAW); + } + + static int fbcon_set_origin(struct vc_data *vc) + { ++ if (softback_lines) ++ fbcon_scrolldelta(vc, softback_lines); + return 0; + } + +@@ -2692,6 +3015,8 @@ static void fbcon_modechanged(struct fb_ + + fbcon_set_palette(vc, color_table); + update_screen(vc); ++ if (softback_buf) ++ fbcon_update_softback(vc); + } + } + +@@ -3154,6 +3479,7 @@ static const struct consw fb_con = { + .con_font_get = fbcon_get_font, + .con_font_default = fbcon_set_def_font, + .con_set_palette = fbcon_set_palette, ++ .con_scrolldelta = fbcon_scrolldelta, + .con_set_origin = fbcon_set_origin, + .con_invert_region = fbcon_invert_region, + .con_screen_pos = fbcon_screen_pos, diff --git a/sys-kernel/pinephone-sources/files/5.19.10-11.patch b/sys-kernel/pinephone-sources/files/5.19.10-11.patch new file mode 100644 index 0000000..a5ff5cb --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5.19.10-11.patch @@ -0,0 +1,1231 @@ +diff --git a/Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml b/Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml +index 85c85b694217c..e18107eafe7cc 100644 +--- a/Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml ++++ b/Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml +@@ -96,7 +96,7 @@ properties: + Documentation/devicetree/bindings/arm/cpus.yaml). 
+ + required: +- - fiq-index ++ - apple,fiq-index + - cpus + + required: +diff --git a/Makefile b/Makefile +index 33a9b6b547c47..01463a22926d5 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 5 + PATCHLEVEL = 19 +-SUBLEVEL = 10 ++SUBLEVEL = 11 + EXTRAVERSION = + NAME = Superb Owl + +diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig +index cd2b3fe156724..c68c3581483ac 100644 +--- a/arch/parisc/Kconfig ++++ b/arch/parisc/Kconfig +@@ -225,8 +225,18 @@ config MLONGCALLS + Enabling this option will probably slow down your kernel. + + config 64BIT +- def_bool "$(ARCH)" = "parisc64" ++ def_bool y if "$(ARCH)" = "parisc64" ++ bool "64-bit kernel" if "$(ARCH)" = "parisc" + depends on PA8X00 ++ help ++ Enable this if you want to support 64bit kernel on PA-RISC platform. ++ ++ At the moment, only people willing to use more than 2GB of RAM, ++ or having a 64bit-only capable PA-RISC machine should say Y here. ++ ++ Since there is no 64bit userland on PA-RISC, there is no point to ++ enable this option otherwise. The 64bit kernel is significantly bigger ++ and slower than the 32bit one. + + choice + prompt "Kernel page size" +diff --git a/block/blk-core.c b/block/blk-core.c +index 27fb1357ad4b8..cc6fbcb6d2521 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -338,7 +338,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) + + while (!blk_try_enter_queue(q, pm)) { + if (flags & BLK_MQ_REQ_NOWAIT) +- return -EBUSY; ++ return -EAGAIN; + + /* + * read pair of barrier in blk_freeze_queue_start(), we need to +@@ -368,7 +368,7 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio) + if (test_bit(GD_DEAD, &disk->state)) + goto dead; + bio_wouldblock_error(bio); +- return -EBUSY; ++ return -EAGAIN; + } + + /* +diff --git a/block/blk-lib.c b/block/blk-lib.c +index 09b7e1200c0f4..20e42144065b8 100644 +--- a/block/blk-lib.c ++++ b/block/blk-lib.c +@@ -311,6 +311,11 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + struct blk_plug plug; + int ret = 0; + ++ /* make sure that "len << SECTOR_SHIFT" doesn't overflow */ ++ if (max_sectors > UINT_MAX >> SECTOR_SHIFT) ++ max_sectors = UINT_MAX >> SECTOR_SHIFT; ++ max_sectors &= ~bs_mask; ++ + if (max_sectors == 0) + return -EOPNOTSUPP; + if ((sector | nr_sects) & bs_mask) +@@ -324,10 +329,10 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + + bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); + bio->bi_iter.bi_sector = sector; +- bio->bi_iter.bi_size = len; ++ bio->bi_iter.bi_size = len << SECTOR_SHIFT; + +- sector += len << SECTOR_SHIFT; +- nr_sects -= len << SECTOR_SHIFT; ++ sector += len; ++ nr_sects -= len; + if (!nr_sects) { + ret = submit_bio_wait(bio); + bio_put(bio); +diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c +index a964e25ea6206..763256efddc2b 100644 +--- a/drivers/gpio/gpio-mpc8xxx.c ++++ b/drivers/gpio/gpio-mpc8xxx.c +@@ -172,6 +172,7 @@ static int mpc8xxx_irq_set_type(struct irq_data *d, unsigned int flow_type) + + switch (flow_type) { + case IRQ_TYPE_EDGE_FALLING: ++ case IRQ_TYPE_LEVEL_LOW: + raw_spin_lock_irqsave(&mpc8xxx_gc->lock, flags); + gc->write_reg(mpc8xxx_gc->regs + GPIO_ICR, + gc->read_reg(mpc8xxx_gc->regs + GPIO_ICR) +diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c +index e342a6dc4c6c1..bb953f6478647 100644 +--- a/drivers/gpio/gpio-rockchip.c ++++ b/drivers/gpio/gpio-rockchip.c +@@ -418,11 +418,11 @@ static int 
rockchip_irq_set_type(struct irq_data *d, unsigned int type) + goto out; + } else { + bank->toggle_edge_mode |= mask; +- level |= mask; ++ level &= ~mask; + + /* + * Determine gpio state. If 1 next interrupt should be +- * falling otherwise rising. ++ * low otherwise high. + */ + data = readl(bank->reg_base + bank->gpio_regs->ext_port); + if (data & mask) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 67d4a3c13ed19..929f8b75bfaee 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -2391,8 +2391,16 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) + } + adev->ip_blocks[i].status.sw = true; + +- /* need to do gmc hw init early so we can allocate gpu mem */ +- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { ++ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { ++ /* need to do common hw init early so everything is set up for gmc */ ++ r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); ++ if (r) { ++ DRM_ERROR("hw_init %d failed %d\n", i, r); ++ goto init_failed; ++ } ++ adev->ip_blocks[i].status.hw = true; ++ } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { ++ /* need to do gmc hw init early so we can allocate gpu mem */ + /* Try to reserve bad pages early */ + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_exchange_data(adev); +@@ -3078,8 +3086,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) + int i, r; + + static enum amd_ip_block_type ip_order[] = { +- AMD_IP_BLOCK_TYPE_GMC, + AMD_IP_BLOCK_TYPE_COMMON, ++ AMD_IP_BLOCK_TYPE_GMC, + AMD_IP_BLOCK_TYPE_PSP, + AMD_IP_BLOCK_TYPE_IH, + }; +diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c +index f49db13b3fbee..0debdbcf46310 100644 +--- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c ++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c +@@ -380,6 +380,7 @@ static void nbio_v2_3_enable_aspm(struct amdgpu_device *adev, + WREG32_PCIE(smnPCIE_LC_CNTL, data); + } + ++#ifdef CONFIG_PCIEASPM + static void nbio_v2_3_program_ltr(struct amdgpu_device *adev) + { + uint32_t def, data; +@@ -401,9 +402,11 @@ static void nbio_v2_3_program_ltr(struct amdgpu_device *adev) + if (def != data) + WREG32_PCIE(smnBIF_CFG_DEV0_EPF0_DEVICE_CNTL2, data); + } ++#endif + + static void nbio_v2_3_program_aspm(struct amdgpu_device *adev) + { ++#ifdef CONFIG_PCIEASPM + uint32_t def, data; + + def = data = RREG32_PCIE(smnPCIE_LC_CNTL); +@@ -459,7 +462,10 @@ static void nbio_v2_3_program_aspm(struct amdgpu_device *adev) + if (def != data) + WREG32_PCIE(smnPCIE_LC_CNTL6, data); + +- nbio_v2_3_program_ltr(adev); ++ /* Don't bother about LTR if LTR is not enabled ++ * in the path */ ++ if (adev->pdev->ltr_path) ++ nbio_v2_3_program_ltr(adev); + + def = data = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP3); + data |= 0x5DE0 << RCC_BIF_STRAP3__STRAP_VLINK_ASPM_IDLE_TIMER__SHIFT; +@@ -483,6 +489,7 @@ static void nbio_v2_3_program_aspm(struct amdgpu_device *adev) + data &= ~PCIE_LC_CNTL3__LC_DSC_DONT_ENTER_L23_AFTER_PME_ACK_MASK; + if (def != data) + WREG32_PCIE(smnPCIE_LC_CNTL3, data); ++#endif + } + + static void nbio_v2_3_apply_lc_spc_mode_wa(struct amdgpu_device *adev) +diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c b/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c +index f7f6ddebd3e49..37615a77287bc 100644 +--- a/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c ++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c +@@ -282,6 +282,7 @@ static void 
nbio_v6_1_init_registers(struct amdgpu_device *adev) + mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) << 2; + } + ++#ifdef CONFIG_PCIEASPM + static void nbio_v6_1_program_ltr(struct amdgpu_device *adev) + { + uint32_t def, data; +@@ -303,9 +304,11 @@ static void nbio_v6_1_program_ltr(struct amdgpu_device *adev) + if (def != data) + WREG32_PCIE(smnBIF_CFG_DEV0_EPF0_DEVICE_CNTL2, data); + } ++#endif + + static void nbio_v6_1_program_aspm(struct amdgpu_device *adev) + { ++#ifdef CONFIG_PCIEASPM + uint32_t def, data; + + def = data = RREG32_PCIE(smnPCIE_LC_CNTL); +@@ -361,7 +364,10 @@ static void nbio_v6_1_program_aspm(struct amdgpu_device *adev) + if (def != data) + WREG32_PCIE(smnPCIE_LC_CNTL6, data); + +- nbio_v6_1_program_ltr(adev); ++ /* Don't bother about LTR if LTR is not enabled ++ * in the path */ ++ if (adev->pdev->ltr_path) ++ nbio_v6_1_program_ltr(adev); + + def = data = RREG32_PCIE(smnRCC_BIF_STRAP3); + data |= 0x5DE0 << RCC_BIF_STRAP3__STRAP_VLINK_ASPM_IDLE_TIMER__SHIFT; +@@ -385,6 +391,7 @@ static void nbio_v6_1_program_aspm(struct amdgpu_device *adev) + data &= ~PCIE_LC_CNTL3__LC_DSC_DONT_ENTER_L23_AFTER_PME_ACK_MASK; + if (def != data) + WREG32_PCIE(smnPCIE_LC_CNTL3, data); ++#endif + } + + const struct amdgpu_nbio_funcs nbio_v6_1_funcs = { +diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +index 11848d1e238b6..19455a7259391 100644 +--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c ++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +@@ -673,6 +673,7 @@ struct amdgpu_nbio_ras nbio_v7_4_ras = { + }; + + ++#ifdef CONFIG_PCIEASPM + static void nbio_v7_4_program_ltr(struct amdgpu_device *adev) + { + uint32_t def, data; +@@ -694,9 +695,11 @@ static void nbio_v7_4_program_ltr(struct amdgpu_device *adev) + if (def != data) + WREG32_PCIE(smnBIF_CFG_DEV0_EPF0_DEVICE_CNTL2, data); + } ++#endif + + static void nbio_v7_4_program_aspm(struct amdgpu_device *adev) + { ++#ifdef CONFIG_PCIEASPM + uint32_t def, data; + + if (adev->ip_versions[NBIO_HWIP][0] == IP_VERSION(7, 4, 4)) +@@ -755,7 +758,10 @@ static void nbio_v7_4_program_aspm(struct amdgpu_device *adev) + if (def != data) + WREG32_PCIE(smnPCIE_LC_CNTL6, data); + +- nbio_v7_4_program_ltr(adev); ++ /* Don't bother about LTR if LTR is not enabled ++ * in the path */ ++ if (adev->pdev->ltr_path) ++ nbio_v7_4_program_ltr(adev); + + def = data = RREG32_PCIE(smnRCC_BIF_STRAP3); + data |= 0x5DE0 << RCC_BIF_STRAP3__STRAP_VLINK_ASPM_IDLE_TIMER__SHIFT; +@@ -779,6 +785,7 @@ static void nbio_v7_4_program_aspm(struct amdgpu_device *adev) + data &= ~PCIE_LC_CNTL3__LC_DSC_DONT_ENTER_L23_AFTER_PME_ACK_MASK; + if (def != data) + WREG32_PCIE(smnPCIE_LC_CNTL3, data); ++#endif + } + + const struct amdgpu_nbio_funcs nbio_v7_4_funcs = { +diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +index 65181efba50ec..56424f75dd2cc 100644 +--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +@@ -1504,6 +1504,11 @@ static int sdma_v4_0_start(struct amdgpu_device *adev) + WREG32_SDMA(i, mmSDMA0_CNTL, temp); + + if (!amdgpu_sriov_vf(adev)) { ++ ring = &adev->sdma.instance[i].ring; ++ adev->nbio.funcs->sdma_doorbell_range(adev, i, ++ ring->use_doorbell, ring->doorbell_index, ++ adev->doorbell_index.sdma_doorbell_range); ++ + /* unhalt engine */ + temp = RREG32_SDMA(i, mmSDMA0_F32_CNTL); + temp = REG_SET_FIELD(temp, SDMA0_F32_CNTL, HALT, 0); +diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c +index 
fde6154f20096..183024d7c184e 100644 +--- a/drivers/gpu/drm/amd/amdgpu/soc15.c ++++ b/drivers/gpu/drm/amd/amdgpu/soc15.c +@@ -1211,25 +1211,6 @@ static int soc15_common_sw_fini(void *handle) + return 0; + } + +-static void soc15_doorbell_range_init(struct amdgpu_device *adev) +-{ +- int i; +- struct amdgpu_ring *ring; +- +- /* sdma/ih doorbell range are programed by hypervisor */ +- if (!amdgpu_sriov_vf(adev)) { +- for (i = 0; i < adev->sdma.num_instances; i++) { +- ring = &adev->sdma.instance[i].ring; +- adev->nbio.funcs->sdma_doorbell_range(adev, i, +- ring->use_doorbell, ring->doorbell_index, +- adev->doorbell_index.sdma_doorbell_range); +- } +- +- adev->nbio.funcs->ih_doorbell_range(adev, adev->irq.ih.use_doorbell, +- adev->irq.ih.doorbell_index); +- } +-} +- + static int soc15_common_hw_init(void *handle) + { + struct amdgpu_device *adev = (struct amdgpu_device *)handle; +@@ -1249,12 +1230,6 @@ static int soc15_common_hw_init(void *handle) + + /* enable the doorbell aperture */ + soc15_enable_doorbell_aperture(adev, true); +- /* HW doorbell routing policy: doorbell writing not +- * in SDMA/IH/MM/ACV range will be routed to CP. So +- * we need to init SDMA/IH/MM/ACV doorbell range prior +- * to CP ip block init and ring test. +- */ +- soc15_doorbell_range_init(adev); + + return 0; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c +index 03b7066471f9a..1e83db0c5438d 100644 +--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c +@@ -289,6 +289,10 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev) + } + } + ++ if (!amdgpu_sriov_vf(adev)) ++ adev->nbio.funcs->ih_doorbell_range(adev, adev->irq.ih.use_doorbell, ++ adev->irq.ih.doorbell_index); ++ + pci_set_master(adev->pdev); + + /* enable interrupts */ +diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c +index 2022ffbb8dba5..59dfca093155c 100644 +--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c +@@ -340,6 +340,10 @@ static int vega20_ih_irq_init(struct amdgpu_device *adev) + } + } + ++ if (!amdgpu_sriov_vf(adev)) ++ adev->nbio.funcs->ih_doorbell_range(adev, adev->irq.ih.use_doorbell, ++ adev->irq.ih.doorbell_index); ++ + pci_set_master(adev->pdev); + + /* enable interrupts */ +diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c +index 19bf717fd4cb6..5508ebb9eb434 100644 +--- a/drivers/gpu/drm/i915/display/icl_dsi.c ++++ b/drivers/gpu/drm/i915/display/icl_dsi.c +@@ -1629,6 +1629,8 @@ static int gen11_dsi_dsc_compute_config(struct intel_encoder *encoder, + /* FIXME: initialize from VBT */ + vdsc_cfg->rc_model_size = DSC_RC_MODEL_SIZE_CONST; + ++ vdsc_cfg->pic_height = crtc_state->hw.adjusted_mode.crtc_vdisplay; ++ + ret = intel_dsc_compute_params(crtc_state); + if (ret) + return ret; +diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c +index 41aaa6c98114f..fe8b6b72970a2 100644 +--- a/drivers/gpu/drm/i915/display/intel_dp.c ++++ b/drivers/gpu/drm/i915/display/intel_dp.c +@@ -1379,6 +1379,7 @@ static int intel_dp_dsc_compute_params(struct intel_encoder *encoder, + * DP_DSC_RC_BUF_SIZE for this. + */ + vdsc_cfg->rc_model_size = DSC_RC_MODEL_SIZE_CONST; ++ vdsc_cfg->pic_height = crtc_state->hw.adjusted_mode.crtc_vdisplay; + + /* + * Slice Height of 8 works for all currently available panels. 
So start +diff --git a/drivers/gpu/drm/i915/display/intel_vdsc.c b/drivers/gpu/drm/i915/display/intel_vdsc.c +index 43e1bbc1e3035..ca530f0733e0e 100644 +--- a/drivers/gpu/drm/i915/display/intel_vdsc.c ++++ b/drivers/gpu/drm/i915/display/intel_vdsc.c +@@ -460,7 +460,6 @@ int intel_dsc_compute_params(struct intel_crtc_state *pipe_config) + u8 i = 0; + + vdsc_cfg->pic_width = pipe_config->hw.adjusted_mode.crtc_hdisplay; +- vdsc_cfg->pic_height = pipe_config->hw.adjusted_mode.crtc_vdisplay; + vdsc_cfg->slice_width = DIV_ROUND_UP(vdsc_cfg->pic_width, + pipe_config->dsc.slice_count); + +diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h +index 9feda105f9131..a7acffbf15d1f 100644 +--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h ++++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h +@@ -235,6 +235,14 @@ struct intel_guc { + * @shift: Right shift value for the gpm timestamp + */ + u32 shift; ++ ++ /** ++ * @last_stat_jiffies: jiffies at last actual stats collection time ++ * We use this timestamp to ensure we don't oversample the ++ * stats because runtime power management events can trigger ++ * stats collection at much higher rates than required. ++ */ ++ unsigned long last_stat_jiffies; + } timestamp; + + #ifdef CONFIG_DRM_I915_SELFTEST +diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +index 26a051ef119df..d7e4681d7297c 100644 +--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c ++++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +@@ -1365,6 +1365,8 @@ static void __update_guc_busyness_stats(struct intel_guc *guc) + unsigned long flags; + ktime_t unused; + ++ guc->timestamp.last_stat_jiffies = jiffies; ++ + spin_lock_irqsave(&guc->timestamp.lock, flags); + + guc_update_pm_timestamp(guc, &unused); +@@ -1436,7 +1438,23 @@ void intel_guc_busyness_park(struct intel_gt *gt) + if (!guc_submission_initialized(guc)) + return; + +- cancel_delayed_work(&guc->timestamp.work); ++ /* ++ * There is a race with suspend flow where the worker runs after suspend ++ * and causes an unclaimed register access warning. Cancel the worker ++ * synchronously here. ++ */ ++ cancel_delayed_work_sync(&guc->timestamp.work); ++ ++ /* ++ * Before parking, we should sample engine busyness stats if we need to. ++ * We can skip it if we are less than half a ping from the last time we ++ * sampled the busyness stats. 
++ */ ++ if (guc->timestamp.last_stat_jiffies && ++ !time_after(jiffies, guc->timestamp.last_stat_jiffies + ++ (guc->timestamp.ping_delay / 2))) ++ return; ++ + __update_guc_busyness_stats(guc); + } + +diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h +index 4f5a51bb9e1e4..e77956ae88a4b 100644 +--- a/drivers/gpu/drm/i915/i915_reg.h ++++ b/drivers/gpu/drm/i915/i915_reg.h +@@ -1849,14 +1849,14 @@ + + #define GT0_PERF_LIMIT_REASONS _MMIO(0x1381a8) + #define GT0_PERF_LIMIT_REASONS_MASK 0xde3 +-#define PROCHOT_MASK REG_BIT(1) +-#define THERMAL_LIMIT_MASK REG_BIT(2) +-#define RATL_MASK REG_BIT(6) +-#define VR_THERMALERT_MASK REG_BIT(7) +-#define VR_TDC_MASK REG_BIT(8) +-#define POWER_LIMIT_4_MASK REG_BIT(9) +-#define POWER_LIMIT_1_MASK REG_BIT(11) +-#define POWER_LIMIT_2_MASK REG_BIT(12) ++#define PROCHOT_MASK REG_BIT(0) ++#define THERMAL_LIMIT_MASK REG_BIT(1) ++#define RATL_MASK REG_BIT(5) ++#define VR_THERMALERT_MASK REG_BIT(6) ++#define VR_TDC_MASK REG_BIT(7) ++#define POWER_LIMIT_4_MASK REG_BIT(8) ++#define POWER_LIMIT_1_MASK REG_BIT(10) ++#define POWER_LIMIT_2_MASK REG_BIT(11) + + #define CHV_CLK_CTL1 _MMIO(0x101100) + #define VLV_CLK_CTL2 _MMIO(0x101104) +diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c +index 16460b169ed21..2a32729a74b51 100644 +--- a/drivers/gpu/drm/i915/i915_vma.c ++++ b/drivers/gpu/drm/i915/i915_vma.c +@@ -1870,12 +1870,13 @@ int _i915_vma_move_to_active(struct i915_vma *vma, + enum dma_resv_usage usage; + int idx; + +- obj->read_domains = 0; + if (flags & EXEC_OBJECT_WRITE) { + usage = DMA_RESV_USAGE_WRITE; + obj->write_domain = I915_GEM_DOMAIN_RENDER; ++ obj->read_domains = 0; + } else { + usage = DMA_RESV_USAGE_READ; ++ obj->write_domain = 0; + } + + dma_fence_array_for_each(curr, idx, fence) +diff --git a/drivers/gpu/drm/meson/meson_plane.c b/drivers/gpu/drm/meson/meson_plane.c +index 8640a8a8a4691..44aa526294439 100644 +--- a/drivers/gpu/drm/meson/meson_plane.c ++++ b/drivers/gpu/drm/meson/meson_plane.c +@@ -168,7 +168,7 @@ static void meson_plane_atomic_update(struct drm_plane *plane, + + /* Enable OSD and BLK0, set max global alpha */ + priv->viu.osd1_ctrl_stat = OSD_ENABLE | +- (0xFF << OSD_GLOBAL_ALPHA_SHIFT) | ++ (0x100 << OSD_GLOBAL_ALPHA_SHIFT) | + OSD_BLK0_ENABLE; + + priv->viu.osd1_ctrl_stat2 = readl(priv->io_base + +diff --git a/drivers/gpu/drm/meson/meson_viu.c b/drivers/gpu/drm/meson/meson_viu.c +index bb7e109534de1..d4b907889a21d 100644 +--- a/drivers/gpu/drm/meson/meson_viu.c ++++ b/drivers/gpu/drm/meson/meson_viu.c +@@ -94,7 +94,7 @@ static void meson_viu_set_g12a_osd1_matrix(struct meson_drm *priv, + priv->io_base + _REG(VPP_WRAP_OSD1_MATRIX_COEF11_12)); + writel(((m[9] & 0x1fff) << 16) | (m[10] & 0x1fff), + priv->io_base + _REG(VPP_WRAP_OSD1_MATRIX_COEF20_21)); +- writel((m[11] & 0x1fff) << 16, ++ writel((m[11] & 0x1fff), + priv->io_base + _REG(VPP_WRAP_OSD1_MATRIX_COEF22)); + + writel(((m[18] & 0xfff) << 16) | (m[19] & 0xfff), +diff --git a/drivers/gpu/drm/panel/panel-edp.c b/drivers/gpu/drm/panel/panel-edp.c +index a189982601a48..e8040defe6073 100644 +--- a/drivers/gpu/drm/panel/panel-edp.c ++++ b/drivers/gpu/drm/panel/panel-edp.c +@@ -1270,7 +1270,8 @@ static const struct panel_desc innolux_n116bca_ea1 = { + }, + .delay = { + .hpd_absent = 200, +- .prepare_to_enable = 80, ++ .enable = 80, ++ .disable = 50, + .unprepare = 500, + }, + }; +diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c +index d6e831576cd2b..88271f04615b0 100644 +--- 
a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c ++++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c +@@ -1436,11 +1436,15 @@ static void rk3568_set_intf_mux(struct vop2_video_port *vp, int id, + die &= ~RK3568_SYS_DSP_INFACE_EN_HDMI_MUX; + die |= RK3568_SYS_DSP_INFACE_EN_HDMI | + FIELD_PREP(RK3568_SYS_DSP_INFACE_EN_HDMI_MUX, vp->id); ++ dip &= ~RK3568_DSP_IF_POL__HDMI_PIN_POL; ++ dip |= FIELD_PREP(RK3568_DSP_IF_POL__HDMI_PIN_POL, polflags); + break; + case ROCKCHIP_VOP2_EP_EDP0: + die &= ~RK3568_SYS_DSP_INFACE_EN_EDP_MUX; + die |= RK3568_SYS_DSP_INFACE_EN_EDP | + FIELD_PREP(RK3568_SYS_DSP_INFACE_EN_EDP_MUX, vp->id); ++ dip &= ~RK3568_DSP_IF_POL__EDP_PIN_POL; ++ dip |= FIELD_PREP(RK3568_DSP_IF_POL__EDP_PIN_POL, polflags); + break; + case ROCKCHIP_VOP2_EP_MIPI0: + die &= ~RK3568_SYS_DSP_INFACE_EN_MIPI0_MUX; +diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c +index fc8c1420c0b69..64b14ac4c7b02 100644 +--- a/drivers/iommu/intel/dmar.c ++++ b/drivers/iommu/intel/dmar.c +@@ -2368,13 +2368,6 @@ static int dmar_device_hotplug(acpi_handle handle, bool insert) + if (!dmar_in_use()) + return 0; + +- /* +- * It's unlikely that any I/O board is hot added before the IOMMU +- * subsystem is initialized. +- */ +- if (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled) +- return -EOPNOTSUPP; +- + if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) { + tmp = handle; + } else { +diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c +index c0464959cbcdb..861a239d905a4 100644 +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -3133,7 +3133,13 @@ static int __init init_dmars(void) + + #ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { ++ /* ++ * Call dmar_alloc_hwirq() with dmar_global_lock held, ++ * could cause possible lock race condition. ++ */ ++ up_write(&dmar_global_lock); + ret = intel_svm_enable_prq(iommu); ++ down_write(&dmar_global_lock); + if (ret) + goto free_iommu; + } +@@ -4039,6 +4045,7 @@ int __init intel_iommu_init(void) + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); + ++ down_write(&dmar_global_lock); + if (dmar_table_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR table\n"); +@@ -4051,6 +4058,16 @@ int __init intel_iommu_init(void) + goto out_free_dmar; + } + ++ up_write(&dmar_global_lock); ++ ++ /* ++ * The bus notifier takes the dmar_global_lock, so lockdep will ++ * complain later when we register it under the lock. 
++ */ ++ dmar_register_bus_notifier(); ++ ++ down_write(&dmar_global_lock); ++ + if (!no_iommu) + intel_iommu_debugfs_init(); + +@@ -4098,9 +4115,11 @@ int __init intel_iommu_init(void) + pr_err("Initialization failed\n"); + goto out_free_dmar; + } ++ up_write(&dmar_global_lock); + + init_iommu_pm_ops(); + ++ down_read(&dmar_global_lock); + for_each_active_iommu(iommu, drhd) { + /* + * The flush queue implementation does not perform +@@ -4118,11 +4137,13 @@ int __init intel_iommu_init(void) + "%s", iommu->name); + iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + } ++ up_read(&dmar_global_lock); + + bus_set_iommu(&pci_bus_type, &intel_iommu_ops); + if (si_domain && !hw_pass_through) + register_memory_notifier(&intel_iommu_memory_nb); + ++ down_read(&dmar_global_lock); + if (probe_acpi_namespace_devices()) + pr_warn("ACPI name space devices didn't probe correctly\n"); + +@@ -4133,15 +4154,17 @@ int __init intel_iommu_init(void) + + iommu_disable_protect_mem_regions(iommu); + } ++ up_read(&dmar_global_lock); + +- intel_iommu_enabled = 1; +- dmar_register_bus_notifier(); + pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + ++ intel_iommu_enabled = 1; ++ + return 0; + + out_free_dmar: + intel_iommu_free_dmars(); ++ up_write(&dmar_global_lock); + return ret; + } + +diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c +index 520ed965bb7a4..583ca847a39cb 100644 +--- a/drivers/of/fdt.c ++++ b/drivers/of/fdt.c +@@ -314,7 +314,7 @@ static int unflatten_dt_nodes(const void *blob, + for (offset = 0; + offset >= 0 && depth >= initial_depth; + offset = fdt_next_node(blob, offset, &depth)) { +- if (WARN_ON_ONCE(depth >= FDT_MAX_DEPTH)) ++ if (WARN_ON_ONCE(depth >= FDT_MAX_DEPTH - 1)) + continue; + + if (!IS_ENABLED(CONFIG_OF_KOBJ) && +diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c +index f69ab90b5e22d..6052f264bbb0a 100644 +--- a/drivers/parisc/ccio-dma.c ++++ b/drivers/parisc/ccio-dma.c +@@ -1546,6 +1546,7 @@ static int __init ccio_probe(struct parisc_device *dev) + } + ccio_ioc_init(ioc); + if (ccio_init_resources(ioc)) { ++ iounmap(ioc->ioc_regs); + kfree(ioc); + return -ENOMEM; + } +diff --git a/drivers/pinctrl/qcom/pinctrl-sc8180x.c b/drivers/pinctrl/qcom/pinctrl-sc8180x.c +index 6bec7f1431348..704a99d2f93ce 100644 +--- a/drivers/pinctrl/qcom/pinctrl-sc8180x.c ++++ b/drivers/pinctrl/qcom/pinctrl-sc8180x.c +@@ -530,10 +530,10 @@ DECLARE_MSM_GPIO_PINS(187); + DECLARE_MSM_GPIO_PINS(188); + DECLARE_MSM_GPIO_PINS(189); + +-static const unsigned int sdc2_clk_pins[] = { 190 }; +-static const unsigned int sdc2_cmd_pins[] = { 191 }; +-static const unsigned int sdc2_data_pins[] = { 192 }; +-static const unsigned int ufs_reset_pins[] = { 193 }; ++static const unsigned int ufs_reset_pins[] = { 190 }; ++static const unsigned int sdc2_clk_pins[] = { 191 }; ++static const unsigned int sdc2_cmd_pins[] = { 192 }; ++static const unsigned int sdc2_data_pins[] = { 193 }; + + enum sc8180x_functions { + msm_mux_adsp_ext, +@@ -1582,7 +1582,7 @@ static const int sc8180x_acpi_reserved_gpios[] = { + static const struct msm_gpio_wakeirq_map sc8180x_pdc_map[] = { + { 3, 31 }, { 5, 32 }, { 8, 33 }, { 9, 34 }, { 10, 100 }, { 12, 104 }, + { 24, 37 }, { 26, 38 }, { 27, 41 }, { 28, 42 }, { 30, 39 }, { 36, 43 }, +- { 37, 43 }, { 38, 45 }, { 39, 118 }, { 39, 125 }, { 41, 47 }, ++ { 37, 44 }, { 38, 45 }, { 39, 118 }, { 39, 125 }, { 41, 47 }, + { 42, 48 }, { 46, 50 }, { 47, 49 }, { 48, 51 }, { 49, 53 }, { 50, 52 }, + { 51, 116 }, { 51, 123 }, { 53, 54 }, { 54, 55 }, { 55, 56 }, + { 56, 57 
}, { 58, 58 }, { 60, 60 }, { 68, 62 }, { 70, 63 }, { 76, 86 }, +diff --git a/drivers/pinctrl/sunxi/pinctrl-sun50i-a100-r.c b/drivers/pinctrl/sunxi/pinctrl-sun50i-a100-r.c +index 21054fcacd345..18088f6f44b23 100644 +--- a/drivers/pinctrl/sunxi/pinctrl-sun50i-a100-r.c ++++ b/drivers/pinctrl/sunxi/pinctrl-sun50i-a100-r.c +@@ -98,7 +98,7 @@ MODULE_DEVICE_TABLE(of, a100_r_pinctrl_match); + static struct platform_driver a100_r_pinctrl_driver = { + .probe = a100_r_pinctrl_probe, + .driver = { +- .name = "sun50iw10p1-r-pinctrl", ++ .name = "sun50i-a100-r-pinctrl", + .of_match_table = a100_r_pinctrl_match, + }, + }; +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index 386bb523c69ea..bdc3efdb12219 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -707,9 +707,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) + int length = 0; + int total_read; + +- smb_msg->msg_control = NULL; +- smb_msg->msg_controllen = 0; +- + for (total_read = 0; msg_data_left(smb_msg); total_read += length) { + try_to_freeze(); + +@@ -765,7 +762,7 @@ int + cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + unsigned int to_read) + { +- struct msghdr smb_msg; ++ struct msghdr smb_msg = {}; + struct kvec iov = {.iov_base = buf, .iov_len = to_read}; + iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); + +@@ -775,15 +772,13 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, + ssize_t + cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) + { +- struct msghdr smb_msg; ++ struct msghdr smb_msg = {}; + + /* + * iov_iter_discard already sets smb_msg.type and count and iov_offset + * and cifs_readv_from_socket sets msg_control and msg_controllen + * so little to initialize in struct msghdr + */ +- smb_msg.msg_name = NULL; +- smb_msg.msg_namelen = 0; + iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + + return cifs_readv_from_socket(server, &smb_msg); +@@ -793,7 +788,7 @@ int + cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, + unsigned int page_offset, unsigned int to_read) + { +- struct msghdr smb_msg; ++ struct msghdr smb_msg = {}; + struct bio_vec bv = { + .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; + iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); +diff --git a/fs/cifs/file.c b/fs/cifs/file.c +index 0f03c0bfdf280..02dd591acabb3 100644 +--- a/fs/cifs/file.c ++++ b/fs/cifs/file.c +@@ -3327,6 +3327,9 @@ static ssize_t __cifs_writev( + + ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from) + { ++ struct file *file = iocb->ki_filp; ++ ++ cifs_revalidate_mapping(file->f_inode); + return __cifs_writev(iocb, from, true); + } + +diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c +index bfc9bd55870a0..8adc0f2a59518 100644 +--- a/fs/cifs/transport.c ++++ b/fs/cifs/transport.c +@@ -196,10 +196,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, + + *sent = 0; + +- smb_msg->msg_name = (struct sockaddr *) &server->dstaddr; +- smb_msg->msg_namelen = sizeof(struct sockaddr); +- smb_msg->msg_control = NULL; +- smb_msg->msg_controllen = 0; + if (server->noblocksnd) + smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; + else +@@ -311,7 +307,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + sigset_t mask, oldmask; + size_t total_len = 0, sent, size; + struct socket *ssocket = server->ssocket; +- struct msghdr smb_msg; ++ struct msghdr smb_msg = {}; + __be32 rfc1002_marker; + + if (cifs_rdma_enabled(server)) { +diff --git 
a/fs/nfs/internal.h b/fs/nfs/internal.h +index 8f8cd6e2d4dbc..597e3ce3f148a 100644 +--- a/fs/nfs/internal.h ++++ b/fs/nfs/internal.h +@@ -604,6 +604,31 @@ static inline gfp_t nfs_io_gfp_mask(void) + return GFP_KERNEL; + } + ++/* ++ * Special version of should_remove_suid() that ignores capabilities. ++ */ ++static inline int nfs_should_remove_suid(const struct inode *inode) ++{ ++ umode_t mode = inode->i_mode; ++ int kill = 0; ++ ++ /* suid always must be killed */ ++ if (unlikely(mode & S_ISUID)) ++ kill = ATTR_KILL_SUID; ++ ++ /* ++ * sgid without any exec bits is just a mandatory locking mark; leave ++ * it alone. If some exec bits are set, it's a real sgid; kill it. ++ */ ++ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) ++ kill |= ATTR_KILL_SGID; ++ ++ if (unlikely(kill && S_ISREG(mode))) ++ return kill; ++ ++ return 0; ++} ++ + /* unlink.c */ + extern struct rpc_task * + nfs_async_rename(struct inode *old_dir, struct inode *new_dir, +diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c +index 068c45b3bc1ab..6dab9e4083729 100644 +--- a/fs/nfs/nfs42proc.c ++++ b/fs/nfs/nfs42proc.c +@@ -78,10 +78,15 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); +- if (status == 0) ++ if (status == 0) { ++ if (nfs_should_remove_suid(inode)) { ++ spin_lock(&inode->i_lock); ++ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); ++ spin_unlock(&inode->i_lock); ++ } + status = nfs_post_op_update_inode_force_wcc(inode, + res.falloc_fattr); +- ++ } + if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE]) + trace_nfs4_fallocate(inode, &args, status); + else +diff --git a/fs/nfs/super.c b/fs/nfs/super.c +index 6ab5eeb000dc0..5e4bacb77bfc7 100644 +--- a/fs/nfs/super.c ++++ b/fs/nfs/super.c +@@ -1051,22 +1051,31 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) + if (ctx->bsize) + sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits); + +- if (server->nfs_client->rpc_ops->version != 2) { +- /* The VFS shouldn't apply the umask to mode bits. We will do +- * so ourselves when necessary. ++ switch (server->nfs_client->rpc_ops->version) { ++ case 2: ++ sb->s_time_gran = 1000; ++ sb->s_time_min = 0; ++ sb->s_time_max = U32_MAX; ++ break; ++ case 3: ++ /* ++ * The VFS shouldn't apply the umask to mode bits. ++ * We will do so ourselves when necessary. + */ + sb->s_flags |= SB_POSIXACL; + sb->s_time_gran = 1; +- sb->s_export_op = &nfs_export_ops; +- } else +- sb->s_time_gran = 1000; +- +- if (server->nfs_client->rpc_ops->version != 4) { + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; +- } else { ++ sb->s_export_op = &nfs_export_ops; ++ break; ++ case 4: ++ sb->s_flags |= SB_POSIXACL; ++ sb->s_time_gran = 1; + sb->s_time_min = S64_MIN; + sb->s_time_max = S64_MAX; ++ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) ++ sb->s_export_op = &nfs_export_ops; ++ break; + } + + sb->s_magic = NFS_SUPER_MAGIC; +diff --git a/fs/nfs/write.c b/fs/nfs/write.c +index 5d7e1c2061842..4212473c69ee9 100644 +--- a/fs/nfs/write.c ++++ b/fs/nfs/write.c +@@ -1497,31 +1497,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) + NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); + } + +-/* +- * Special version of should_remove_suid() that ignores capabilities. 
+- */ +-static int nfs_should_remove_suid(const struct inode *inode) +-{ +- umode_t mode = inode->i_mode; +- int kill = 0; +- +- /* suid always must be killed */ +- if (unlikely(mode & S_ISUID)) +- kill = ATTR_KILL_SUID; +- +- /* +- * sgid without any exec bits is just a mandatory locking mark; leave +- * it alone. If some exec bits are set, it's a real sgid; kill it. +- */ +- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) +- kill |= ATTR_KILL_SGID; +- +- if (unlikely(kill && S_ISREG(mode))) +- return kill; +- +- return 0; +-} +- + static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) + { +diff --git a/include/linux/dmar.h b/include/linux/dmar.h +index f3a3d95df5325..cbd714a198a0a 100644 +--- a/include/linux/dmar.h ++++ b/include/linux/dmar.h +@@ -69,7 +69,6 @@ struct dmar_pci_notify_info { + + extern struct rw_semaphore dmar_global_lock; + extern struct list_head dmar_drhd_units; +-extern int intel_iommu_enabled; + + #define for_each_drhd_unit(drhd) \ + list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ +@@ -93,8 +92,7 @@ extern int intel_iommu_enabled; + static inline bool dmar_rcu_check(void) + { + return rwsem_is_locked(&dmar_global_lock) || +- system_state == SYSTEM_BOOTING || +- (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); ++ system_state == SYSTEM_BOOTING; + } + + #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) +diff --git a/include/linux/of_device.h b/include/linux/of_device.h +index 1d7992a02e36e..1a803e4335d30 100644 +--- a/include/linux/of_device.h ++++ b/include/linux/of_device.h +@@ -101,8 +101,9 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) + } + + static inline int of_dma_configure_id(struct device *dev, +- struct device_node *np, +- bool force_dma) ++ struct device_node *np, ++ bool force_dma, ++ const u32 *id) + { + return 0; + } +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index c39d910d4b454..9ca397eed1638 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -1195,6 +1195,8 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); + + static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) + { ++ if (!sk_fullsock(osk)) ++ return 0; + sk->sk_policy[0] = NULL; + sk->sk_policy[1] = NULL; + if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 48833d0edd089..602da2cfd57c8 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -5061,7 +5061,8 @@ done: + req_set_fail(req); + __io_req_complete(req, issue_flags, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ +- io_put_file(req->file); ++ if (!(req->flags & REQ_F_FIXED_FILE)) ++ io_put_file(req->file); + req->file = NULL; + return 0; + } +diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c +index afc6c0e9c966e..f93983910b5e1 100644 +--- a/kernel/cgroup/cgroup-v1.c ++++ b/kernel/cgroup/cgroup-v1.c +@@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) + int retval = 0; + + mutex_lock(&cgroup_mutex); ++ cpus_read_lock(); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; +@@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); ++ cpus_read_unlock(); + mutex_unlock(&cgroup_mutex); + + return retval; +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index 
da8b3cc67234d..028eb28c7882d 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -1704,7 +1704,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest, + arg->uid); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); +- rt = ip_route_output_key(net, &fl4); ++ rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) + return; + +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 586c102ce152d..9fd92e263d0a3 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -819,6 +819,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) + ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_priority : sk->sk_priority; + transmit_time = tcp_transmit_time(sk); ++ xfrm_sk_clone_policy(ctl_sk, sk); + } + ip_send_unicast_reply(ctl_sk, + skb, &TCP_SKB_CB(skb)->header.h4.opt, +@@ -827,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) + transmit_time); + + ctl_sk->sk_mark = 0; ++ xfrm_sk_free_policy(ctl_sk); + sock_net_set(ctl_sk, &init_net); + __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index be09941fe6d9a..5eabe746cfa76 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -952,7 +952,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 + * Underlying function will use this to retrieve the network + * namespace + */ +- dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); ++ if (sk && sk->sk_state != TCP_TIME_WAIT) ++ dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ ++ else ++ dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); + if (!IS_ERR(dst)) { + skb_dst_set(buff, dst); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, +diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c +index c1a01947530f0..db8c0de1de422 100644 +--- a/net/sunrpc/clnt.c ++++ b/net/sunrpc/clnt.c +@@ -2858,6 +2858,9 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, + + task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, + &rpc_cb_add_xprt_call_ops, data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ + data->xps->xps_nunique_destaddr_xprts++; + rpc_put_task(task); + success: +diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c +index 53b024cea3b3e..5ecafffe7ce59 100644 +--- a/net/sunrpc/xprt.c ++++ b/net/sunrpc/xprt.c +@@ -1179,11 +1179,8 @@ xprt_request_dequeue_receive_locked(struct rpc_task *task) + { + struct rpc_rqst *req = task->tk_rqstp; + +- if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { ++ if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + xprt_request_rb_remove(req->rq_xprt, req); +- xdr_free_bvec(&req->rq_rcv_buf); +- req->rq_private_buf.bvec = NULL; +- } + } + + /** +@@ -1221,6 +1218,8 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) + + xprt->stat.recvs++; + ++ xdr_free_bvec(&req->rq_rcv_buf); ++ req->rq_private_buf.bvec = NULL; + req->rq_private_buf.len = copied; + /* Ensure all writes are done before we update */ + /* req->rq_reply_bytes_recvd */ +@@ -1453,6 +1452,7 @@ xprt_request_dequeue_xprt(struct rpc_task *task) + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + spin_unlock(&xprt->queue_lock); ++ xdr_free_bvec(&req->rq_rcv_buf); + } + } + +diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c +index 
61df4d33c48ff..7f340f18599c9 100644 +--- a/sound/pci/hda/patch_sigmatel.c ++++ b/sound/pci/hda/patch_sigmatel.c +@@ -209,6 +209,7 @@ struct sigmatel_spec { + + /* beep widgets */ + hda_nid_t anabeep_nid; ++ bool beep_power_on; + + /* SPDIF-out mux */ + const char * const *spdif_labels; +@@ -4443,6 +4444,28 @@ static int stac_suspend(struct hda_codec *codec) + + return 0; + } ++ ++static int stac_check_power_status(struct hda_codec *codec, hda_nid_t nid) ++{ ++#ifdef CONFIG_SND_HDA_INPUT_BEEP ++ struct sigmatel_spec *spec = codec->spec; ++#endif ++ int ret = snd_hda_gen_check_power_status(codec, nid); ++ ++#ifdef CONFIG_SND_HDA_INPUT_BEEP ++ if (nid == spec->gen.beep_nid && codec->beep) { ++ if (codec->beep->enabled != spec->beep_power_on) { ++ spec->beep_power_on = codec->beep->enabled; ++ if (spec->beep_power_on) ++ snd_hda_power_up_pm(codec); ++ else ++ snd_hda_power_down_pm(codec); ++ } ++ ret |= spec->beep_power_on; ++ } ++#endif ++ return ret; ++} + #else + #define stac_suspend NULL + #endif /* CONFIG_PM */ +@@ -4455,6 +4478,7 @@ static const struct hda_codec_ops stac_patch_ops = { + .unsol_event = snd_hda_jack_unsol_event, + #ifdef CONFIG_PM + .suspend = stac_suspend, ++ .check_power_status = stac_check_power_status, + #endif + }; + +diff --git a/tools/include/uapi/asm/errno.h b/tools/include/uapi/asm/errno.h +index d30439b4b8ab4..869379f91fe48 100644 +--- a/tools/include/uapi/asm/errno.h ++++ b/tools/include/uapi/asm/errno.h +@@ -9,8 +9,8 @@ + #include "../../../arch/alpha/include/uapi/asm/errno.h" + #elif defined(__mips__) + #include "../../../arch/mips/include/uapi/asm/errno.h" +-#elif defined(__xtensa__) +-#include "../../../arch/xtensa/include/uapi/asm/errno.h" ++#elif defined(__hppa__) ++#include "../../../arch/parisc/include/uapi/asm/errno.h" + #else + #include + #endif diff --git a/sys-kernel/pinephone-sources/files/5.19.11-12.patch b/sys-kernel/pinephone-sources/files/5.19.11-12.patch new file mode 100644 index 0000000..8c6e32f --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5.19.11-12.patch @@ -0,0 +1,9776 @@ +diff --git a/Makefile b/Makefile +index 01463a22926d5..7df4c195c8ab2 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 5 + PATCHLEVEL = 19 +-SUBLEVEL = 11 ++SUBLEVEL = 12 + EXTRAVERSION = + NAME = Superb Owl + +diff --git a/arch/arm/boot/dts/lan966x.dtsi b/arch/arm/boot/dts/lan966x.dtsi +index 38e90a31d2dd1..25c19f9d0a12f 100644 +--- a/arch/arm/boot/dts/lan966x.dtsi ++++ b/arch/arm/boot/dts/lan966x.dtsi +@@ -515,13 +515,13 @@ + + phy0: ethernet-phy@1 { + reg = <1>; +- interrupts = ; ++ interrupts = ; + status = "disabled"; + }; + + phy1: ethernet-phy@2 { + reg = <2>; +- interrupts = ; ++ interrupts = ; + status = "disabled"; + }; + }; +diff --git a/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts b/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts +index 92eaf4ef45638..57ecdfa0dfc09 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts ++++ b/arch/arm64/boot/dts/freescale/imx8mm-mx8menlo.dts +@@ -152,11 +152,11 @@ + * CPLD_reset is RESET_SOFT in schematic + */ + gpio-line-names = +- "CPLD_D[1]", "CPLD_int", "CPLD_reset", "", +- "", "CPLD_D[0]", "", "", +- "", "", "", "CPLD_D[2]", +- "CPLD_D[3]", "CPLD_D[4]", "CPLD_D[5]", "CPLD_D[6]", +- "CPLD_D[7]", "", "", "", ++ "CPLD_D[6]", "CPLD_int", "CPLD_reset", "", ++ "", "CPLD_D[7]", "", "", ++ "", "", "", "CPLD_D[5]", ++ "CPLD_D[4]", "CPLD_D[3]", "CPLD_D[2]", "CPLD_D[1]", ++ "CPLD_D[0]", "", "", "", + "", "", "", "", + "", "", "", "KBD_intK", + "", "", 
"", ""; +diff --git a/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml-mba8mx.dts b/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml-mba8mx.dts +index 286d2df01cfa7..7e0aeb2db3054 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml-mba8mx.dts ++++ b/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml-mba8mx.dts +@@ -5,7 +5,6 @@ + + /dts-v1/; + +-#include + #include "imx8mm-tqma8mqml.dtsi" + #include "mba8mx.dtsi" + +diff --git a/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml.dtsi +index 16ee9b5179e6e..f649dfacb4b69 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8mm-tqma8mqml.dtsi +@@ -3,6 +3,7 @@ + * Copyright 2020-2021 TQ-Systems GmbH + */ + ++#include + #include "imx8mm.dtsi" + + / { +diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +index c2d4da25482ff..44b473494d0f5 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +@@ -359,8 +359,8 @@ + nxp,dvs-standby-voltage = <850000>; + regulator-always-on; + regulator-boot-on; +- regulator-max-microvolt = <950000>; +- regulator-min-microvolt = <850000>; ++ regulator-max-microvolt = <1050000>; ++ regulator-min-microvolt = <805000>; + regulator-name = "On-module +VDD_ARM (BUCK2)"; + regulator-ramp-delay = <3125>; + }; +@@ -368,8 +368,8 @@ + reg_vdd_dram: BUCK3 { + regulator-always-on; + regulator-boot-on; +- regulator-max-microvolt = <950000>; +- regulator-min-microvolt = <850000>; ++ regulator-max-microvolt = <1000000>; ++ regulator-min-microvolt = <805000>; + regulator-name = "On-module +VDD_GPU_VPU_DDR (BUCK3)"; + }; + +@@ -408,7 +408,7 @@ + reg_vdd_snvs: LDO2 { + regulator-always-on; + regulator-boot-on; +- regulator-max-microvolt = <900000>; ++ regulator-max-microvolt = <800000>; + regulator-min-microvolt = <800000>; + regulator-name = "On-module +V0.8_SNVS (LDO2)"; + }; +diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi +index e41e1d56f980d..7bd4eecd592ef 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi +@@ -672,7 +672,6 @@ + <&clk IMX8MN_CLK_GPU_SHADER>, + <&clk IMX8MN_CLK_GPU_BUS_ROOT>, + <&clk IMX8MN_CLK_GPU_AHB>; +- resets = <&src IMX8MQ_RESET_GPU_RESET>; + }; + + pgc_dispmix: power-domain@3 { +diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +index 6630ec561dc25..211e6a1b296e1 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts ++++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +@@ -123,8 +123,7 @@ + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_reg_can>; + regulator-name = "can2_stby"; +- gpio = <&gpio3 19 GPIO_ACTIVE_HIGH>; +- enable-active-high; ++ gpio = <&gpio3 19 GPIO_ACTIVE_LOW>; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; +@@ -484,35 +483,40 @@ + lan1: port@0 { + reg = <0>; + label = "lan1"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan2: port@1 { + reg = <1>; + label = "lan2"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan3: port@2 { + reg = <2>; + label = "lan3"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan4: port@3 { + reg = <3>; + label = "lan4"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan5: 
port@4 { + reg = <4>; + label = "lan5"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + +- port@6 { +- reg = <6>; ++ port@5 { ++ reg = <5>; + label = "cpu"; + ethernet = <&fec>; + phy-mode = "rgmii-id"; +diff --git a/arch/arm64/boot/dts/freescale/imx8ulp.dtsi b/arch/arm64/boot/dts/freescale/imx8ulp.dtsi +index 09f7364dd1d05..1cd389b1b95d6 100644 +--- a/arch/arm64/boot/dts/freescale/imx8ulp.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8ulp.dtsi +@@ -172,6 +172,7 @@ + compatible = "fsl,imx8ulp-pcc3"; + reg = <0x292d0000 0x10000>; + #clock-cells = <1>; ++ #reset-cells = <1>; + }; + + tpm5: tpm@29340000 { +@@ -270,6 +271,7 @@ + compatible = "fsl,imx8ulp-pcc4"; + reg = <0x29800000 0x10000>; + #clock-cells = <1>; ++ #reset-cells = <1>; + }; + + lpi2c6: i2c@29840000 { +@@ -414,6 +416,7 @@ + compatible = "fsl,imx8ulp-pcc5"; + reg = <0x2da70000 0x10000>; + #clock-cells = <1>; ++ #reset-cells = <1>; + }; + }; + +diff --git a/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core.dtsi b/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core.dtsi +index 7249871530ab9..5eecbefa8a336 100644 +--- a/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core.dtsi ++++ b/arch/arm64/boot/dts/rockchip/px30-engicam-px30-core.dtsi +@@ -2,8 +2,8 @@ + /* + * Copyright (c) 2020 Fuzhou Rockchip Electronics Co., Ltd + * Copyright (c) 2020 Engicam srl +- * Copyright (c) 2020 Amarula Solutons +- * Copyright (c) 2020 Amarula Solutons(India) ++ * Copyright (c) 2020 Amarula Solutions ++ * Copyright (c) 2020 Amarula Solutions(India) + */ + + #include +diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts b/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts +index 31ebb4e5fd330..0f9cc042d9bf0 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts +@@ -88,3 +88,8 @@ + }; + }; + }; ++ ++&wlan_host_wake_l { ++ /* Kevin has an external pull up, but Bob does not. */ ++ rockchip,pins = <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_up>; ++}; +diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +index 50d459ee4831c..af5810e5f5b79 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi ++++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +@@ -244,6 +244,14 @@ + &edp { + status = "okay"; + ++ /* ++ * eDP PHY/clk don't sync reliably at anything other than 24 MHz. Only ++ * set this here, because rk3399-gru.dtsi ensures we can generate this ++ * off GPLL=600MHz, whereas some other RK3399 boards may not. 
++ */ ++ assigned-clocks = <&cru PCLK_EDP>; ++ assigned-clock-rates = <24000000>; ++ + ports { + edp_out: port@1 { + reg = <1>; +@@ -578,6 +586,7 @@ ap_i2c_tp: &i2c5 { + }; + + wlan_host_wake_l: wlan-host-wake-l { ++ /* Kevin has an external pull up, but Bob does not */ + rockchip,pins = <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_none>; + }; + }; +diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +index b1ac3a89f259c..aa3e21bd6c8f4 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi ++++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +@@ -62,7 +62,6 @@ + vcc5v0_host: vcc5v0-host-regulator { + compatible = "regulator-fixed"; + gpio = <&gpio4 RK_PA3 GPIO_ACTIVE_LOW>; +- enable-active-low; + pinctrl-names = "default"; + pinctrl-0 = <&vcc5v0_host_en>; + regulator-name = "vcc5v0_host"; +diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts +index fa953b7366421..fdbfdf3634e43 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts +@@ -163,7 +163,6 @@ + + vcc3v3_sd: vcc3v3_sd { + compatible = "regulator-fixed"; +- enable-active-low; + gpio = <&gpio0 RK_PA5 GPIO_ACTIVE_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&vcc_sd_h>; +diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts +index 02d5f5a8ca036..528bb4e8ac776 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts +@@ -506,7 +506,7 @@ + disable-wp; + pinctrl-names = "default"; + pinctrl-0 = <&sdmmc0_bus4 &sdmmc0_clk &sdmmc0_cmd &sdmmc0_det>; +- sd-uhs-sdr104; ++ sd-uhs-sdr50; + vmmc-supply = <&vcc3v3_sd>; + vqmmc-supply = <&vccio_sd>; + status = "okay"; +diff --git a/arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dts b/arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dts +index 622be8be9813d..282f5c74d5cda 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3568-evb1-v10.dts +@@ -618,7 +618,7 @@ + }; + + &usb2phy0_otg { +- vbus-supply = <&vcc5v0_usb_otg>; ++ phy-supply = <&vcc5v0_usb_otg>; + status = "okay"; + }; + +diff --git a/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts b/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts +index 0813c0c5abded..26912f02684ce 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3568-rock-3a.dts +@@ -543,7 +543,7 @@ + }; + + &usb2phy0_otg { +- vbus-supply = <&vcc5v0_usb_otg>; ++ phy-supply = <&vcc5v0_usb_otg>; + status = "okay"; + }; + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index 707b5451929d4..d4abb948eb14e 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -251,7 +251,7 @@ static void amu_fie_setup(const struct cpumask *cpus) + for_each_cpu(cpu, cpus) { + if (!freq_counters_valid(cpu) || + freq_inv_set_max_ratio(cpu, +- cpufreq_get_hw_max_freq(cpu) * 1000, ++ cpufreq_get_hw_max_freq(cpu) * 1000ULL, + arch_timer_get_rate())) + return; + } +diff --git a/arch/mips/lantiq/clk.c b/arch/mips/lantiq/clk.c +index 7a623684d9b5e..2d5a0bcb0cec1 100644 +--- a/arch/mips/lantiq/clk.c ++++ b/arch/mips/lantiq/clk.c +@@ -50,6 +50,7 @@ struct clk *clk_get_io(void) + { + return &cpu_clk_generic[2]; + } ++EXPORT_SYMBOL_GPL(clk_get_io); + + struct clk *clk_get_ppe(void) + { +diff --git a/arch/mips/loongson32/common/platform.c 
b/arch/mips/loongson32/common/platform.c +index 794c96c2a4cdd..311dc1580bbde 100644 +--- a/arch/mips/loongson32/common/platform.c ++++ b/arch/mips/loongson32/common/platform.c +@@ -98,7 +98,7 @@ int ls1x_eth_mux_init(struct platform_device *pdev, void *priv) + if (plat_dat->bus_id) { + __raw_writel(__raw_readl(LS1X_MUX_CTRL0) | GMAC1_USE_UART1 | + GMAC1_USE_UART0, LS1X_MUX_CTRL0); +- switch (plat_dat->interface) { ++ switch (plat_dat->phy_interface) { + case PHY_INTERFACE_MODE_RGMII: + val &= ~(GMAC1_USE_TXCLK | GMAC1_USE_PWM23); + break; +@@ -107,12 +107,12 @@ int ls1x_eth_mux_init(struct platform_device *pdev, void *priv) + break; + default: + pr_err("unsupported mii mode %d\n", +- plat_dat->interface); ++ plat_dat->phy_interface); + return -ENOTSUPP; + } + val &= ~GMAC1_SHUT; + } else { +- switch (plat_dat->interface) { ++ switch (plat_dat->phy_interface) { + case PHY_INTERFACE_MODE_RGMII: + val &= ~(GMAC0_USE_TXCLK | GMAC0_USE_PWM01); + break; +@@ -121,7 +121,7 @@ int ls1x_eth_mux_init(struct platform_device *pdev, void *priv) + break; + default: + pr_err("unsupported mii mode %d\n", +- plat_dat->interface); ++ plat_dat->phy_interface); + return -ENOTSUPP; + } + val &= ~GMAC0_SHUT; +@@ -131,7 +131,7 @@ int ls1x_eth_mux_init(struct platform_device *pdev, void *priv) + plat_dat = dev_get_platdata(&pdev->dev); + + val &= ~PHY_INTF_SELI; +- if (plat_dat->interface == PHY_INTERFACE_MODE_RMII) ++ if (plat_dat->phy_interface == PHY_INTERFACE_MODE_RMII) + val |= 0x4 << PHY_INTF_SELI_SHIFT; + __raw_writel(val, LS1X_MUX_CTRL1); + +@@ -146,9 +146,9 @@ static struct plat_stmmacenet_data ls1x_eth0_pdata = { + .bus_id = 0, + .phy_addr = -1, + #if defined(CONFIG_LOONGSON1_LS1B) +- .interface = PHY_INTERFACE_MODE_MII, ++ .phy_interface = PHY_INTERFACE_MODE_MII, + #elif defined(CONFIG_LOONGSON1_LS1C) +- .interface = PHY_INTERFACE_MODE_RMII, ++ .phy_interface = PHY_INTERFACE_MODE_RMII, + #endif + .mdio_bus_data = &ls1x_mdio_bus_data, + .dma_cfg = &ls1x_eth_dma_cfg, +@@ -186,7 +186,7 @@ struct platform_device ls1x_eth0_pdev = { + static struct plat_stmmacenet_data ls1x_eth1_pdata = { + .bus_id = 1, + .phy_addr = -1, +- .interface = PHY_INTERFACE_MODE_MII, ++ .phy_interface = PHY_INTERFACE_MODE_MII, + .mdio_bus_data = &ls1x_mdio_bus_data, + .dma_cfg = &ls1x_eth_dma_cfg, + .has_gmac = 1, +diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig +index fcbb81feb7ad8..1f02f15569749 100644 +--- a/arch/riscv/Kconfig ++++ b/arch/riscv/Kconfig +@@ -361,6 +361,7 @@ config RISCV_ISA_C + config RISCV_ISA_SVPBMT + bool "SVPBMT extension support" + depends on 64BIT && MMU ++ depends on !XIP_KERNEL + select RISCV_ALTERNATIVE + default y + help +diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c +index 5a2de6b6f8822..5c591123c4409 100644 +--- a/arch/riscv/kernel/signal.c ++++ b/arch/riscv/kernel/signal.c +@@ -124,6 +124,8 @@ SYSCALL_DEFINE0(rt_sigreturn) + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + ++ regs->cause = -1UL; ++ + return regs->a0; + + badframe: +diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c +index e0de60e503b98..d9e023c78f568 100644 +--- a/arch/um/kernel/um_arch.c ++++ b/arch/um/kernel/um_arch.c +@@ -33,7 +33,7 @@ + #include "um_arch.h" + + #define DEFAULT_COMMAND_LINE_ROOT "root=98:0" +-#define DEFAULT_COMMAND_LINE_CONSOLE "console=tty" ++#define DEFAULT_COMMAND_LINE_CONSOLE "console=tty0" + + /* Changed in add_arg and setup_arch, which run before SMP is started */ + static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 }; +diff --git 
a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 4c0e812f2f044..19c04412f6e16 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -713,6 +713,7 @@ struct kvm_vcpu_arch { + struct fpu_guest guest_fpu; + + u64 xcr0; ++ u64 guest_supported_xcr0; + + struct kvm_pio_request pio; + void *pio_data; +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index de6d44e07e348..3ab498165639f 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -283,7 +283,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) + { + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *best; +- u64 guest_supported_xcr0; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (best && apic) { +@@ -295,10 +294,16 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) + kvm_apic_set_version(vcpu); + } + +- guest_supported_xcr0 = ++ vcpu->arch.guest_supported_xcr0 = + cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); + +- vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0; ++ /* ++ * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if ++ * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't ++ * supported by the host. ++ */ ++ vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 | ++ XFEATURE_MASK_FPSSE; + + kvm_update_pv_runtime(vcpu); + +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c +index 09fa8a94807bf..0c4a866813b31 100644 +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -4134,6 +4134,9 @@ static int em_xsetbv(struct x86_emulate_ctxt *ctxt) + { + u32 eax, ecx, edx; + ++ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)) ++ return emulate_ud(ctxt); ++ + eax = reg_read(ctxt, VCPU_REGS_RAX); + edx = reg_read(ctxt, VCPU_REGS_RDX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 5b36866528568..8c2815151864b 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -1025,15 +1025,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) + } + EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); + +-static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) +-{ +- return vcpu->arch.guest_fpu.fpstate->user_xfeatures; +-} +- + #ifdef CONFIG_X86_64 + static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) + { +- return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC; ++ return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; + } + #endif + +@@ -1056,7 +1051,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) + * saving. However, xcr0 bit 0 is always set, even if the + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). + */ +- valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP; ++ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; + if (xcr0 & ~valid_bits) + return 1; + +@@ -1084,6 +1079,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) + + int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) + { ++ /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. 
*/ + if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { + kvm_inject_gp(vcpu, 0); +diff --git a/block/blk-core.c b/block/blk-core.c +index cc6fbcb6d2521..7743c68177e89 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -284,49 +284,6 @@ void blk_queue_start_drain(struct request_queue *q) + wake_up_all(&q->mq_freeze_wq); + } + +-/** +- * blk_cleanup_queue - shutdown a request queue +- * @q: request queue to shutdown +- * +- * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and +- * put it. All future requests will be failed immediately with -ENODEV. +- * +- * Context: can sleep +- */ +-void blk_cleanup_queue(struct request_queue *q) +-{ +- /* cannot be called from atomic context */ +- might_sleep(); +- +- WARN_ON_ONCE(blk_queue_registered(q)); +- +- /* mark @q DYING, no new request or merges will be allowed afterwards */ +- blk_queue_flag_set(QUEUE_FLAG_DYING, q); +- blk_queue_start_drain(q); +- +- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); +- blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); +- +- /* +- * Drain all requests queued before DYING marking. Set DEAD flag to +- * prevent that blk_mq_run_hw_queues() accesses the hardware queues +- * after draining finished. +- */ +- blk_freeze_queue(q); +- +- blk_queue_flag_set(QUEUE_FLAG_DEAD, q); +- +- blk_sync_queue(q); +- if (queue_is_mq(q)) { +- blk_mq_cancel_work_sync(q); +- blk_mq_exit_queue(q); +- } +- +- /* @q is and will stay empty, shutdown and put */ +- blk_put_queue(q); +-} +-EXPORT_SYMBOL(blk_cleanup_queue); +- + /** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer +diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c +index 61f179e5f151a..28adb01f64419 100644 +--- a/block/blk-mq-debugfs.c ++++ b/block/blk-mq-debugfs.c +@@ -116,7 +116,6 @@ static const char *const blk_queue_flag_name[] = { + QUEUE_FLAG_NAME(NOXMERGES), + QUEUE_FLAG_NAME(ADD_RANDOM), + QUEUE_FLAG_NAME(SAME_FORCE), +- QUEUE_FLAG_NAME(DEAD), + QUEUE_FLAG_NAME(INIT_DONE), + QUEUE_FLAG_NAME(STABLE_WRITES), + QUEUE_FLAG_NAME(POLL), +@@ -151,11 +150,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf, + char opbuf[16] = { }, *op; + + /* +- * The "state" attribute is removed after blk_cleanup_queue() has called +- * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid +- * triggering a use-after-free. ++ * The "state" attribute is removed when the queue is removed. Don't ++ * allow setting the state on a dying queue to avoid a use-after-free. + */ +- if (blk_queue_dead(q)) ++ if (blk_queue_dying(q)) + return -ENOENT; + + if (count >= sizeof(opbuf)) { +diff --git a/block/blk-mq.c b/block/blk-mq.c +index 0a299941c622e..69d0a58f9e2f1 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -3896,7 +3896,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + q->queuedata = queuedata; + ret = blk_mq_init_allocated_queue(set, q); + if (ret) { +- blk_cleanup_queue(q); ++ blk_put_queue(q); + return ERR_PTR(ret); + } + return q; +@@ -3908,6 +3908,35 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) + } + EXPORT_SYMBOL(blk_mq_init_queue); + ++/** ++ * blk_mq_destroy_queue - shutdown a request queue ++ * @q: request queue to shutdown ++ * ++ * This shuts down a request queue allocated by blk_mq_init_queue() and drops ++ * the initial reference. All future requests will failed with -ENODEV. 
++ * ++ * Context: can sleep ++ */ ++void blk_mq_destroy_queue(struct request_queue *q) ++{ ++ WARN_ON_ONCE(!queue_is_mq(q)); ++ WARN_ON_ONCE(blk_queue_registered(q)); ++ ++ might_sleep(); ++ ++ blk_queue_flag_set(QUEUE_FLAG_DYING, q); ++ blk_queue_start_drain(q); ++ blk_freeze_queue(q); ++ ++ blk_sync_queue(q); ++ blk_mq_cancel_work_sync(q); ++ blk_mq_exit_queue(q); ++ ++ /* @q is and will stay empty, shutdown and put */ ++ blk_put_queue(q); ++} ++EXPORT_SYMBOL(blk_mq_destroy_queue); ++ + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) + { +@@ -3920,13 +3949,23 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + + disk = __alloc_disk_node(q, set->numa_node, lkclass); + if (!disk) { +- blk_cleanup_queue(q); ++ blk_mq_destroy_queue(q); + return ERR_PTR(-ENOMEM); + } ++ set_bit(GD_OWNS_QUEUE, &disk->state); + return disk; + } + EXPORT_SYMBOL(__blk_mq_alloc_disk); + ++struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, ++ struct lock_class_key *lkclass) ++{ ++ if (!blk_get_queue(q)) ++ return NULL; ++ return __alloc_disk_node(q, NUMA_NO_NODE, lkclass); ++} ++EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); ++ + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( + struct blk_mq_tag_set *set, struct request_queue *q, + int hctx_idx, int node) +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index 9b905e9443e49..84d7f87015673 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -748,11 +748,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) + * decremented with blk_put_queue(). Once the refcount reaches 0 this function + * is called. + * +- * For drivers that have a request_queue on a gendisk and added with +- * __device_add_disk() the refcount to request_queue will reach 0 with +- * the last put_disk() called by the driver. For drivers which don't use +- * __device_add_disk() this happens with blk_cleanup_queue(). +- * + * Drivers exist which depend on the release of the request_queue to be + * synchronous, it should not be deferred. + * +diff --git a/block/blk.h b/block/blk.h +index 434017701403f..0d6668663ab5d 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -411,6 +411,9 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); + void blk_drop_partitions(struct gendisk *disk); + ++struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, ++ struct lock_class_key *lkclass); ++ + int bio_add_hw_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + unsigned int max_sectors, bool *same_page); +diff --git a/block/bsg-lib.c b/block/bsg-lib.c +index acfe1357bf6c4..fd4cd5e682826 100644 +--- a/block/bsg-lib.c ++++ b/block/bsg-lib.c +@@ -324,7 +324,7 @@ void bsg_remove_queue(struct request_queue *q) + container_of(q->tag_set, struct bsg_set, tag_set); + + bsg_unregister_queue(bset->bd); +- blk_cleanup_queue(q); ++ blk_mq_destroy_queue(q); + blk_mq_free_tag_set(&bset->tag_set); + kfree(bset); + } +@@ -399,7 +399,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, + + return q; + out_cleanup_queue: +- blk_cleanup_queue(q); ++ blk_mq_destroy_queue(q); + out_queue: + blk_mq_free_tag_set(set); + out_tag_set: +diff --git a/block/genhd.c b/block/genhd.c +index 278227ba1d531..a39c416d658fd 100644 +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -617,13 +617,14 @@ void del_gendisk(struct gendisk *disk) + * Fail any new I/O. 
+ */ + set_bit(GD_DEAD, &disk->state); ++ if (test_bit(GD_OWNS_QUEUE, &disk->state)) ++ blk_queue_flag_set(QUEUE_FLAG_DYING, q); + set_capacity(disk, 0); + + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(q); +- blk_mq_freeze_queue_wait(q); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); +@@ -647,6 +648,8 @@ void del_gendisk(struct gendisk *disk) + pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); + device_del(disk_to_dev(disk)); + ++ blk_mq_freeze_queue_wait(q); ++ + blk_throtl_cancel_bios(disk->queue); + + blk_sync_queue(q); +@@ -663,11 +666,16 @@ void del_gendisk(struct gendisk *disk) + blk_mq_unquiesce_queue(q); + + /* +- * Allow using passthrough request again after the queue is torn down. ++ * If the disk does not own the queue, allow using passthrough requests ++ * again. Else leave the queue frozen to fail all I/O. + */ +- blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); +- __blk_mq_unfreeze_queue(q, true); +- ++ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { ++ blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); ++ __blk_mq_unfreeze_queue(q, true); ++ } else { ++ if (queue_is_mq(q)) ++ blk_mq_exit_queue(q); ++ } + } + EXPORT_SYMBOL(del_gendisk); + +@@ -1151,6 +1159,18 @@ static void disk_release(struct device *dev) + might_sleep(); + WARN_ON_ONCE(disk_live(disk)); + ++ /* ++ * To undo the all initialization from blk_mq_init_allocated_queue in ++ * case of a probe failure where add_disk is never called we have to ++ * call blk_mq_exit_queue here. We can't do this for the more common ++ * teardown case (yet) as the tagset can be gone by the time the disk ++ * is released once it was added. ++ */ ++ if (queue_is_mq(disk->queue) && ++ test_bit(GD_OWNS_QUEUE, &disk->state) && ++ !test_bit(GD_ADDED, &disk->state)) ++ blk_mq_exit_queue(disk->queue); ++ + blkcg_exit_queue(disk->queue); + + disk_release_events(disk); +@@ -1338,12 +1358,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + { + struct gendisk *disk; + +- if (!blk_get_queue(q)) +- return NULL; +- + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); + if (!disk) +- goto out_put_queue; ++ return NULL; + + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) +@@ -1387,11 +1404,8 @@ out_free_bdi: + bdi_put(disk->bdi); + out_free_disk: + kfree(disk); +-out_put_queue: +- blk_put_queue(q); + return NULL; + } +-EXPORT_SYMBOL(__alloc_disk_node); + + struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) + { +@@ -1404,9 +1418,10 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) + + disk = __alloc_disk_node(q, node, lkclass); + if (!disk) { +- blk_cleanup_queue(q); ++ blk_put_queue(q); + return NULL; + } ++ set_bit(GD_OWNS_QUEUE, &disk->state); + return disk; + } + EXPORT_SYMBOL(__blk_alloc_disk); +@@ -1418,6 +1433,9 @@ EXPORT_SYMBOL(__blk_alloc_disk); + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + * ++ * Note: for blk-mq disk put_disk must be called before freeing the tag_set ++ * when handling probe errors (that is before add_disk() is called). ++ * + * Context: Any context, but the last reference must not be dropped from + * atomic context. 
+ */ +@@ -1439,7 +1457,6 @@ EXPORT_SYMBOL(put_disk); + */ + void blk_cleanup_disk(struct gendisk *disk) + { +- blk_cleanup_queue(disk->queue); + put_disk(disk); + } + EXPORT_SYMBOL(blk_cleanup_disk); +diff --git a/certs/Kconfig b/certs/Kconfig +index bf9b511573d75..1f109b0708778 100644 +--- a/certs/Kconfig ++++ b/certs/Kconfig +@@ -43,7 +43,7 @@ config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE +- depends on X509_CERTIFICATE_PARSER ++ depends on X509_CERTIFICATE_PARSER = y + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will +diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c +index e232cc4fd444b..c6e41ee18aaa2 100644 +--- a/drivers/block/ataflop.c ++++ b/drivers/block/ataflop.c +@@ -2045,7 +2045,6 @@ static void atari_floppy_cleanup(void) + if (!unit[i].disk[type]) + continue; + del_gendisk(unit[i].disk[type]); +- blk_cleanup_queue(unit[i].disk[type]->queue); + put_disk(unit[i].disk[type]); + } + blk_mq_free_tag_set(&unit[i].tag_set); +diff --git a/drivers/block/loop.c b/drivers/block/loop.c +index a59910ef948e9..1c036ef686fbb 100644 +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -2062,7 +2062,6 @@ static void loop_remove(struct loop_device *lo) + { + /* Make this loop device unreachable from pathname. */ + del_gendisk(lo->lo_disk); +- blk_cleanup_queue(lo->lo_disk->queue); + blk_mq_free_tag_set(&lo->tag_set); + + mutex_lock(&loop_ctl_mutex); +diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c +index 6699e4b2f7f43..06994a35acc7a 100644 +--- a/drivers/block/mtip32xx/mtip32xx.c ++++ b/drivers/block/mtip32xx/mtip32xx.c +@@ -3677,7 +3677,6 @@ static int mtip_block_shutdown(struct driver_data *dd) + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); + +- blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + put_disk(dd->disk); + return 0; +@@ -4040,7 +4039,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) + dev_info(&dd->pdev->dev, "device %s surprise removal\n", + dd->disk->disk_name); + +- blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + + /* De-initialize the protocol layer. */ +diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c +index 409c76b81aed4..a4470374f54fc 100644 +--- a/drivers/block/rnbd/rnbd-clt.c ++++ b/drivers/block/rnbd/rnbd-clt.c +@@ -1755,7 +1755,7 @@ static void rnbd_destroy_sessions(void) + list_for_each_entry_safe(dev, tn, &sess->devs_list, list) { + /* + * Here unmap happens in parallel for only one reason: +- * blk_cleanup_queue() takes around half a second, so ++ * del_gendisk() takes around half a second, so + * on huge amount of devices the whole module unload + * procedure takes minutes. 
+ */ +diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c +index 63b4f6431d2e6..75057dbbcfbea 100644 +--- a/drivers/block/sx8.c ++++ b/drivers/block/sx8.c +@@ -1536,7 +1536,7 @@ err_out_free_majors: + clear_bit(0, &carm_major_alloc); + else if (host->major == 161) + clear_bit(1, &carm_major_alloc); +- blk_cleanup_queue(host->oob_q); ++ blk_mq_destroy_queue(host->oob_q); + blk_mq_free_tag_set(&host->tag_set); + err_out_dma_free: + dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); +@@ -1570,7 +1570,7 @@ static void carm_remove_one (struct pci_dev *pdev) + clear_bit(0, &carm_major_alloc); + else if (host->major == 161) + clear_bit(1, &carm_major_alloc); +- blk_cleanup_queue(host->oob_q); ++ blk_mq_destroy_queue(host->oob_q); + blk_mq_free_tag_set(&host->tag_set); + dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); + iounmap(host->mmio); +diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c +index d756423e0059a..59d6d5faf7396 100644 +--- a/drivers/block/virtio_blk.c ++++ b/drivers/block/virtio_blk.c +@@ -1107,7 +1107,6 @@ static void virtblk_remove(struct virtio_device *vdev) + flush_work(&vblk->config_work); + + del_gendisk(vblk->disk); +- blk_cleanup_queue(vblk->disk->queue); + blk_mq_free_tag_set(&vblk->tag_set); + + mutex_lock(&vblk->vdev_mutex); +diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c +index 7a6ed83481b8d..18ad43d9933ec 100644 +--- a/drivers/block/z2ram.c ++++ b/drivers/block/z2ram.c +@@ -384,7 +384,6 @@ static void __exit z2_exit(void) + + for (i = 0; i < Z2MINOR_COUNT; i++) { + del_gendisk(z2ram_gendisk[i]); +- blk_cleanup_queue(z2ram_gendisk[i]->queue); + put_disk(z2ram_gendisk[i]); + } + blk_mq_free_tag_set(&tag_set); +diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c +index 8e78b37d0f6a4..f4cc90ea6198e 100644 +--- a/drivers/cdrom/gdrom.c ++++ b/drivers/cdrom/gdrom.c +@@ -831,7 +831,6 @@ probe_fail_no_mem: + + static int remove_gdrom(struct platform_device *devptr) + { +- blk_cleanup_queue(gd.gdrom_rq); + blk_mq_free_tag_set(&gd.tag_set); + free_irq(HW_EVENT_GDROM_CMD, &gd); + free_irq(HW_EVENT_GDROM_DMA, &gd); +diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c +index cb6401c9e9a4f..acf31cc1dbcca 100644 +--- a/drivers/dax/hmem/device.c ++++ b/drivers/dax/hmem/device.c +@@ -15,6 +15,7 @@ void hmem_register_device(int target_nid, struct resource *r) + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, ++ .desc = IORES_DESC_SOFT_RESERVED, + }; + struct platform_device *pdev; + struct memregion_info info; +diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c +index d4f1e4e9603a4..85e00701473cb 100644 +--- a/drivers/dma/ti/k3-udma-private.c ++++ b/drivers/dma/ti/k3-udma-private.c +@@ -31,14 +31,14 @@ struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property) + } + + pdev = of_find_device_by_node(udma_node); ++ if (np != udma_node) ++ of_node_put(udma_node); ++ + if (!pdev) { + pr_debug("UDMA device not found\n"); + return ERR_PTR(-EPROBE_DEFER); + } + +- if (np != udma_node) +- of_node_put(udma_node); +- + ud = platform_get_drvdata(pdev); + if (!ud) { + pr_debug("UDMA has not been probed\n"); +diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c +index 673f3eb498f43..e9afa8cab7309 100644 +--- a/drivers/firmware/arm_scmi/reset.c ++++ b/drivers/firmware/arm_scmi/reset.c +@@ -166,9 +166,13 @@ static int scmi_domain_reset(const struct scmi_protocol_handle *ph, u32 domain, + struct scmi_xfer *t; + 
struct scmi_msg_reset_domain_reset *dom; + struct scmi_reset_info *pi = ph->get_priv(ph); +- struct reset_dom_info *rdom = pi->dom_info + domain; ++ struct reset_dom_info *rdom; + +- if (rdom->async_reset) ++ if (domain >= pi->num_domains) ++ return -EINVAL; ++ ++ rdom = pi->dom_info + domain; ++ if (rdom->async_reset && flags & AUTONOMOUS_RESET) + flags |= ASYNCHRONOUS_RESET; + + ret = ph->xops->xfer_get_init(ph, RESET, sizeof(*dom), 0, &t); +@@ -180,7 +184,7 @@ static int scmi_domain_reset(const struct scmi_protocol_handle *ph, u32 domain, + dom->flags = cpu_to_le32(flags); + dom->reset_state = cpu_to_le32(state); + +- if (rdom->async_reset) ++ if (flags & ASYNCHRONOUS_RESET) + ret = ph->xops->do_xfer_with_response(ph, t); + else + ret = ph->xops->do_xfer(ph, t); +diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c +index 8a18930f3eb69..516f4f0069bd2 100644 +--- a/drivers/firmware/efi/libstub/secureboot.c ++++ b/drivers/firmware/efi/libstub/secureboot.c +@@ -14,7 +14,7 @@ + + /* SHIM variables */ + static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; +-static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; ++static const efi_char16_t shim_MokSBState_name[] = L"MokSBStateRT"; + + static efi_status_t get_var(efi_char16_t *name, efi_guid_t *vendor, u32 *attr, + unsigned long *data_size, void *data) +@@ -43,8 +43,8 @@ enum efi_secureboot_mode efi_get_secureboot(void) + + /* + * See if a user has put the shim into insecure mode. If so, and if the +- * variable doesn't have the runtime attribute set, we might as well +- * honor that. ++ * variable doesn't have the non-volatile attribute set, we might as ++ * well honor that. + */ + size = sizeof(moksbstate); + status = get_efi_var(shim_MokSBState_name, &shim_guid, +@@ -53,7 +53,7 @@ enum efi_secureboot_mode efi_get_secureboot(void) + /* If it fails, we don't care why. Default to secure */ + if (status != EFI_SUCCESS) + goto secure_boot_enabled; +- if (!(attr & EFI_VARIABLE_RUNTIME_ACCESS) && moksbstate == 1) ++ if (!(attr & EFI_VARIABLE_NON_VOLATILE) && moksbstate == 1) + return efi_secureboot_mode_disabled; + + secure_boot_enabled: +diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c +index 05ae8bcc9d671..9780f32a9f243 100644 +--- a/drivers/firmware/efi/libstub/x86-stub.c ++++ b/drivers/firmware/efi/libstub/x86-stub.c +@@ -517,6 +517,13 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + ++ /* ++ * Disregard any setup data that was provided by the bootloader: ++ * setup_data could be pointing anywhere, and we have no way of ++ * authenticating or validating the payload. 
++ */ ++ hdr->setup_data = 0; ++ + efi_stub_entry(handle, sys_table_arg, boot_params); + /* not reached */ + +diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c +index 312309be0287d..56656fb519f85 100644 +--- a/drivers/gpio/gpio-ixp4xx.c ++++ b/drivers/gpio/gpio-ixp4xx.c +@@ -63,6 +63,14 @@ static void ixp4xx_gpio_irq_ack(struct irq_data *d) + __raw_writel(BIT(d->hwirq), g->base + IXP4XX_REG_GPIS); + } + ++static void ixp4xx_gpio_mask_irq(struct irq_data *d) ++{ ++ struct gpio_chip *gc = irq_data_get_irq_chip_data(d); ++ ++ irq_chip_mask_parent(d); ++ gpiochip_disable_irq(gc, d->hwirq); ++} ++ + static void ixp4xx_gpio_irq_unmask(struct irq_data *d) + { + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); +@@ -72,6 +80,7 @@ static void ixp4xx_gpio_irq_unmask(struct irq_data *d) + if (!(g->irq_edge & BIT(d->hwirq))) + ixp4xx_gpio_irq_ack(d); + ++ gpiochip_enable_irq(gc, d->hwirq); + irq_chip_unmask_parent(d); + } + +@@ -149,12 +158,14 @@ static int ixp4xx_gpio_irq_set_type(struct irq_data *d, unsigned int type) + return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); + } + +-static struct irq_chip ixp4xx_gpio_irqchip = { ++static const struct irq_chip ixp4xx_gpio_irqchip = { + .name = "IXP4GPIO", + .irq_ack = ixp4xx_gpio_irq_ack, +- .irq_mask = irq_chip_mask_parent, ++ .irq_mask = ixp4xx_gpio_mask_irq, + .irq_unmask = ixp4xx_gpio_irq_unmask, + .irq_set_type = ixp4xx_gpio_irq_set_type, ++ .flags = IRQCHIP_IMMUTABLE, ++ GPIOCHIP_IRQ_RESOURCE_HELPERS, + }; + + static int ixp4xx_gpio_child_to_parent_hwirq(struct gpio_chip *gc, +@@ -263,7 +274,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) + g->gc.owner = THIS_MODULE; + + girq = &g->gc.irq; +- girq->chip = &ixp4xx_gpio_irqchip; ++ gpio_irq_chip_set_chip(girq, &ixp4xx_gpio_irqchip); + girq->fwnode = g->fwnode; + girq->parent_domain = parent; + girq->child_to_parent_hwirq = ixp4xx_gpio_child_to_parent_hwirq; +diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c +index a2e505a7545cd..523dfd17dd922 100644 +--- a/drivers/gpio/gpio-mockup.c ++++ b/drivers/gpio/gpio-mockup.c +@@ -533,8 +533,10 @@ static int __init gpio_mockup_register_chip(int idx) + } + + fwnode = fwnode_create_software_node(properties, NULL); +- if (IS_ERR(fwnode)) ++ if (IS_ERR(fwnode)) { ++ kfree_strarray(line_names, ngpio); + return PTR_ERR(fwnode); ++ } + + pdevinfo.name = "gpio-mockup"; + pdevinfo.id = idx; +@@ -597,9 +599,9 @@ static int __init gpio_mockup_init(void) + + static void __exit gpio_mockup_exit(void) + { ++ gpio_mockup_unregister_pdevs(); + debugfs_remove_recursive(gpio_mockup_dbg_dir); + platform_driver_unregister(&gpio_mockup_driver); +- gpio_mockup_unregister_pdevs(); + } + + module_init(gpio_mockup_init); +diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c +index d8a26e503ca5d..f163f5ca857be 100644 +--- a/drivers/gpio/gpio-mt7621.c ++++ b/drivers/gpio/gpio-mt7621.c +@@ -112,6 +112,8 @@ mediatek_gpio_irq_unmask(struct irq_data *d) + unsigned long flags; + u32 rise, fall, high, low; + ++ gpiochip_enable_irq(gc, d->hwirq); ++ + spin_lock_irqsave(&rg->lock, flags); + rise = mtk_gpio_r32(rg, GPIO_REG_REDGE); + fall = mtk_gpio_r32(rg, GPIO_REG_FEDGE); +@@ -143,6 +145,8 @@ mediatek_gpio_irq_mask(struct irq_data *d) + mtk_gpio_w32(rg, GPIO_REG_HLVL, high & ~BIT(pin)); + mtk_gpio_w32(rg, GPIO_REG_LLVL, low & ~BIT(pin)); + spin_unlock_irqrestore(&rg->lock, flags); ++ ++ gpiochip_disable_irq(gc, d->hwirq); + } + + static int +@@ -204,6 +208,16 @@ mediatek_gpio_xlate(struct gpio_chip *chip, + return 
gpio % MTK_BANK_WIDTH; + } + ++static const struct irq_chip mt7621_irq_chip = { ++ .name = "mt7621-gpio", ++ .irq_mask_ack = mediatek_gpio_irq_mask, ++ .irq_mask = mediatek_gpio_irq_mask, ++ .irq_unmask = mediatek_gpio_irq_unmask, ++ .irq_set_type = mediatek_gpio_irq_type, ++ .flags = IRQCHIP_IMMUTABLE, ++ GPIOCHIP_IRQ_RESOURCE_HELPERS, ++}; ++ + static int + mediatek_gpio_bank_probe(struct device *dev, int bank) + { +@@ -238,11 +252,6 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) + return -ENOMEM; + + rg->chip.offset = bank * MTK_BANK_WIDTH; +- rg->irq_chip.name = dev_name(dev); +- rg->irq_chip.irq_unmask = mediatek_gpio_irq_unmask; +- rg->irq_chip.irq_mask = mediatek_gpio_irq_mask; +- rg->irq_chip.irq_mask_ack = mediatek_gpio_irq_mask; +- rg->irq_chip.irq_set_type = mediatek_gpio_irq_type; + + if (mtk->gpio_irq) { + struct gpio_irq_chip *girq; +@@ -262,7 +271,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) + } + + girq = &rg->chip.irq; +- girq->chip = &rg->irq_chip; ++ gpio_irq_chip_set_chip(girq, &mt7621_irq_chip); + /* This will let us handle the parent IRQ in the driver */ + girq->parent_handler = NULL; + girq->num_parents = 0; +diff --git a/drivers/gpio/gpio-tqmx86.c b/drivers/gpio/gpio-tqmx86.c +index fa4bc7481f9a6..e739dcea61b23 100644 +--- a/drivers/gpio/gpio-tqmx86.c ++++ b/drivers/gpio/gpio-tqmx86.c +@@ -307,6 +307,8 @@ static int tqmx86_gpio_probe(struct platform_device *pdev) + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_simple_irq; + girq->init_valid_mask = tqmx86_init_irq_valid_mask; ++ ++ irq_domain_set_pm_device(girq->domain, dev); + } + + ret = devm_gpiochip_add_data(dev, chip, gpio); +@@ -315,8 +317,6 @@ static int tqmx86_gpio_probe(struct platform_device *pdev) + goto out_pm_dis; + } + +- irq_domain_set_pm_device(girq->domain, dev); +- + dev_info(dev, "GPIO functionality initialized with %d pins\n", + chip->ngpio); + +diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c +index b26e643383762..21fee9ed7f0d2 100644 +--- a/drivers/gpio/gpiolib-cdev.c ++++ b/drivers/gpio/gpiolib-cdev.c +@@ -1975,7 +1975,6 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) + ret = -ENODEV; + goto out_free_le; + } +- le->irq = irq; + + if (eflags & GPIOEVENT_REQUEST_RISING_EDGE) + irqflags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ? 
+@@ -1989,7 +1988,7 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) + init_waitqueue_head(&le->wait); + + /* Request a thread to read the events */ +- ret = request_threaded_irq(le->irq, ++ ret = request_threaded_irq(irq, + lineevent_irq_handler, + lineevent_irq_thread, + irqflags, +@@ -1998,6 +1997,8 @@ static int lineevent_create(struct gpio_device *gdev, void __user *ip) + if (ret) + goto out_free_le; + ++ le->irq = irq; ++ + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = fd; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +index 4dfd6724b3caa..0a8c15c3a04c3 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +@@ -35,6 +35,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -495,6 +497,12 @@ static const struct drm_framebuffer_funcs amdgpu_fb_funcs = { + .create_handle = drm_gem_fb_create_handle, + }; + ++static const struct drm_framebuffer_funcs amdgpu_fb_funcs_atomic = { ++ .destroy = drm_gem_fb_destroy, ++ .create_handle = drm_gem_fb_create_handle, ++ .dirty = drm_atomic_helper_dirtyfb, ++}; ++ + uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev, + uint64_t bo_flags) + { +@@ -1069,7 +1077,10 @@ static int amdgpu_display_gem_fb_verify_and_init(struct drm_device *dev, + if (ret) + goto err; + +- ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs); ++ if (drm_drv_uses_atomic_modeset(dev)) ++ ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs_atomic); ++ else ++ ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs); + if (ret) + goto err; + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +index b19bf0c3f3737..79ce654bd3dad 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +@@ -748,7 +748,7 @@ static int psp_tmr_init(struct psp_context *psp) + } + + pptr = amdgpu_sriov_vf(psp->adev) ? &tmr_buf : NULL; +- ret = amdgpu_bo_create_kernel(psp->adev, tmr_size, PSP_TMR_SIZE(psp->adev), ++ ret = amdgpu_bo_create_kernel(psp->adev, tmr_size, PSP_TMR_ALIGNMENT, + AMDGPU_GEM_DOMAIN_VRAM, + &psp->tmr_bo, &psp->tmr_mc_addr, pptr); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +index e431f49949319..cd366c7f311fd 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +@@ -36,6 +36,7 @@ + #define PSP_CMD_BUFFER_SIZE 0x1000 + #define PSP_1_MEG 0x100000 + #define PSP_TMR_SIZE(adev) ((adev)->asic_type == CHIP_ALDEBARAN ? 
0x800000 : 0x400000) ++#define PSP_TMR_ALIGNMENT 0x100000 + #define PSP_FW_NAME_LEN 0x24 + + enum psp_shared_mem_size { +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index dac202ae864dd..9193ca5d6fe7a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -1805,7 +1805,8 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) + amdgpu_ras_query_error_status(adev, &info); + + if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && +- adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { ++ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && ++ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { + if (amdgpu_ras_reset_error_status(adev, info.head.block)) + dev_warn(adev->dev, "Failed to reset error counter and error status"); + } +diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +index cdc0c97798483..6c1fd471a4c7d 100644 +--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c ++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +@@ -28,6 +28,14 @@ + #include "nbio/nbio_7_7_0_sh_mask.h" + #include + ++static void nbio_v7_7_remap_hdp_registers(struct amdgpu_device *adev) ++{ ++ WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_MEM_FLUSH_CNTL, ++ adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); ++ WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_REG_FLUSH_CNTL, ++ adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); ++} ++ + static u32 nbio_v7_7_get_rev_id(struct amdgpu_device *adev) + { + u32 tmp; +@@ -237,4 +245,5 @@ const struct amdgpu_nbio_funcs nbio_v7_7_funcs = { + .ih_doorbell_range = nbio_v7_7_ih_doorbell_range, + .ih_control = nbio_v7_7_ih_control, + .init_registers = nbio_v7_7_init_registers, ++ .remap_hdp_registers = nbio_v7_7_remap_hdp_registers, + }; +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c +index f47d82da115c9..42a567e71439b 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c ++++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c +@@ -6651,8 +6651,7 @@ static double CalculateUrgentLatency( + return ret; + } + +- +-static void UseMinimumDCFCLK( ++static noinline_for_stack void UseMinimumDCFCLK( + struct display_mode_lib *mode_lib, + int MaxInterDCNTileRepeaters, + int MaxPrefetchMode, +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c +index e4b9fd31223c9..40a672236198e 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c ++++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c +@@ -261,33 +261,13 @@ static void CalculateRowBandwidth( + + static void CalculateFlipSchedule( + struct display_mode_lib *mode_lib, ++ unsigned int k, + double HostVMInefficiencyFactor, + double UrgentExtraLatency, + double UrgentLatency, +- unsigned int GPUVMMaxPageTableLevels, +- bool HostVMEnable, +- unsigned int HostVMMaxNonCachedPageTableLevels, +- bool GPUVMEnable, +- double HostVMMinPageSize, + double PDEAndMetaPTEBytesPerFrame, + double MetaRowBytes, +- double DPTEBytesPerRow, +- double BandwidthAvailableForImmediateFlip, +- unsigned int TotImmediateFlipBytes, +- enum source_format_class SourcePixelFormat, +- double LineTime, +- double VRatio, +- double VRatioChroma, +- double Tno_bw, +- bool DCCEnable, +- unsigned int dpte_row_height, +- unsigned int 
meta_row_height, +- unsigned int dpte_row_height_chroma, +- unsigned int meta_row_height_chroma, +- double *DestinationLinesToRequestVMInImmediateFlip, +- double *DestinationLinesToRequestRowInImmediateFlip, +- double *final_flip_bw, +- bool *ImmediateFlipSupportedForPipe); ++ double DPTEBytesPerRow); + static double CalculateWriteBackDelay( + enum source_format_class WritebackPixelFormat, + double WritebackHRatio, +@@ -321,64 +301,28 @@ static void CalculateVupdateAndDynamicMetadataParameters( + static void CalculateWatermarksAndDRAMSpeedChangeSupport( + struct display_mode_lib *mode_lib, + unsigned int PrefetchMode, +- unsigned int NumberOfActivePlanes, +- unsigned int MaxLineBufferLines, +- unsigned int LineBufferSize, +- unsigned int WritebackInterfaceBufferSize, + double DCFCLK, + double ReturnBW, +- bool SynchronizedVBlank, +- unsigned int dpte_group_bytes[], +- unsigned int MetaChunkSize, + double UrgentLatency, + double ExtraLatency, +- double WritebackLatency, +- double WritebackChunkSize, + double SOCCLK, +- double DRAMClockChangeLatency, +- double SRExitTime, +- double SREnterPlusExitTime, +- double SRExitZ8Time, +- double SREnterPlusExitZ8Time, + double DCFCLKDeepSleep, + unsigned int DETBufferSizeY[], + unsigned int DETBufferSizeC[], + unsigned int SwathHeightY[], + unsigned int SwathHeightC[], +- unsigned int LBBitPerPixel[], + double SwathWidthY[], + double SwathWidthC[], +- double HRatio[], +- double HRatioChroma[], +- unsigned int vtaps[], +- unsigned int VTAPsChroma[], +- double VRatio[], +- double VRatioChroma[], +- unsigned int HTotal[], +- double PixelClock[], +- unsigned int BlendingAndTiming[], + unsigned int DPPPerPlane[], + double BytePerPixelDETY[], + double BytePerPixelDETC[], +- double DSTXAfterScaler[], +- double DSTYAfterScaler[], +- bool WritebackEnable[], +- enum source_format_class WritebackPixelFormat[], +- double WritebackDestinationWidth[], +- double WritebackDestinationHeight[], +- double WritebackSourceHeight[], + bool UnboundedRequestEnabled, + int unsigned CompressedBufferSizeInkByte, + enum clock_change_support *DRAMClockChangeSupport, +- double *UrgentWatermark, +- double *WritebackUrgentWatermark, +- double *DRAMClockChangeWatermark, +- double *WritebackDRAMClockChangeWatermark, + double *StutterExitWatermark, + double *StutterEnterPlusExitWatermark, + double *Z8StutterExitWatermark, +- double *Z8StutterEnterPlusExitWatermark, +- double *MinActiveDRAMClockChangeLatencySupported); ++ double *Z8StutterEnterPlusExitWatermark); + + static void CalculateDCFCLKDeepSleep( + struct display_mode_lib *mode_lib, +@@ -2914,33 +2858,13 @@ static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerforman + for (k = 0; k < v->NumberOfActivePlanes; ++k) { + CalculateFlipSchedule( + mode_lib, ++ k, + HostVMInefficiencyFactor, + v->UrgentExtraLatency, + v->UrgentLatency, +- v->GPUVMMaxPageTableLevels, +- v->HostVMEnable, +- v->HostVMMaxNonCachedPageTableLevels, +- v->GPUVMEnable, +- v->HostVMMinPageSize, + v->PDEAndMetaPTEBytesFrame[k], + v->MetaRowByte[k], +- v->PixelPTEBytesPerRow[k], +- v->BandwidthAvailableForImmediateFlip, +- v->TotImmediateFlipBytes, +- v->SourcePixelFormat[k], +- v->HTotal[k] / v->PixelClock[k], +- v->VRatio[k], +- v->VRatioChroma[k], +- v->Tno_bw[k], +- v->DCCEnable[k], +- v->dpte_row_height[k], +- v->meta_row_height[k], +- v->dpte_row_height_chroma[k], +- v->meta_row_height_chroma[k], +- &v->DestinationLinesToRequestVMInImmediateFlip[k], +- &v->DestinationLinesToRequestRowInImmediateFlip[k], +- &v->final_flip_bw[k], +- 
&v->ImmediateFlipSupportedForPipe[k]); ++ v->PixelPTEBytesPerRow[k]); + } + + v->total_dcn_read_bw_with_flip = 0.0; +@@ -3027,64 +2951,28 @@ static void DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerforman + CalculateWatermarksAndDRAMSpeedChangeSupport( + mode_lib, + PrefetchMode, +- v->NumberOfActivePlanes, +- v->MaxLineBufferLines, +- v->LineBufferSize, +- v->WritebackInterfaceBufferSize, + v->DCFCLK, + v->ReturnBW, +- v->SynchronizedVBlank, +- v->dpte_group_bytes, +- v->MetaChunkSize, + v->UrgentLatency, + v->UrgentExtraLatency, +- v->WritebackLatency, +- v->WritebackChunkSize, + v->SOCCLK, +- v->DRAMClockChangeLatency, +- v->SRExitTime, +- v->SREnterPlusExitTime, +- v->SRExitZ8Time, +- v->SREnterPlusExitZ8Time, + v->DCFCLKDeepSleep, + v->DETBufferSizeY, + v->DETBufferSizeC, + v->SwathHeightY, + v->SwathHeightC, +- v->LBBitPerPixel, + v->SwathWidthY, + v->SwathWidthC, +- v->HRatio, +- v->HRatioChroma, +- v->vtaps, +- v->VTAPsChroma, +- v->VRatio, +- v->VRatioChroma, +- v->HTotal, +- v->PixelClock, +- v->BlendingAndTiming, + v->DPPPerPlane, + v->BytePerPixelDETY, + v->BytePerPixelDETC, +- v->DSTXAfterScaler, +- v->DSTYAfterScaler, +- v->WritebackEnable, +- v->WritebackPixelFormat, +- v->WritebackDestinationWidth, +- v->WritebackDestinationHeight, +- v->WritebackSourceHeight, + v->UnboundedRequestEnabled, + v->CompressedBufferSizeInkByte, + &DRAMClockChangeSupport, +- &v->UrgentWatermark, +- &v->WritebackUrgentWatermark, +- &v->DRAMClockChangeWatermark, +- &v->WritebackDRAMClockChangeWatermark, + &v->StutterExitWatermark, + &v->StutterEnterPlusExitWatermark, + &v->Z8StutterExitWatermark, +- &v->Z8StutterEnterPlusExitWatermark, +- &v->MinActiveDRAMClockChangeLatencySupported); ++ &v->Z8StutterEnterPlusExitWatermark); + + for (k = 0; k < v->NumberOfActivePlanes; ++k) { + if (v->WritebackEnable[k] == true) { +@@ -3696,61 +3584,43 @@ static void CalculateRowBandwidth( + + static void CalculateFlipSchedule( + struct display_mode_lib *mode_lib, ++ unsigned int k, + double HostVMInefficiencyFactor, + double UrgentExtraLatency, + double UrgentLatency, +- unsigned int GPUVMMaxPageTableLevels, +- bool HostVMEnable, +- unsigned int HostVMMaxNonCachedPageTableLevels, +- bool GPUVMEnable, +- double HostVMMinPageSize, + double PDEAndMetaPTEBytesPerFrame, + double MetaRowBytes, +- double DPTEBytesPerRow, +- double BandwidthAvailableForImmediateFlip, +- unsigned int TotImmediateFlipBytes, +- enum source_format_class SourcePixelFormat, +- double LineTime, +- double VRatio, +- double VRatioChroma, +- double Tno_bw, +- bool DCCEnable, +- unsigned int dpte_row_height, +- unsigned int meta_row_height, +- unsigned int dpte_row_height_chroma, +- unsigned int meta_row_height_chroma, +- double *DestinationLinesToRequestVMInImmediateFlip, +- double *DestinationLinesToRequestRowInImmediateFlip, +- double *final_flip_bw, +- bool *ImmediateFlipSupportedForPipe) ++ double DPTEBytesPerRow) + { ++ struct vba_vars_st *v = &mode_lib->vba; + double min_row_time = 0.0; + unsigned int HostVMDynamicLevelsTrips; + double TimeForFetchingMetaPTEImmediateFlip; + double TimeForFetchingRowInVBlankImmediateFlip; + double ImmediateFlipBW; ++ double LineTime = v->HTotal[k] / v->PixelClock[k]; + +- if (GPUVMEnable == true && HostVMEnable == true) { +- HostVMDynamicLevelsTrips = HostVMMaxNonCachedPageTableLevels; ++ if (v->GPUVMEnable == true && v->HostVMEnable == true) { ++ HostVMDynamicLevelsTrips = v->HostVMMaxNonCachedPageTableLevels; + } else { + HostVMDynamicLevelsTrips = 0; + } + +- if (GPUVMEnable == true || 
DCCEnable == true) { +- ImmediateFlipBW = (PDEAndMetaPTEBytesPerFrame + MetaRowBytes + DPTEBytesPerRow) * BandwidthAvailableForImmediateFlip / TotImmediateFlipBytes; ++ if (v->GPUVMEnable == true || v->DCCEnable[k] == true) { ++ ImmediateFlipBW = (PDEAndMetaPTEBytesPerFrame + MetaRowBytes + DPTEBytesPerRow) * v->BandwidthAvailableForImmediateFlip / v->TotImmediateFlipBytes; + } + +- if (GPUVMEnable == true) { ++ if (v->GPUVMEnable == true) { + TimeForFetchingMetaPTEImmediateFlip = dml_max3( +- Tno_bw + PDEAndMetaPTEBytesPerFrame * HostVMInefficiencyFactor / ImmediateFlipBW, +- UrgentExtraLatency + UrgentLatency * (GPUVMMaxPageTableLevels * (HostVMDynamicLevelsTrips + 1) - 1), ++ v->Tno_bw[k] + PDEAndMetaPTEBytesPerFrame * HostVMInefficiencyFactor / ImmediateFlipBW, ++ UrgentExtraLatency + UrgentLatency * (v->GPUVMMaxPageTableLevels * (HostVMDynamicLevelsTrips + 1) - 1), + LineTime / 4.0); + } else { + TimeForFetchingMetaPTEImmediateFlip = 0; + } + +- *DestinationLinesToRequestVMInImmediateFlip = dml_ceil(4.0 * (TimeForFetchingMetaPTEImmediateFlip / LineTime), 1) / 4.0; +- if ((GPUVMEnable == true || DCCEnable == true)) { ++ v->DestinationLinesToRequestVMInImmediateFlip[k] = dml_ceil(4.0 * (TimeForFetchingMetaPTEImmediateFlip / LineTime), 1) / 4.0; ++ if ((v->GPUVMEnable == true || v->DCCEnable[k] == true)) { + TimeForFetchingRowInVBlankImmediateFlip = dml_max3( + (MetaRowBytes + DPTEBytesPerRow * HostVMInefficiencyFactor) / ImmediateFlipBW, + UrgentLatency * (HostVMDynamicLevelsTrips + 1), +@@ -3759,54 +3629,54 @@ static void CalculateFlipSchedule( + TimeForFetchingRowInVBlankImmediateFlip = 0; + } + +- *DestinationLinesToRequestRowInImmediateFlip = dml_ceil(4.0 * (TimeForFetchingRowInVBlankImmediateFlip / LineTime), 1) / 4.0; ++ v->DestinationLinesToRequestRowInImmediateFlip[k] = dml_ceil(4.0 * (TimeForFetchingRowInVBlankImmediateFlip / LineTime), 1) / 4.0; + +- if (GPUVMEnable == true) { +- *final_flip_bw = dml_max( +- PDEAndMetaPTEBytesPerFrame * HostVMInefficiencyFactor / (*DestinationLinesToRequestVMInImmediateFlip * LineTime), +- (MetaRowBytes + DPTEBytesPerRow * HostVMInefficiencyFactor) / (*DestinationLinesToRequestRowInImmediateFlip * LineTime)); +- } else if ((GPUVMEnable == true || DCCEnable == true)) { +- *final_flip_bw = (MetaRowBytes + DPTEBytesPerRow * HostVMInefficiencyFactor) / (*DestinationLinesToRequestRowInImmediateFlip * LineTime); ++ if (v->GPUVMEnable == true) { ++ v->final_flip_bw[k] = dml_max( ++ PDEAndMetaPTEBytesPerFrame * HostVMInefficiencyFactor / (v->DestinationLinesToRequestVMInImmediateFlip[k] * LineTime), ++ (MetaRowBytes + DPTEBytesPerRow * HostVMInefficiencyFactor) / (v->DestinationLinesToRequestRowInImmediateFlip[k] * LineTime)); ++ } else if ((v->GPUVMEnable == true || v->DCCEnable[k] == true)) { ++ v->final_flip_bw[k] = (MetaRowBytes + DPTEBytesPerRow * HostVMInefficiencyFactor) / (v->DestinationLinesToRequestRowInImmediateFlip[k] * LineTime); + } else { +- *final_flip_bw = 0; ++ v->final_flip_bw[k] = 0; + } + +- if (SourcePixelFormat == dm_420_8 || SourcePixelFormat == dm_420_10 || SourcePixelFormat == dm_rgbe_alpha) { +- if (GPUVMEnable == true && DCCEnable != true) { +- min_row_time = dml_min(dpte_row_height * LineTime / VRatio, dpte_row_height_chroma * LineTime / VRatioChroma); +- } else if (GPUVMEnable != true && DCCEnable == true) { +- min_row_time = dml_min(meta_row_height * LineTime / VRatio, meta_row_height_chroma * LineTime / VRatioChroma); ++ if (v->SourcePixelFormat[k] == dm_420_8 || v->SourcePixelFormat[k] == dm_420_10 || 
v->SourcePixelFormat[k] == dm_rgbe_alpha) { ++ if (v->GPUVMEnable == true && v->DCCEnable[k] != true) { ++ min_row_time = dml_min(v->dpte_row_height[k] * LineTime / v->VRatio[k], v->dpte_row_height_chroma[k] * LineTime / v->VRatioChroma[k]); ++ } else if (v->GPUVMEnable != true && v->DCCEnable[k] == true) { ++ min_row_time = dml_min(v->meta_row_height[k] * LineTime / v->VRatio[k], v->meta_row_height_chroma[k] * LineTime / v->VRatioChroma[k]); + } else { + min_row_time = dml_min4( +- dpte_row_height * LineTime / VRatio, +- meta_row_height * LineTime / VRatio, +- dpte_row_height_chroma * LineTime / VRatioChroma, +- meta_row_height_chroma * LineTime / VRatioChroma); ++ v->dpte_row_height[k] * LineTime / v->VRatio[k], ++ v->meta_row_height[k] * LineTime / v->VRatio[k], ++ v->dpte_row_height_chroma[k] * LineTime / v->VRatioChroma[k], ++ v->meta_row_height_chroma[k] * LineTime / v->VRatioChroma[k]); + } + } else { +- if (GPUVMEnable == true && DCCEnable != true) { +- min_row_time = dpte_row_height * LineTime / VRatio; +- } else if (GPUVMEnable != true && DCCEnable == true) { +- min_row_time = meta_row_height * LineTime / VRatio; ++ if (v->GPUVMEnable == true && v->DCCEnable[k] != true) { ++ min_row_time = v->dpte_row_height[k] * LineTime / v->VRatio[k]; ++ } else if (v->GPUVMEnable != true && v->DCCEnable[k] == true) { ++ min_row_time = v->meta_row_height[k] * LineTime / v->VRatio[k]; + } else { +- min_row_time = dml_min(dpte_row_height * LineTime / VRatio, meta_row_height * LineTime / VRatio); ++ min_row_time = dml_min(v->dpte_row_height[k] * LineTime / v->VRatio[k], v->meta_row_height[k] * LineTime / v->VRatio[k]); + } + } + +- if (*DestinationLinesToRequestVMInImmediateFlip >= 32 || *DestinationLinesToRequestRowInImmediateFlip >= 16 ++ if (v->DestinationLinesToRequestVMInImmediateFlip[k] >= 32 || v->DestinationLinesToRequestRowInImmediateFlip[k] >= 16 + || TimeForFetchingMetaPTEImmediateFlip + 2 * TimeForFetchingRowInVBlankImmediateFlip > min_row_time) { +- *ImmediateFlipSupportedForPipe = false; ++ v->ImmediateFlipSupportedForPipe[k] = false; + } else { +- *ImmediateFlipSupportedForPipe = true; ++ v->ImmediateFlipSupportedForPipe[k] = true; + } + + #ifdef __DML_VBA_DEBUG__ +- dml_print("DML::%s: DestinationLinesToRequestVMInImmediateFlip = %f\n", __func__, *DestinationLinesToRequestVMInImmediateFlip); +- dml_print("DML::%s: DestinationLinesToRequestRowInImmediateFlip = %f\n", __func__, *DestinationLinesToRequestRowInImmediateFlip); ++ dml_print("DML::%s: DestinationLinesToRequestVMInImmediateFlip = %f\n", __func__, v->DestinationLinesToRequestVMInImmediateFlip[k]); ++ dml_print("DML::%s: DestinationLinesToRequestRowInImmediateFlip = %f\n", __func__, v->DestinationLinesToRequestRowInImmediateFlip[k]); + dml_print("DML::%s: TimeForFetchingMetaPTEImmediateFlip = %f\n", __func__, TimeForFetchingMetaPTEImmediateFlip); + dml_print("DML::%s: TimeForFetchingRowInVBlankImmediateFlip = %f\n", __func__, TimeForFetchingRowInVBlankImmediateFlip); + dml_print("DML::%s: min_row_time = %f\n", __func__, min_row_time); +- dml_print("DML::%s: ImmediateFlipSupportedForPipe = %d\n", __func__, *ImmediateFlipSupportedForPipe); ++ dml_print("DML::%s: ImmediateFlipSupportedForPipe = %d\n", __func__, v->ImmediateFlipSupportedForPipe[k]); + #endif + + } +@@ -5397,33 +5267,13 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l + for (k = 0; k < v->NumberOfActivePlanes; k++) { + CalculateFlipSchedule( + mode_lib, ++ k, + HostVMInefficiencyFactor, + v->ExtraLatency, + v->UrgLatency[i], +- 
v->GPUVMMaxPageTableLevels, +- v->HostVMEnable, +- v->HostVMMaxNonCachedPageTableLevels, +- v->GPUVMEnable, +- v->HostVMMinPageSize, + v->PDEAndMetaPTEBytesPerFrame[i][j][k], + v->MetaRowBytes[i][j][k], +- v->DPTEBytesPerRow[i][j][k], +- v->BandwidthAvailableForImmediateFlip, +- v->TotImmediateFlipBytes, +- v->SourcePixelFormat[k], +- v->HTotal[k] / v->PixelClock[k], +- v->VRatio[k], +- v->VRatioChroma[k], +- v->Tno_bw[k], +- v->DCCEnable[k], +- v->dpte_row_height[k], +- v->meta_row_height[k], +- v->dpte_row_height_chroma[k], +- v->meta_row_height_chroma[k], +- &v->DestinationLinesToRequestVMInImmediateFlip[k], +- &v->DestinationLinesToRequestRowInImmediateFlip[k], +- &v->final_flip_bw[k], +- &v->ImmediateFlipSupportedForPipe[k]); ++ v->DPTEBytesPerRow[i][j][k]); + } + v->total_dcn_read_bw_with_flip = 0.0; + for (k = 0; k < v->NumberOfActivePlanes; k++) { +@@ -5481,64 +5331,28 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l + CalculateWatermarksAndDRAMSpeedChangeSupport( + mode_lib, + v->PrefetchModePerState[i][j], +- v->NumberOfActivePlanes, +- v->MaxLineBufferLines, +- v->LineBufferSize, +- v->WritebackInterfaceBufferSize, + v->DCFCLKState[i][j], + v->ReturnBWPerState[i][j], +- v->SynchronizedVBlank, +- v->dpte_group_bytes, +- v->MetaChunkSize, + v->UrgLatency[i], + v->ExtraLatency, +- v->WritebackLatency, +- v->WritebackChunkSize, + v->SOCCLKPerState[i], +- v->DRAMClockChangeLatency, +- v->SRExitTime, +- v->SREnterPlusExitTime, +- v->SRExitZ8Time, +- v->SREnterPlusExitZ8Time, + v->ProjectedDCFCLKDeepSleep[i][j], + v->DETBufferSizeYThisState, + v->DETBufferSizeCThisState, + v->SwathHeightYThisState, + v->SwathHeightCThisState, +- v->LBBitPerPixel, + v->SwathWidthYThisState, + v->SwathWidthCThisState, +- v->HRatio, +- v->HRatioChroma, +- v->vtaps, +- v->VTAPsChroma, +- v->VRatio, +- v->VRatioChroma, +- v->HTotal, +- v->PixelClock, +- v->BlendingAndTiming, + v->NoOfDPPThisState, + v->BytePerPixelInDETY, + v->BytePerPixelInDETC, +- v->DSTXAfterScaler, +- v->DSTYAfterScaler, +- v->WritebackEnable, +- v->WritebackPixelFormat, +- v->WritebackDestinationWidth, +- v->WritebackDestinationHeight, +- v->WritebackSourceHeight, + UnboundedRequestEnabledThisState, + CompressedBufferSizeInkByteThisState, + &v->DRAMClockChangeSupport[i][j], +- &v->UrgentWatermark, +- &v->WritebackUrgentWatermark, +- &v->DRAMClockChangeWatermark, +- &v->WritebackDRAMClockChangeWatermark, +- &dummy, + &dummy, + &dummy, + &dummy, +- &v->MinActiveDRAMClockChangeLatencySupported); ++ &dummy); + } + } + +@@ -5663,64 +5477,28 @@ void dml31_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l + static void CalculateWatermarksAndDRAMSpeedChangeSupport( + struct display_mode_lib *mode_lib, + unsigned int PrefetchMode, +- unsigned int NumberOfActivePlanes, +- unsigned int MaxLineBufferLines, +- unsigned int LineBufferSize, +- unsigned int WritebackInterfaceBufferSize, + double DCFCLK, + double ReturnBW, +- bool SynchronizedVBlank, +- unsigned int dpte_group_bytes[], +- unsigned int MetaChunkSize, + double UrgentLatency, + double ExtraLatency, +- double WritebackLatency, +- double WritebackChunkSize, + double SOCCLK, +- double DRAMClockChangeLatency, +- double SRExitTime, +- double SREnterPlusExitTime, +- double SRExitZ8Time, +- double SREnterPlusExitZ8Time, + double DCFCLKDeepSleep, + unsigned int DETBufferSizeY[], + unsigned int DETBufferSizeC[], + unsigned int SwathHeightY[], + unsigned int SwathHeightC[], +- unsigned int LBBitPerPixel[], + double SwathWidthY[], + double 
SwathWidthC[], +- double HRatio[], +- double HRatioChroma[], +- unsigned int vtaps[], +- unsigned int VTAPsChroma[], +- double VRatio[], +- double VRatioChroma[], +- unsigned int HTotal[], +- double PixelClock[], +- unsigned int BlendingAndTiming[], + unsigned int DPPPerPlane[], + double BytePerPixelDETY[], + double BytePerPixelDETC[], +- double DSTXAfterScaler[], +- double DSTYAfterScaler[], +- bool WritebackEnable[], +- enum source_format_class WritebackPixelFormat[], +- double WritebackDestinationWidth[], +- double WritebackDestinationHeight[], +- double WritebackSourceHeight[], + bool UnboundedRequestEnabled, + int unsigned CompressedBufferSizeInkByte, + enum clock_change_support *DRAMClockChangeSupport, +- double *UrgentWatermark, +- double *WritebackUrgentWatermark, +- double *DRAMClockChangeWatermark, +- double *WritebackDRAMClockChangeWatermark, + double *StutterExitWatermark, + double *StutterEnterPlusExitWatermark, + double *Z8StutterExitWatermark, +- double *Z8StutterEnterPlusExitWatermark, +- double *MinActiveDRAMClockChangeLatencySupported) ++ double *Z8StutterEnterPlusExitWatermark) + { + struct vba_vars_st *v = &mode_lib->vba; + double EffectiveLBLatencyHidingY; +@@ -5740,103 +5518,103 @@ static void CalculateWatermarksAndDRAMSpeedChangeSupport( + double TotalPixelBW = 0.0; + int k, j; + +- *UrgentWatermark = UrgentLatency + ExtraLatency; ++ v->UrgentWatermark = UrgentLatency + ExtraLatency; + + #ifdef __DML_VBA_DEBUG__ + dml_print("DML::%s: UrgentLatency = %f\n", __func__, UrgentLatency); + dml_print("DML::%s: ExtraLatency = %f\n", __func__, ExtraLatency); +- dml_print("DML::%s: UrgentWatermark = %f\n", __func__, *UrgentWatermark); ++ dml_print("DML::%s: UrgentWatermark = %f\n", __func__, v->UrgentWatermark); + #endif + +- *DRAMClockChangeWatermark = DRAMClockChangeLatency + *UrgentWatermark; ++ v->DRAMClockChangeWatermark = v->DRAMClockChangeLatency + v->UrgentWatermark; + + #ifdef __DML_VBA_DEBUG__ +- dml_print("DML::%s: DRAMClockChangeLatency = %f\n", __func__, DRAMClockChangeLatency); +- dml_print("DML::%s: DRAMClockChangeWatermark = %f\n", __func__, *DRAMClockChangeWatermark); ++ dml_print("DML::%s: v->DRAMClockChangeLatency = %f\n", __func__, v->DRAMClockChangeLatency); ++ dml_print("DML::%s: DRAMClockChangeWatermark = %f\n", __func__, v->DRAMClockChangeWatermark); + #endif + + v->TotalActiveWriteback = 0; +- for (k = 0; k < NumberOfActivePlanes; ++k) { +- if (WritebackEnable[k] == true) { ++ for (k = 0; k < v->NumberOfActivePlanes; ++k) { ++ if (v->WritebackEnable[k] == true) { + v->TotalActiveWriteback = v->TotalActiveWriteback + 1; + } + } + + if (v->TotalActiveWriteback <= 1) { +- *WritebackUrgentWatermark = WritebackLatency; ++ v->WritebackUrgentWatermark = v->WritebackLatency; + } else { +- *WritebackUrgentWatermark = WritebackLatency + WritebackChunkSize * 1024.0 / 32.0 / SOCCLK; ++ v->WritebackUrgentWatermark = v->WritebackLatency + v->WritebackChunkSize * 1024.0 / 32.0 / SOCCLK; + } + + if (v->TotalActiveWriteback <= 1) { +- *WritebackDRAMClockChangeWatermark = DRAMClockChangeLatency + WritebackLatency; ++ v->WritebackDRAMClockChangeWatermark = v->DRAMClockChangeLatency + v->WritebackLatency; + } else { +- *WritebackDRAMClockChangeWatermark = DRAMClockChangeLatency + WritebackLatency + WritebackChunkSize * 1024.0 / 32.0 / SOCCLK; ++ v->WritebackDRAMClockChangeWatermark = v->DRAMClockChangeLatency + v->WritebackLatency + v->WritebackChunkSize * 1024.0 / 32.0 / SOCCLK; + } + +- for (k = 0; k < NumberOfActivePlanes; ++k) { ++ for (k = 0; k < 
v->NumberOfActivePlanes; ++k) { + TotalPixelBW = TotalPixelBW +- + DPPPerPlane[k] * (SwathWidthY[k] * BytePerPixelDETY[k] * VRatio[k] + SwathWidthC[k] * BytePerPixelDETC[k] * VRatioChroma[k]) +- / (HTotal[k] / PixelClock[k]); ++ + DPPPerPlane[k] * (SwathWidthY[k] * BytePerPixelDETY[k] * v->VRatio[k] + SwathWidthC[k] * BytePerPixelDETC[k] * v->VRatioChroma[k]) ++ / (v->HTotal[k] / v->PixelClock[k]); + } + +- for (k = 0; k < NumberOfActivePlanes; ++k) { ++ for (k = 0; k < v->NumberOfActivePlanes; ++k) { + double EffectiveDETBufferSizeY = DETBufferSizeY[k]; + + v->LBLatencyHidingSourceLinesY = dml_min( +- (double) MaxLineBufferLines, +- dml_floor(LineBufferSize / LBBitPerPixel[k] / (SwathWidthY[k] / dml_max(HRatio[k], 1.0)), 1)) - (vtaps[k] - 1); ++ (double) v->MaxLineBufferLines, ++ dml_floor(v->LineBufferSize / v->LBBitPerPixel[k] / (SwathWidthY[k] / dml_max(v->HRatio[k], 1.0)), 1)) - (v->vtaps[k] - 1); + + v->LBLatencyHidingSourceLinesC = dml_min( +- (double) MaxLineBufferLines, +- dml_floor(LineBufferSize / LBBitPerPixel[k] / (SwathWidthC[k] / dml_max(HRatioChroma[k], 1.0)), 1)) - (VTAPsChroma[k] - 1); ++ (double) v->MaxLineBufferLines, ++ dml_floor(v->LineBufferSize / v->LBBitPerPixel[k] / (SwathWidthC[k] / dml_max(v->HRatioChroma[k], 1.0)), 1)) - (v->VTAPsChroma[k] - 1); + +- EffectiveLBLatencyHidingY = v->LBLatencyHidingSourceLinesY / VRatio[k] * (HTotal[k] / PixelClock[k]); ++ EffectiveLBLatencyHidingY = v->LBLatencyHidingSourceLinesY / v->VRatio[k] * (v->HTotal[k] / v->PixelClock[k]); + +- EffectiveLBLatencyHidingC = v->LBLatencyHidingSourceLinesC / VRatioChroma[k] * (HTotal[k] / PixelClock[k]); ++ EffectiveLBLatencyHidingC = v->LBLatencyHidingSourceLinesC / v->VRatioChroma[k] * (v->HTotal[k] / v->PixelClock[k]); + + if (UnboundedRequestEnabled) { + EffectiveDETBufferSizeY = EffectiveDETBufferSizeY +- + CompressedBufferSizeInkByte * 1024 * SwathWidthY[k] * BytePerPixelDETY[k] * VRatio[k] / (HTotal[k] / PixelClock[k]) / TotalPixelBW; ++ + CompressedBufferSizeInkByte * 1024 * SwathWidthY[k] * BytePerPixelDETY[k] * v->VRatio[k] / (v->HTotal[k] / v->PixelClock[k]) / TotalPixelBW; + } + + LinesInDETY[k] = (double) EffectiveDETBufferSizeY / BytePerPixelDETY[k] / SwathWidthY[k]; + LinesInDETYRoundedDownToSwath[k] = dml_floor(LinesInDETY[k], SwathHeightY[k]); +- FullDETBufferingTimeY = LinesInDETYRoundedDownToSwath[k] * (HTotal[k] / PixelClock[k]) / VRatio[k]; ++ FullDETBufferingTimeY = LinesInDETYRoundedDownToSwath[k] * (v->HTotal[k] / v->PixelClock[k]) / v->VRatio[k]; + if (BytePerPixelDETC[k] > 0) { + LinesInDETC = v->DETBufferSizeC[k] / BytePerPixelDETC[k] / SwathWidthC[k]; + LinesInDETCRoundedDownToSwath = dml_floor(LinesInDETC, SwathHeightC[k]); +- FullDETBufferingTimeC = LinesInDETCRoundedDownToSwath * (HTotal[k] / PixelClock[k]) / VRatioChroma[k]; ++ FullDETBufferingTimeC = LinesInDETCRoundedDownToSwath * (v->HTotal[k] / v->PixelClock[k]) / v->VRatioChroma[k]; + } else { + LinesInDETC = 0; + FullDETBufferingTimeC = 999999; + } + + ActiveDRAMClockChangeLatencyMarginY = EffectiveLBLatencyHidingY + FullDETBufferingTimeY +- - ((double) DSTXAfterScaler[k] / HTotal[k] + DSTYAfterScaler[k]) * HTotal[k] / PixelClock[k] - *UrgentWatermark - *DRAMClockChangeWatermark; ++ - ((double) v->DSTXAfterScaler[k] / v->HTotal[k] + v->DSTYAfterScaler[k]) * v->HTotal[k] / v->PixelClock[k] - v->UrgentWatermark - v->DRAMClockChangeWatermark; + +- if (NumberOfActivePlanes > 1) { ++ if (v->NumberOfActivePlanes > 1) { + ActiveDRAMClockChangeLatencyMarginY = ActiveDRAMClockChangeLatencyMarginY +- - (1 - 1.0 / 
NumberOfActivePlanes) * SwathHeightY[k] * HTotal[k] / PixelClock[k] / VRatio[k]; ++ - (1 - 1.0 / v->NumberOfActivePlanes) * SwathHeightY[k] * v->HTotal[k] / v->PixelClock[k] / v->VRatio[k]; + } + + if (BytePerPixelDETC[k] > 0) { + ActiveDRAMClockChangeLatencyMarginC = EffectiveLBLatencyHidingC + FullDETBufferingTimeC +- - ((double) DSTXAfterScaler[k] / HTotal[k] + DSTYAfterScaler[k]) * HTotal[k] / PixelClock[k] - *UrgentWatermark - *DRAMClockChangeWatermark; ++ - ((double) v->DSTXAfterScaler[k] / v->HTotal[k] + v->DSTYAfterScaler[k]) * v->HTotal[k] / v->PixelClock[k] - v->UrgentWatermark - v->DRAMClockChangeWatermark; + +- if (NumberOfActivePlanes > 1) { ++ if (v->NumberOfActivePlanes > 1) { + ActiveDRAMClockChangeLatencyMarginC = ActiveDRAMClockChangeLatencyMarginC +- - (1 - 1.0 / NumberOfActivePlanes) * SwathHeightC[k] * HTotal[k] / PixelClock[k] / VRatioChroma[k]; ++ - (1 - 1.0 / v->NumberOfActivePlanes) * SwathHeightC[k] * v->HTotal[k] / v->PixelClock[k] / v->VRatioChroma[k]; + } + v->ActiveDRAMClockChangeLatencyMargin[k] = dml_min(ActiveDRAMClockChangeLatencyMarginY, ActiveDRAMClockChangeLatencyMarginC); + } else { + v->ActiveDRAMClockChangeLatencyMargin[k] = ActiveDRAMClockChangeLatencyMarginY; + } + +- if (WritebackEnable[k] == true) { +- WritebackDRAMClockChangeLatencyHiding = WritebackInterfaceBufferSize * 1024 +- / (WritebackDestinationWidth[k] * WritebackDestinationHeight[k] / (WritebackSourceHeight[k] * HTotal[k] / PixelClock[k]) * 4); +- if (WritebackPixelFormat[k] == dm_444_64) { ++ if (v->WritebackEnable[k] == true) { ++ WritebackDRAMClockChangeLatencyHiding = v->WritebackInterfaceBufferSize * 1024 ++ / (v->WritebackDestinationWidth[k] * v->WritebackDestinationHeight[k] / (v->WritebackSourceHeight[k] * v->HTotal[k] / v->PixelClock[k]) * 4); ++ if (v->WritebackPixelFormat[k] == dm_444_64) { + WritebackDRAMClockChangeLatencyHiding = WritebackDRAMClockChangeLatencyHiding / 2; + } + WritebackDRAMClockChangeLatencyMargin = WritebackDRAMClockChangeLatencyHiding - v->WritebackDRAMClockChangeWatermark; +@@ -5846,14 +5624,14 @@ static void CalculateWatermarksAndDRAMSpeedChangeSupport( + + v->MinActiveDRAMClockChangeMargin = 999999; + PlaneWithMinActiveDRAMClockChangeMargin = 0; +- for (k = 0; k < NumberOfActivePlanes; ++k) { ++ for (k = 0; k < v->NumberOfActivePlanes; ++k) { + if (v->ActiveDRAMClockChangeLatencyMargin[k] < v->MinActiveDRAMClockChangeMargin) { + v->MinActiveDRAMClockChangeMargin = v->ActiveDRAMClockChangeLatencyMargin[k]; +- if (BlendingAndTiming[k] == k) { ++ if (v->BlendingAndTiming[k] == k) { + PlaneWithMinActiveDRAMClockChangeMargin = k; + } else { +- for (j = 0; j < NumberOfActivePlanes; ++j) { +- if (BlendingAndTiming[k] == j) { ++ for (j = 0; j < v->NumberOfActivePlanes; ++j) { ++ if (v->BlendingAndTiming[k] == j) { + PlaneWithMinActiveDRAMClockChangeMargin = j; + } + } +@@ -5861,11 +5639,11 @@ static void CalculateWatermarksAndDRAMSpeedChangeSupport( + } + } + +- *MinActiveDRAMClockChangeLatencySupported = v->MinActiveDRAMClockChangeMargin + DRAMClockChangeLatency; ++ v->MinActiveDRAMClockChangeLatencySupported = v->MinActiveDRAMClockChangeMargin + v->DRAMClockChangeLatency ; + + SecondMinActiveDRAMClockChangeMarginOneDisplayInVBLank = 999999; +- for (k = 0; k < NumberOfActivePlanes; ++k) { +- if (!((k == PlaneWithMinActiveDRAMClockChangeMargin) && (BlendingAndTiming[k] == k)) && !(BlendingAndTiming[k] == PlaneWithMinActiveDRAMClockChangeMargin) ++ for (k = 0; k < v->NumberOfActivePlanes; ++k) { ++ if (!((k == PlaneWithMinActiveDRAMClockChangeMargin) && 
(v->BlendingAndTiming[k] == k)) && !(v->BlendingAndTiming[k] == PlaneWithMinActiveDRAMClockChangeMargin) + && v->ActiveDRAMClockChangeLatencyMargin[k] < SecondMinActiveDRAMClockChangeMarginOneDisplayInVBLank) { + SecondMinActiveDRAMClockChangeMarginOneDisplayInVBLank = v->ActiveDRAMClockChangeLatencyMargin[k]; + } +@@ -5873,25 +5651,25 @@ static void CalculateWatermarksAndDRAMSpeedChangeSupport( + + v->TotalNumberOfActiveOTG = 0; + +- for (k = 0; k < NumberOfActivePlanes; ++k) { +- if (BlendingAndTiming[k] == k) { ++ for (k = 0; k < v->NumberOfActivePlanes; ++k) { ++ if (v->BlendingAndTiming[k] == k) { + v->TotalNumberOfActiveOTG = v->TotalNumberOfActiveOTG + 1; + } + } + + if (v->MinActiveDRAMClockChangeMargin > 0 && PrefetchMode == 0) { + *DRAMClockChangeSupport = dm_dram_clock_change_vactive; +- } else if ((SynchronizedVBlank == true || v->TotalNumberOfActiveOTG == 1 ++ } else if ((v->SynchronizedVBlank == true || v->TotalNumberOfActiveOTG == 1 + || SecondMinActiveDRAMClockChangeMarginOneDisplayInVBLank > 0) && PrefetchMode == 0) { + *DRAMClockChangeSupport = dm_dram_clock_change_vblank; + } else { + *DRAMClockChangeSupport = dm_dram_clock_change_unsupported; + } + +- *StutterExitWatermark = SRExitTime + ExtraLatency + 10 / DCFCLKDeepSleep; +- *StutterEnterPlusExitWatermark = (SREnterPlusExitTime + ExtraLatency + 10 / DCFCLKDeepSleep); +- *Z8StutterExitWatermark = SRExitZ8Time + ExtraLatency + 10 / DCFCLKDeepSleep; +- *Z8StutterEnterPlusExitWatermark = SREnterPlusExitZ8Time + ExtraLatency + 10 / DCFCLKDeepSleep; ++ *StutterExitWatermark = v->SRExitTime + ExtraLatency + 10 / DCFCLKDeepSleep; ++ *StutterEnterPlusExitWatermark = (v->SREnterPlusExitTime + ExtraLatency + 10 / DCFCLKDeepSleep); ++ *Z8StutterExitWatermark = v->SRExitZ8Time + ExtraLatency + 10 / DCFCLKDeepSleep; ++ *Z8StutterEnterPlusExitWatermark = v->SREnterPlusExitZ8Time + ExtraLatency + 10 / DCFCLKDeepSleep; + + #ifdef __DML_VBA_DEBUG__ + dml_print("DML::%s: StutterExitWatermark = %f\n", __func__, *StutterExitWatermark); +diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c +index 64a38f08f4974..5a51be753e87f 100644 +--- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c ++++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c +@@ -1603,6 +1603,7 @@ static void interpolate_user_regamma(uint32_t hw_points_num, + struct fixed31_32 lut2; + struct fixed31_32 delta_lut; + struct fixed31_32 delta_index; ++ const struct fixed31_32 one = dc_fixpt_from_int(1); + + i = 0; + /* fixed_pt library has problems handling too small values */ +@@ -1631,6 +1632,9 @@ static void interpolate_user_regamma(uint32_t hw_points_num, + } else + hw_x = coordinates_x[i].x; + ++ if (dc_fixpt_le(one, hw_x)) ++ hw_x = one; ++ + norm_x = dc_fixpt_mul(norm_factor, hw_x); + index = dc_fixpt_floor(norm_x); + if (index < 0 || index > 255) +diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +index 32bb6b1d95261..d13e455c8827e 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +@@ -368,6 +368,17 @@ static void sienna_cichlid_check_bxco_support(struct smu_context *smu) + smu_baco->platform_support = + (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : + false; ++ ++ /* ++ * Disable BACO entry/exit completely on below SKUs to ++ * avoid hardware intermittent failures. 
++ */ ++ if (((adev->pdev->device == 0x73A1) && ++ (adev->pdev->revision == 0x00)) || ++ ((adev->pdev->device == 0x73BF) && ++ (adev->pdev->revision == 0xCF))) ++ smu_baco->platform_support = false; ++ + } + } + +diff --git a/drivers/gpu/drm/gma500/cdv_device.c b/drivers/gpu/drm/gma500/cdv_device.c +index dd32b484dd825..ce96234f3df20 100644 +--- a/drivers/gpu/drm/gma500/cdv_device.c ++++ b/drivers/gpu/drm/gma500/cdv_device.c +@@ -581,11 +581,9 @@ static const struct psb_offset cdv_regmap[2] = { + static int cdv_chip_setup(struct drm_device *dev) + { + struct drm_psb_private *dev_priv = to_drm_psb_private(dev); +- struct pci_dev *pdev = to_pci_dev(dev->dev); + INIT_WORK(&dev_priv->hotplug_work, cdv_hotplug_work_func); + +- if (pci_enable_msi(pdev)) +- dev_warn(dev->dev, "Enabling MSI failed!\n"); ++ dev_priv->use_msi = true; + dev_priv->regmap = cdv_regmap; + gma_get_core_freq(dev); + psb_intel_opregion_init(dev); +diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c +index dffe37490206d..4b7627a726378 100644 +--- a/drivers/gpu/drm/gma500/gem.c ++++ b/drivers/gpu/drm/gma500/gem.c +@@ -112,12 +112,12 @@ static void psb_gem_free_object(struct drm_gem_object *obj) + { + struct psb_gem_object *pobj = to_psb_gem_object(obj); + +- drm_gem_object_release(obj); +- + /* Undo the mmap pin if we are destroying the object */ + if (pobj->mmapping) + psb_gem_unpin(pobj); + ++ drm_gem_object_release(obj); ++ + WARN_ON(pobj->in_gart && !pobj->stolen); + + release_resource(&pobj->resource); +diff --git a/drivers/gpu/drm/gma500/gma_display.c b/drivers/gpu/drm/gma500/gma_display.c +index 34ec3fca09ba6..12287c9bb4d80 100644 +--- a/drivers/gpu/drm/gma500/gma_display.c ++++ b/drivers/gpu/drm/gma500/gma_display.c +@@ -531,15 +531,18 @@ int gma_crtc_page_flip(struct drm_crtc *crtc, + WARN_ON(drm_crtc_vblank_get(crtc) != 0); + + gma_crtc->page_flip_event = event; ++ spin_unlock_irqrestore(&dev->event_lock, flags); + + /* Call this locked if we want an event at vblank interrupt. 
*/ + ret = crtc_funcs->mode_set_base(crtc, crtc->x, crtc->y, old_fb); + if (ret) { +- gma_crtc->page_flip_event = NULL; +- drm_crtc_vblank_put(crtc); ++ spin_lock_irqsave(&dev->event_lock, flags); ++ if (gma_crtc->page_flip_event) { ++ gma_crtc->page_flip_event = NULL; ++ drm_crtc_vblank_put(crtc); ++ } ++ spin_unlock_irqrestore(&dev->event_lock, flags); + } +- +- spin_unlock_irqrestore(&dev->event_lock, flags); + } else { + ret = crtc_funcs->mode_set_base(crtc, crtc->x, crtc->y, old_fb); + } +diff --git a/drivers/gpu/drm/gma500/oaktrail_device.c b/drivers/gpu/drm/gma500/oaktrail_device.c +index 5923a9c893122..f90e628cb482c 100644 +--- a/drivers/gpu/drm/gma500/oaktrail_device.c ++++ b/drivers/gpu/drm/gma500/oaktrail_device.c +@@ -501,12 +501,9 @@ static const struct psb_offset oaktrail_regmap[2] = { + static int oaktrail_chip_setup(struct drm_device *dev) + { + struct drm_psb_private *dev_priv = to_drm_psb_private(dev); +- struct pci_dev *pdev = to_pci_dev(dev->dev); + int ret; + +- if (pci_enable_msi(pdev)) +- dev_warn(dev->dev, "Enabling MSI failed!\n"); +- ++ dev_priv->use_msi = true; + dev_priv->regmap = oaktrail_regmap; + + ret = mid_chip_setup(dev); +diff --git a/drivers/gpu/drm/gma500/power.c b/drivers/gpu/drm/gma500/power.c +index b91de6d36e412..66873085d4505 100644 +--- a/drivers/gpu/drm/gma500/power.c ++++ b/drivers/gpu/drm/gma500/power.c +@@ -139,8 +139,6 @@ static void gma_suspend_pci(struct pci_dev *pdev) + dev_priv->regs.saveBSM = bsm; + pci_read_config_dword(pdev, 0xFC, &vbt); + dev_priv->regs.saveVBT = vbt; +- pci_read_config_dword(pdev, PSB_PCIx_MSI_ADDR_LOC, &dev_priv->msi_addr); +- pci_read_config_dword(pdev, PSB_PCIx_MSI_DATA_LOC, &dev_priv->msi_data); + + pci_disable_device(pdev); + pci_set_power_state(pdev, PCI_D3hot); +@@ -168,9 +166,6 @@ static bool gma_resume_pci(struct pci_dev *pdev) + pci_restore_state(pdev); + pci_write_config_dword(pdev, 0x5c, dev_priv->regs.saveBSM); + pci_write_config_dword(pdev, 0xFC, dev_priv->regs.saveVBT); +- /* restoring MSI address and data in PCIx space */ +- pci_write_config_dword(pdev, PSB_PCIx_MSI_ADDR_LOC, dev_priv->msi_addr); +- pci_write_config_dword(pdev, PSB_PCIx_MSI_DATA_LOC, dev_priv->msi_data); + ret = pci_enable_device(pdev); + + if (ret != 0) +@@ -223,8 +218,7 @@ int gma_power_resume(struct device *_dev) + mutex_lock(&power_mutex); + gma_resume_pci(pdev); + gma_resume_display(pdev); +- gma_irq_preinstall(dev); +- gma_irq_postinstall(dev); ++ gma_irq_install(dev); + mutex_unlock(&power_mutex); + return 0; + } +diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c +index 1d8744f3e7020..54e756b486060 100644 +--- a/drivers/gpu/drm/gma500/psb_drv.c ++++ b/drivers/gpu/drm/gma500/psb_drv.c +@@ -383,7 +383,7 @@ static int psb_driver_load(struct drm_device *dev, unsigned long flags) + PSB_WVDC32(0xFFFFFFFF, PSB_INT_MASK_R); + spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags); + +- gma_irq_install(dev, pdev->irq); ++ gma_irq_install(dev); + + dev->max_vblank_count = 0xffffff; /* only 24 bits of frame count */ + +diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h +index 0ddfec1a0851d..4c3fc5eaf6ad5 100644 +--- a/drivers/gpu/drm/gma500/psb_drv.h ++++ b/drivers/gpu/drm/gma500/psb_drv.h +@@ -490,6 +490,7 @@ struct drm_psb_private { + int rpm_enabled; + + /* MID specific */ ++ bool use_msi; + bool has_gct; + struct oaktrail_gct_data gct_data; + +@@ -499,10 +500,6 @@ struct drm_psb_private { + /* Register state */ + struct psb_save_area regs; + +- /* MSI reg save */ +- 
uint32_t msi_addr; +- uint32_t msi_data; +- + /* Hotplug handling */ + struct work_struct hotplug_work; + +diff --git a/drivers/gpu/drm/gma500/psb_irq.c b/drivers/gpu/drm/gma500/psb_irq.c +index e6e6d61bbeab6..038f18ed0a95e 100644 +--- a/drivers/gpu/drm/gma500/psb_irq.c ++++ b/drivers/gpu/drm/gma500/psb_irq.c +@@ -316,17 +316,24 @@ void gma_irq_postinstall(struct drm_device *dev) + spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags); + } + +-int gma_irq_install(struct drm_device *dev, unsigned int irq) ++int gma_irq_install(struct drm_device *dev) + { ++ struct drm_psb_private *dev_priv = to_drm_psb_private(dev); ++ struct pci_dev *pdev = to_pci_dev(dev->dev); + int ret; + +- if (irq == IRQ_NOTCONNECTED) ++ if (dev_priv->use_msi && pci_enable_msi(pdev)) { ++ dev_warn(dev->dev, "Enabling MSI failed!\n"); ++ dev_priv->use_msi = false; ++ } ++ ++ if (pdev->irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + gma_irq_preinstall(dev); + + /* PCI devices require shared interrupts. */ +- ret = request_irq(irq, gma_irq_handler, IRQF_SHARED, dev->driver->name, dev); ++ ret = request_irq(pdev->irq, gma_irq_handler, IRQF_SHARED, dev->driver->name, dev); + if (ret) + return ret; + +@@ -369,6 +376,8 @@ void gma_irq_uninstall(struct drm_device *dev) + spin_unlock_irqrestore(&dev_priv->irqmask_lock, irqflags); + + free_irq(pdev->irq, dev); ++ if (dev_priv->use_msi) ++ pci_disable_msi(pdev); + } + + int gma_crtc_enable_vblank(struct drm_crtc *crtc) +diff --git a/drivers/gpu/drm/gma500/psb_irq.h b/drivers/gpu/drm/gma500/psb_irq.h +index b51e395194fff..7648f69824a5d 100644 +--- a/drivers/gpu/drm/gma500/psb_irq.h ++++ b/drivers/gpu/drm/gma500/psb_irq.h +@@ -17,7 +17,7 @@ struct drm_device; + + void gma_irq_preinstall(struct drm_device *dev); + void gma_irq_postinstall(struct drm_device *dev); +-int gma_irq_install(struct drm_device *dev, unsigned int irq); ++int gma_irq_install(struct drm_device *dev); + void gma_irq_uninstall(struct drm_device *dev); + + int gma_crtc_enable_vblank(struct drm_crtc *crtc); +diff --git a/drivers/gpu/drm/hisilicon/hibmc/Kconfig b/drivers/gpu/drm/hisilicon/hibmc/Kconfig +index 073adfe438ddd..4e41c144a2902 100644 +--- a/drivers/gpu/drm/hisilicon/hibmc/Kconfig ++++ b/drivers/gpu/drm/hisilicon/hibmc/Kconfig +@@ -2,6 +2,7 @@ + config DRM_HISI_HIBMC + tristate "DRM Support for Hisilicon Hibmc" + depends on DRM && PCI && (ARM64 || COMPILE_TEST) ++ depends on MMU + select DRM_KMS_HELPER + select DRM_VRAM_HELPER + select DRM_TTM +diff --git a/drivers/gpu/drm/i915/display/g4x_dp.c b/drivers/gpu/drm/i915/display/g4x_dp.c +index 5a957acebfd62..82ad8fe7440c0 100644 +--- a/drivers/gpu/drm/i915/display/g4x_dp.c ++++ b/drivers/gpu/drm/i915/display/g4x_dp.c +@@ -395,26 +395,8 @@ static void intel_dp_get_config(struct intel_encoder *encoder, + intel_dotclock_calculate(pipe_config->port_clock, + &pipe_config->dp_m_n); + +- if (intel_dp_is_edp(intel_dp) && dev_priv->vbt.edp.bpp && +- pipe_config->pipe_bpp > dev_priv->vbt.edp.bpp) { +- /* +- * This is a big fat ugly hack. +- * +- * Some machines in UEFI boot mode provide us a VBT that has 18 +- * bpp and 1.62 GHz link bandwidth for eDP, which for reasons +- * unknown we fail to light up. Yet the same BIOS boots up with +- * 24 bpp and 2.7 GHz link. Use the same bpp as the BIOS uses as +- * max, not what it tells us to use. +- * +- * Note: This will still be broken if the eDP panel is not lit +- * up by the BIOS, and thus we can't get the mode at module +- * load. 
+- */ +- drm_dbg_kms(&dev_priv->drm, +- "pipe has %d bpp for eDP panel, overriding BIOS-provided max %d bpp\n", +- pipe_config->pipe_bpp, dev_priv->vbt.edp.bpp); +- dev_priv->vbt.edp.bpp = pipe_config->pipe_bpp; +- } ++ if (intel_dp_is_edp(intel_dp)) ++ intel_edp_fixup_vbt_bpp(encoder, pipe_config->pipe_bpp); + } + + static void +diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c +index 5508ebb9eb434..f416499dad6f3 100644 +--- a/drivers/gpu/drm/i915/display/icl_dsi.c ++++ b/drivers/gpu/drm/i915/display/icl_dsi.c +@@ -1864,7 +1864,8 @@ static void icl_dphy_param_init(struct intel_dsi *intel_dsi) + { + struct drm_device *dev = intel_dsi->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); +- struct mipi_config *mipi_config = dev_priv->vbt.dsi.config; ++ struct intel_connector *connector = intel_dsi->attached_connector; ++ struct mipi_config *mipi_config = connector->panel.vbt.dsi.config; + u32 tlpx_ns; + u32 prepare_cnt, exit_zero_cnt, clk_zero_cnt, trail_cnt; + u32 ths_prepare_ns, tclk_trail_ns; +@@ -2051,6 +2052,8 @@ void icl_dsi_init(struct drm_i915_private *dev_priv) + /* attach connector to encoder */ + intel_connector_attach_encoder(intel_connector, encoder); + ++ intel_bios_init_panel(dev_priv, &intel_connector->panel); ++ + mutex_lock(&dev->mode_config.mutex); + intel_panel_add_vbt_lfp_fixed_mode(intel_connector); + mutex_unlock(&dev->mode_config.mutex); +@@ -2064,13 +2067,20 @@ void icl_dsi_init(struct drm_i915_private *dev_priv) + + intel_backlight_setup(intel_connector, INVALID_PIPE); + +- if (dev_priv->vbt.dsi.config->dual_link) ++ if (intel_connector->panel.vbt.dsi.config->dual_link) + intel_dsi->ports = BIT(PORT_A) | BIT(PORT_B); + else + intel_dsi->ports = BIT(port); + +- intel_dsi->dcs_backlight_ports = dev_priv->vbt.dsi.bl_ports; +- intel_dsi->dcs_cabc_ports = dev_priv->vbt.dsi.cabc_ports; ++ if (drm_WARN_ON(&dev_priv->drm, intel_connector->panel.vbt.dsi.bl_ports & ~intel_dsi->ports)) ++ intel_connector->panel.vbt.dsi.bl_ports &= intel_dsi->ports; ++ ++ intel_dsi->dcs_backlight_ports = intel_connector->panel.vbt.dsi.bl_ports; ++ ++ if (drm_WARN_ON(&dev_priv->drm, intel_connector->panel.vbt.dsi.cabc_ports & ~intel_dsi->ports)) ++ intel_connector->panel.vbt.dsi.cabc_ports &= intel_dsi->ports; ++ ++ intel_dsi->dcs_cabc_ports = intel_connector->panel.vbt.dsi.cabc_ports; + + for_each_dsi_port(port, intel_dsi->ports) { + struct intel_dsi_host *host; +diff --git a/drivers/gpu/drm/i915/display/intel_backlight.c b/drivers/gpu/drm/i915/display/intel_backlight.c +index 3e200a2e4ba29..5182bb66bd289 100644 +--- a/drivers/gpu/drm/i915/display/intel_backlight.c ++++ b/drivers/gpu/drm/i915/display/intel_backlight.c +@@ -1158,9 +1158,10 @@ static u32 vlv_hz_to_pwm(struct intel_connector *connector, u32 pwm_freq_hz) + return DIV_ROUND_CLOSEST(clock, pwm_freq_hz * mul); + } + +-static u16 get_vbt_pwm_freq(struct drm_i915_private *dev_priv) ++static u16 get_vbt_pwm_freq(struct intel_connector *connector) + { +- u16 pwm_freq_hz = dev_priv->vbt.backlight.pwm_freq_hz; ++ struct drm_i915_private *dev_priv = to_i915(connector->base.dev); ++ u16 pwm_freq_hz = connector->panel.vbt.backlight.pwm_freq_hz; + + if (pwm_freq_hz) { + drm_dbg_kms(&dev_priv->drm, +@@ -1180,7 +1181,7 @@ static u32 get_backlight_max_vbt(struct intel_connector *connector) + { + struct drm_i915_private *dev_priv = to_i915(connector->base.dev); + struct intel_panel *panel = &connector->panel; +- u16 pwm_freq_hz = get_vbt_pwm_freq(dev_priv); ++ u16 pwm_freq_hz = 
get_vbt_pwm_freq(connector); + u32 pwm; + + if (!panel->backlight.pwm_funcs->hz_to_pwm) { +@@ -1217,11 +1218,11 @@ static u32 get_backlight_min_vbt(struct intel_connector *connector) + * against this by letting the minimum be at most (arbitrarily chosen) + * 25% of the max. + */ +- min = clamp_t(int, dev_priv->vbt.backlight.min_brightness, 0, 64); +- if (min != dev_priv->vbt.backlight.min_brightness) { ++ min = clamp_t(int, connector->panel.vbt.backlight.min_brightness, 0, 64); ++ if (min != connector->panel.vbt.backlight.min_brightness) { + drm_dbg_kms(&dev_priv->drm, + "clamping VBT min backlight %d/255 to %d/255\n", +- dev_priv->vbt.backlight.min_brightness, min); ++ connector->panel.vbt.backlight.min_brightness, min); + } + + /* vbt value is a coefficient in range [0..255] */ +@@ -1410,7 +1411,7 @@ bxt_setup_backlight(struct intel_connector *connector, enum pipe unused) + struct intel_panel *panel = &connector->panel; + u32 pwm_ctl, val; + +- panel->backlight.controller = dev_priv->vbt.backlight.controller; ++ panel->backlight.controller = connector->panel.vbt.backlight.controller; + + pwm_ctl = intel_de_read(dev_priv, + BXT_BLC_PWM_CTL(panel->backlight.controller)); +@@ -1483,7 +1484,7 @@ static int ext_pwm_setup_backlight(struct intel_connector *connector, + u32 level; + + /* Get the right PWM chip for DSI backlight according to VBT */ +- if (dev_priv->vbt.dsi.config->pwm_blc == PPS_BLC_PMIC) { ++ if (connector->panel.vbt.dsi.config->pwm_blc == PPS_BLC_PMIC) { + panel->backlight.pwm = pwm_get(dev->dev, "pwm_pmic_backlight"); + desc = "PMIC"; + } else { +@@ -1512,11 +1513,11 @@ static int ext_pwm_setup_backlight(struct intel_connector *connector, + + drm_dbg_kms(&dev_priv->drm, "PWM already enabled at freq %ld, VBT freq %d, level %d\n", + NSEC_PER_SEC / (unsigned long)panel->backlight.pwm_state.period, +- get_vbt_pwm_freq(dev_priv), level); ++ get_vbt_pwm_freq(connector), level); + } else { + /* Set period from VBT frequency, leave other settings at 0. 
*/ + panel->backlight.pwm_state.period = +- NSEC_PER_SEC / get_vbt_pwm_freq(dev_priv); ++ NSEC_PER_SEC / get_vbt_pwm_freq(connector); + } + + drm_info(&dev_priv->drm, "Using %s PWM for LCD backlight control\n", +@@ -1601,7 +1602,7 @@ int intel_backlight_setup(struct intel_connector *connector, enum pipe pipe) + struct intel_panel *panel = &connector->panel; + int ret; + +- if (!dev_priv->vbt.backlight.present) { ++ if (!connector->panel.vbt.backlight.present) { + if (dev_priv->quirks & QUIRK_BACKLIGHT_PRESENT) { + drm_dbg_kms(&dev_priv->drm, + "no backlight present per VBT, but present per quirk\n"); +diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c +index 91caf4523b34d..b5de61fe9cc67 100644 +--- a/drivers/gpu/drm/i915/display/intel_bios.c ++++ b/drivers/gpu/drm/i915/display/intel_bios.c +@@ -682,7 +682,8 @@ static int get_panel_type(struct drm_i915_private *i915) + + /* Parse general panel options */ + static void +-parse_panel_options(struct drm_i915_private *i915) ++parse_panel_options(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_lvds_options *lvds_options; + int panel_type; +@@ -692,11 +693,11 @@ parse_panel_options(struct drm_i915_private *i915) + if (!lvds_options) + return; + +- i915->vbt.lvds_dither = lvds_options->pixel_dither; ++ panel->vbt.lvds_dither = lvds_options->pixel_dither; + + panel_type = get_panel_type(i915); + +- i915->vbt.panel_type = panel_type; ++ panel->vbt.panel_type = panel_type; + + drrs_mode = (lvds_options->dps_panel_type_bits + >> (panel_type * 2)) & MODE_MASK; +@@ -707,16 +708,16 @@ parse_panel_options(struct drm_i915_private *i915) + */ + switch (drrs_mode) { + case 0: +- i915->vbt.drrs_type = DRRS_TYPE_STATIC; ++ panel->vbt.drrs_type = DRRS_TYPE_STATIC; + drm_dbg_kms(&i915->drm, "DRRS supported mode is static\n"); + break; + case 2: +- i915->vbt.drrs_type = DRRS_TYPE_SEAMLESS; ++ panel->vbt.drrs_type = DRRS_TYPE_SEAMLESS; + drm_dbg_kms(&i915->drm, + "DRRS supported mode is seamless\n"); + break; + default: +- i915->vbt.drrs_type = DRRS_TYPE_NONE; ++ panel->vbt.drrs_type = DRRS_TYPE_NONE; + drm_dbg_kms(&i915->drm, + "DRRS not supported (VBT input)\n"); + break; +@@ -725,13 +726,14 @@ parse_panel_options(struct drm_i915_private *i915) + + static void + parse_lfp_panel_dtd(struct drm_i915_private *i915, ++ struct intel_panel *panel, + const struct bdb_lvds_lfp_data *lvds_lfp_data, + const struct bdb_lvds_lfp_data_ptrs *lvds_lfp_data_ptrs) + { + const struct lvds_dvo_timing *panel_dvo_timing; + const struct lvds_fp_timing *fp_timing; + struct drm_display_mode *panel_fixed_mode; +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + + panel_dvo_timing = get_lvds_dvo_timing(lvds_lfp_data, + lvds_lfp_data_ptrs, +@@ -743,7 +745,7 @@ parse_lfp_panel_dtd(struct drm_i915_private *i915, + + fill_detail_timing_data(panel_fixed_mode, panel_dvo_timing); + +- i915->vbt.lfp_lvds_vbt_mode = panel_fixed_mode; ++ panel->vbt.lfp_lvds_vbt_mode = panel_fixed_mode; + + drm_dbg_kms(&i915->drm, + "Found panel mode in BIOS VBT legacy lfp table: " DRM_MODE_FMT "\n", +@@ -756,20 +758,21 @@ parse_lfp_panel_dtd(struct drm_i915_private *i915, + /* check the resolution, just to be sure */ + if (fp_timing->x_res == panel_fixed_mode->hdisplay && + fp_timing->y_res == panel_fixed_mode->vdisplay) { +- i915->vbt.bios_lvds_val = fp_timing->lvds_reg_val; ++ panel->vbt.bios_lvds_val = fp_timing->lvds_reg_val; + drm_dbg_kms(&i915->drm, + "VBT initial LVDS value %x\n", +- 
i915->vbt.bios_lvds_val); ++ panel->vbt.bios_lvds_val); + } + } + + static void +-parse_lfp_data(struct drm_i915_private *i915) ++parse_lfp_data(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_lvds_lfp_data *data; + const struct bdb_lvds_lfp_data_tail *tail; + const struct bdb_lvds_lfp_data_ptrs *ptrs; +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + + ptrs = find_section(i915, BDB_LVDS_LFP_DATA_PTRS); + if (!ptrs) +@@ -779,24 +782,25 @@ parse_lfp_data(struct drm_i915_private *i915) + if (!data) + return; + +- if (!i915->vbt.lfp_lvds_vbt_mode) +- parse_lfp_panel_dtd(i915, data, ptrs); ++ if (!panel->vbt.lfp_lvds_vbt_mode) ++ parse_lfp_panel_dtd(i915, panel, data, ptrs); + + tail = get_lfp_data_tail(data, ptrs); + if (!tail) + return; + + if (i915->vbt.version >= 188) { +- i915->vbt.seamless_drrs_min_refresh_rate = ++ panel->vbt.seamless_drrs_min_refresh_rate = + tail->seamless_drrs_min_refresh_rate[panel_type]; + drm_dbg_kms(&i915->drm, + "Seamless DRRS min refresh rate: %d Hz\n", +- i915->vbt.seamless_drrs_min_refresh_rate); ++ panel->vbt.seamless_drrs_min_refresh_rate); + } + } + + static void +-parse_generic_dtd(struct drm_i915_private *i915) ++parse_generic_dtd(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_generic_dtd *generic_dtd; + const struct generic_dtd_entry *dtd; +@@ -831,14 +835,14 @@ parse_generic_dtd(struct drm_i915_private *i915) + + num_dtd = (get_blocksize(generic_dtd) - + sizeof(struct bdb_generic_dtd)) / generic_dtd->gdtd_size; +- if (i915->vbt.panel_type >= num_dtd) { ++ if (panel->vbt.panel_type >= num_dtd) { + drm_err(&i915->drm, + "Panel type %d not found in table of %d DTD's\n", +- i915->vbt.panel_type, num_dtd); ++ panel->vbt.panel_type, num_dtd); + return; + } + +- dtd = &generic_dtd->dtd[i915->vbt.panel_type]; ++ dtd = &generic_dtd->dtd[panel->vbt.panel_type]; + + panel_fixed_mode = kzalloc(sizeof(*panel_fixed_mode), GFP_KERNEL); + if (!panel_fixed_mode) +@@ -881,15 +885,16 @@ parse_generic_dtd(struct drm_i915_private *i915) + "Found panel mode in BIOS VBT generic dtd table: " DRM_MODE_FMT "\n", + DRM_MODE_ARG(panel_fixed_mode)); + +- i915->vbt.lfp_lvds_vbt_mode = panel_fixed_mode; ++ panel->vbt.lfp_lvds_vbt_mode = panel_fixed_mode; + } + + static void +-parse_lfp_backlight(struct drm_i915_private *i915) ++parse_lfp_backlight(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_lfp_backlight_data *backlight_data; + const struct lfp_backlight_data_entry *entry; +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + u16 level; + + backlight_data = find_section(i915, BDB_LVDS_BACKLIGHT); +@@ -905,15 +910,15 @@ parse_lfp_backlight(struct drm_i915_private *i915) + + entry = &backlight_data->data[panel_type]; + +- i915->vbt.backlight.present = entry->type == BDB_BACKLIGHT_TYPE_PWM; +- if (!i915->vbt.backlight.present) { ++ panel->vbt.backlight.present = entry->type == BDB_BACKLIGHT_TYPE_PWM; ++ if (!panel->vbt.backlight.present) { + drm_dbg_kms(&i915->drm, + "PWM backlight not present in VBT (type %u)\n", + entry->type); + return; + } + +- i915->vbt.backlight.type = INTEL_BACKLIGHT_DISPLAY_DDI; ++ panel->vbt.backlight.type = INTEL_BACKLIGHT_DISPLAY_DDI; + if (i915->vbt.version >= 191) { + size_t exp_size; + +@@ -928,13 +933,13 @@ parse_lfp_backlight(struct drm_i915_private *i915) + const struct lfp_backlight_control_method *method; + + method = &backlight_data->backlight_control[panel_type]; +- 
i915->vbt.backlight.type = method->type; +- i915->vbt.backlight.controller = method->controller; ++ panel->vbt.backlight.type = method->type; ++ panel->vbt.backlight.controller = method->controller; + } + } + +- i915->vbt.backlight.pwm_freq_hz = entry->pwm_freq_hz; +- i915->vbt.backlight.active_low_pwm = entry->active_low_pwm; ++ panel->vbt.backlight.pwm_freq_hz = entry->pwm_freq_hz; ++ panel->vbt.backlight.active_low_pwm = entry->active_low_pwm; + + if (i915->vbt.version >= 234) { + u16 min_level; +@@ -955,28 +960,29 @@ parse_lfp_backlight(struct drm_i915_private *i915) + drm_warn(&i915->drm, "Brightness min level > 255\n"); + level = 255; + } +- i915->vbt.backlight.min_brightness = min_level; ++ panel->vbt.backlight.min_brightness = min_level; + +- i915->vbt.backlight.brightness_precision_bits = ++ panel->vbt.backlight.brightness_precision_bits = + backlight_data->brightness_precision_bits[panel_type]; + } else { + level = backlight_data->level[panel_type]; +- i915->vbt.backlight.min_brightness = entry->min_brightness; ++ panel->vbt.backlight.min_brightness = entry->min_brightness; + } + + drm_dbg_kms(&i915->drm, + "VBT backlight PWM modulation frequency %u Hz, " + "active %s, min brightness %u, level %u, controller %u\n", +- i915->vbt.backlight.pwm_freq_hz, +- i915->vbt.backlight.active_low_pwm ? "low" : "high", +- i915->vbt.backlight.min_brightness, ++ panel->vbt.backlight.pwm_freq_hz, ++ panel->vbt.backlight.active_low_pwm ? "low" : "high", ++ panel->vbt.backlight.min_brightness, + level, +- i915->vbt.backlight.controller); ++ panel->vbt.backlight.controller); + } + + /* Try to find sdvo panel data */ + static void +-parse_sdvo_panel_data(struct drm_i915_private *i915) ++parse_sdvo_panel_data(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_sdvo_panel_dtds *dtds; + struct drm_display_mode *panel_fixed_mode; +@@ -1009,7 +1015,7 @@ parse_sdvo_panel_data(struct drm_i915_private *i915) + + fill_detail_timing_data(panel_fixed_mode, &dtds->dtds[index]); + +- i915->vbt.sdvo_lvds_vbt_mode = panel_fixed_mode; ++ panel->vbt.sdvo_lvds_vbt_mode = panel_fixed_mode; + + drm_dbg_kms(&i915->drm, + "Found SDVO panel mode in BIOS VBT tables: " DRM_MODE_FMT "\n", +@@ -1188,6 +1194,17 @@ parse_driver_features(struct drm_i915_private *i915) + driver->lvds_config != BDB_DRIVER_FEATURE_INT_SDVO_LVDS) + i915->vbt.int_lvds_support = 0; + } ++} ++ ++static void ++parse_panel_driver_features(struct drm_i915_private *i915, ++ struct intel_panel *panel) ++{ ++ const struct bdb_driver_features *driver; ++ ++ driver = find_section(i915, BDB_DRIVER_FEATURES); ++ if (!driver) ++ return; + + if (i915->vbt.version < 228) { + drm_dbg_kms(&i915->drm, "DRRS State Enabled:%d\n", +@@ -1199,17 +1216,18 @@ parse_driver_features(struct drm_i915_private *i915) + * driver->drrs_enabled=false + */ + if (!driver->drrs_enabled) +- i915->vbt.drrs_type = DRRS_TYPE_NONE; ++ panel->vbt.drrs_type = DRRS_TYPE_NONE; + +- i915->vbt.psr.enable = driver->psr_enabled; ++ panel->vbt.psr.enable = driver->psr_enabled; + } + } + + static void +-parse_power_conservation_features(struct drm_i915_private *i915) ++parse_power_conservation_features(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_lfp_power *power; +- u8 panel_type = i915->vbt.panel_type; ++ u8 panel_type = panel->vbt.panel_type; + + if (i915->vbt.version < 228) + return; +@@ -1218,7 +1236,7 @@ parse_power_conservation_features(struct drm_i915_private *i915) + if (!power) + return; + +- i915->vbt.psr.enable = 
power->psr & BIT(panel_type); ++ panel->vbt.psr.enable = power->psr & BIT(panel_type); + + /* + * If DRRS is not supported, drrs_type has to be set to 0. +@@ -1227,19 +1245,20 @@ parse_power_conservation_features(struct drm_i915_private *i915) + * power->drrs & BIT(panel_type)=false + */ + if (!(power->drrs & BIT(panel_type))) +- i915->vbt.drrs_type = DRRS_TYPE_NONE; ++ panel->vbt.drrs_type = DRRS_TYPE_NONE; + + if (i915->vbt.version >= 232) +- i915->vbt.edp.hobl = power->hobl & BIT(panel_type); ++ panel->vbt.edp.hobl = power->hobl & BIT(panel_type); + } + + static void +-parse_edp(struct drm_i915_private *i915) ++parse_edp(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_edp *edp; + const struct edp_power_seq *edp_pps; + const struct edp_fast_link_params *edp_link_params; +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + + edp = find_section(i915, BDB_EDP); + if (!edp) +@@ -1247,13 +1266,13 @@ parse_edp(struct drm_i915_private *i915) + + switch ((edp->color_depth >> (panel_type * 2)) & 3) { + case EDP_18BPP: +- i915->vbt.edp.bpp = 18; ++ panel->vbt.edp.bpp = 18; + break; + case EDP_24BPP: +- i915->vbt.edp.bpp = 24; ++ panel->vbt.edp.bpp = 24; + break; + case EDP_30BPP: +- i915->vbt.edp.bpp = 30; ++ panel->vbt.edp.bpp = 30; + break; + } + +@@ -1261,14 +1280,14 @@ parse_edp(struct drm_i915_private *i915) + edp_pps = &edp->power_seqs[panel_type]; + edp_link_params = &edp->fast_link_params[panel_type]; + +- i915->vbt.edp.pps = *edp_pps; ++ panel->vbt.edp.pps = *edp_pps; + + switch (edp_link_params->rate) { + case EDP_RATE_1_62: +- i915->vbt.edp.rate = DP_LINK_BW_1_62; ++ panel->vbt.edp.rate = DP_LINK_BW_1_62; + break; + case EDP_RATE_2_7: +- i915->vbt.edp.rate = DP_LINK_BW_2_7; ++ panel->vbt.edp.rate = DP_LINK_BW_2_7; + break; + default: + drm_dbg_kms(&i915->drm, +@@ -1279,13 +1298,13 @@ parse_edp(struct drm_i915_private *i915) + + switch (edp_link_params->lanes) { + case EDP_LANE_1: +- i915->vbt.edp.lanes = 1; ++ panel->vbt.edp.lanes = 1; + break; + case EDP_LANE_2: +- i915->vbt.edp.lanes = 2; ++ panel->vbt.edp.lanes = 2; + break; + case EDP_LANE_4: +- i915->vbt.edp.lanes = 4; ++ panel->vbt.edp.lanes = 4; + break; + default: + drm_dbg_kms(&i915->drm, +@@ -1296,16 +1315,16 @@ parse_edp(struct drm_i915_private *i915) + + switch (edp_link_params->preemphasis) { + case EDP_PREEMPHASIS_NONE: +- i915->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_0; ++ panel->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_0; + break; + case EDP_PREEMPHASIS_3_5dB: +- i915->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_1; ++ panel->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_1; + break; + case EDP_PREEMPHASIS_6dB: +- i915->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_2; ++ panel->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_2; + break; + case EDP_PREEMPHASIS_9_5dB: +- i915->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_3; ++ panel->vbt.edp.preemphasis = DP_TRAIN_PRE_EMPH_LEVEL_3; + break; + default: + drm_dbg_kms(&i915->drm, +@@ -1316,16 +1335,16 @@ parse_edp(struct drm_i915_private *i915) + + switch (edp_link_params->vswing) { + case EDP_VSWING_0_4V: +- i915->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_0; ++ panel->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_0; + break; + case EDP_VSWING_0_6V: +- i915->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_1; ++ panel->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_1; + break; + case EDP_VSWING_0_8V: +- i915->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_2; ++ panel->vbt.edp.vswing = 
DP_TRAIN_VOLTAGE_SWING_LEVEL_2; + break; + case EDP_VSWING_1_2V: +- i915->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_3; ++ panel->vbt.edp.vswing = DP_TRAIN_VOLTAGE_SWING_LEVEL_3; + break; + default: + drm_dbg_kms(&i915->drm, +@@ -1339,24 +1358,25 @@ parse_edp(struct drm_i915_private *i915) + + /* Don't read from VBT if module parameter has valid value*/ + if (i915->params.edp_vswing) { +- i915->vbt.edp.low_vswing = ++ panel->vbt.edp.low_vswing = + i915->params.edp_vswing == 1; + } else { + vswing = (edp->edp_vswing_preemph >> (panel_type * 4)) & 0xF; +- i915->vbt.edp.low_vswing = vswing == 0; ++ panel->vbt.edp.low_vswing = vswing == 0; + } + } + +- i915->vbt.edp.drrs_msa_timing_delay = ++ panel->vbt.edp.drrs_msa_timing_delay = + (edp->sdrrs_msa_timing_delay >> (panel_type * 2)) & 3; + } + + static void +-parse_psr(struct drm_i915_private *i915) ++parse_psr(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_psr *psr; + const struct psr_table *psr_table; +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + + psr = find_section(i915, BDB_PSR); + if (!psr) { +@@ -1366,11 +1386,11 @@ parse_psr(struct drm_i915_private *i915) + + psr_table = &psr->psr_table[panel_type]; + +- i915->vbt.psr.full_link = psr_table->full_link; +- i915->vbt.psr.require_aux_wakeup = psr_table->require_aux_to_wakeup; ++ panel->vbt.psr.full_link = psr_table->full_link; ++ panel->vbt.psr.require_aux_wakeup = psr_table->require_aux_to_wakeup; + + /* Allowed VBT values goes from 0 to 15 */ +- i915->vbt.psr.idle_frames = psr_table->idle_frames < 0 ? 0 : ++ panel->vbt.psr.idle_frames = psr_table->idle_frames < 0 ? 0 : + psr_table->idle_frames > 15 ? 15 : psr_table->idle_frames; + + /* +@@ -1381,13 +1401,13 @@ parse_psr(struct drm_i915_private *i915) + (DISPLAY_VER(i915) >= 9 && !IS_BROXTON(i915))) { + switch (psr_table->tp1_wakeup_time) { + case 0: +- i915->vbt.psr.tp1_wakeup_time_us = 500; ++ panel->vbt.psr.tp1_wakeup_time_us = 500; + break; + case 1: +- i915->vbt.psr.tp1_wakeup_time_us = 100; ++ panel->vbt.psr.tp1_wakeup_time_us = 100; + break; + case 3: +- i915->vbt.psr.tp1_wakeup_time_us = 0; ++ panel->vbt.psr.tp1_wakeup_time_us = 0; + break; + default: + drm_dbg_kms(&i915->drm, +@@ -1395,19 +1415,19 @@ parse_psr(struct drm_i915_private *i915) + psr_table->tp1_wakeup_time); + fallthrough; + case 2: +- i915->vbt.psr.tp1_wakeup_time_us = 2500; ++ panel->vbt.psr.tp1_wakeup_time_us = 2500; + break; + } + + switch (psr_table->tp2_tp3_wakeup_time) { + case 0: +- i915->vbt.psr.tp2_tp3_wakeup_time_us = 500; ++ panel->vbt.psr.tp2_tp3_wakeup_time_us = 500; + break; + case 1: +- i915->vbt.psr.tp2_tp3_wakeup_time_us = 100; ++ panel->vbt.psr.tp2_tp3_wakeup_time_us = 100; + break; + case 3: +- i915->vbt.psr.tp2_tp3_wakeup_time_us = 0; ++ panel->vbt.psr.tp2_tp3_wakeup_time_us = 0; + break; + default: + drm_dbg_kms(&i915->drm, +@@ -1415,12 +1435,12 @@ parse_psr(struct drm_i915_private *i915) + psr_table->tp2_tp3_wakeup_time); + fallthrough; + case 2: +- i915->vbt.psr.tp2_tp3_wakeup_time_us = 2500; ++ panel->vbt.psr.tp2_tp3_wakeup_time_us = 2500; + break; + } + } else { +- i915->vbt.psr.tp1_wakeup_time_us = psr_table->tp1_wakeup_time * 100; +- i915->vbt.psr.tp2_tp3_wakeup_time_us = psr_table->tp2_tp3_wakeup_time * 100; ++ panel->vbt.psr.tp1_wakeup_time_us = psr_table->tp1_wakeup_time * 100; ++ panel->vbt.psr.tp2_tp3_wakeup_time_us = psr_table->tp2_tp3_wakeup_time * 100; + } + + if (i915->vbt.version >= 226) { +@@ -1442,62 +1462,66 @@ parse_psr(struct drm_i915_private 
*i915) + wakeup_time = 2500; + break; + } +- i915->vbt.psr.psr2_tp2_tp3_wakeup_time_us = wakeup_time; ++ panel->vbt.psr.psr2_tp2_tp3_wakeup_time_us = wakeup_time; + } else { + /* Reusing PSR1 wakeup time for PSR2 in older VBTs */ +- i915->vbt.psr.psr2_tp2_tp3_wakeup_time_us = i915->vbt.psr.tp2_tp3_wakeup_time_us; ++ panel->vbt.psr.psr2_tp2_tp3_wakeup_time_us = panel->vbt.psr.tp2_tp3_wakeup_time_us; + } + } + + static void parse_dsi_backlight_ports(struct drm_i915_private *i915, +- u16 version, enum port port) ++ struct intel_panel *panel, ++ enum port port) + { +- if (!i915->vbt.dsi.config->dual_link || version < 197) { +- i915->vbt.dsi.bl_ports = BIT(port); +- if (i915->vbt.dsi.config->cabc_supported) +- i915->vbt.dsi.cabc_ports = BIT(port); ++ enum port port_bc = DISPLAY_VER(i915) >= 11 ? PORT_B : PORT_C; ++ ++ if (!panel->vbt.dsi.config->dual_link || i915->vbt.version < 197) { ++ panel->vbt.dsi.bl_ports = BIT(port); ++ if (panel->vbt.dsi.config->cabc_supported) ++ panel->vbt.dsi.cabc_ports = BIT(port); + + return; + } + +- switch (i915->vbt.dsi.config->dl_dcs_backlight_ports) { ++ switch (panel->vbt.dsi.config->dl_dcs_backlight_ports) { + case DL_DCS_PORT_A: +- i915->vbt.dsi.bl_ports = BIT(PORT_A); ++ panel->vbt.dsi.bl_ports = BIT(PORT_A); + break; + case DL_DCS_PORT_C: +- i915->vbt.dsi.bl_ports = BIT(PORT_C); ++ panel->vbt.dsi.bl_ports = BIT(port_bc); + break; + default: + case DL_DCS_PORT_A_AND_C: +- i915->vbt.dsi.bl_ports = BIT(PORT_A) | BIT(PORT_C); ++ panel->vbt.dsi.bl_ports = BIT(PORT_A) | BIT(port_bc); + break; + } + +- if (!i915->vbt.dsi.config->cabc_supported) ++ if (!panel->vbt.dsi.config->cabc_supported) + return; + +- switch (i915->vbt.dsi.config->dl_dcs_cabc_ports) { ++ switch (panel->vbt.dsi.config->dl_dcs_cabc_ports) { + case DL_DCS_PORT_A: +- i915->vbt.dsi.cabc_ports = BIT(PORT_A); ++ panel->vbt.dsi.cabc_ports = BIT(PORT_A); + break; + case DL_DCS_PORT_C: +- i915->vbt.dsi.cabc_ports = BIT(PORT_C); ++ panel->vbt.dsi.cabc_ports = BIT(port_bc); + break; + default: + case DL_DCS_PORT_A_AND_C: +- i915->vbt.dsi.cabc_ports = +- BIT(PORT_A) | BIT(PORT_C); ++ panel->vbt.dsi.cabc_ports = ++ BIT(PORT_A) | BIT(port_bc); + break; + } + } + + static void +-parse_mipi_config(struct drm_i915_private *i915) ++parse_mipi_config(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + const struct bdb_mipi_config *start; + const struct mipi_config *config; + const struct mipi_pps_data *pps; +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + enum port port; + + /* parse MIPI blocks only if LFP type is MIPI */ +@@ -1505,7 +1529,7 @@ parse_mipi_config(struct drm_i915_private *i915) + return; + + /* Initialize this to undefined indicating no generic MIPI support */ +- i915->vbt.dsi.panel_id = MIPI_DSI_UNDEFINED_PANEL_ID; ++ panel->vbt.dsi.panel_id = MIPI_DSI_UNDEFINED_PANEL_ID; + + /* Block #40 is already parsed and panel_fixed_mode is + * stored in i915->lfp_lvds_vbt_mode +@@ -1532,17 +1556,17 @@ parse_mipi_config(struct drm_i915_private *i915) + pps = &start->pps[panel_type]; + + /* store as of now full data. 
Trim when we realise all is not needed */ +- i915->vbt.dsi.config = kmemdup(config, sizeof(struct mipi_config), GFP_KERNEL); +- if (!i915->vbt.dsi.config) ++ panel->vbt.dsi.config = kmemdup(config, sizeof(struct mipi_config), GFP_KERNEL); ++ if (!panel->vbt.dsi.config) + return; + +- i915->vbt.dsi.pps = kmemdup(pps, sizeof(struct mipi_pps_data), GFP_KERNEL); +- if (!i915->vbt.dsi.pps) { +- kfree(i915->vbt.dsi.config); ++ panel->vbt.dsi.pps = kmemdup(pps, sizeof(struct mipi_pps_data), GFP_KERNEL); ++ if (!panel->vbt.dsi.pps) { ++ kfree(panel->vbt.dsi.config); + return; + } + +- parse_dsi_backlight_ports(i915, i915->vbt.version, port); ++ parse_dsi_backlight_ports(i915, panel, port); + + /* FIXME is the 90 vs. 270 correct? */ + switch (config->rotation) { +@@ -1551,25 +1575,25 @@ parse_mipi_config(struct drm_i915_private *i915) + * Most (all?) VBTs claim 0 degrees despite having + * an upside down panel, thus we do not trust this. + */ +- i915->vbt.dsi.orientation = ++ panel->vbt.dsi.orientation = + DRM_MODE_PANEL_ORIENTATION_UNKNOWN; + break; + case ENABLE_ROTATION_90: +- i915->vbt.dsi.orientation = ++ panel->vbt.dsi.orientation = + DRM_MODE_PANEL_ORIENTATION_RIGHT_UP; + break; + case ENABLE_ROTATION_180: +- i915->vbt.dsi.orientation = ++ panel->vbt.dsi.orientation = + DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP; + break; + case ENABLE_ROTATION_270: +- i915->vbt.dsi.orientation = ++ panel->vbt.dsi.orientation = + DRM_MODE_PANEL_ORIENTATION_LEFT_UP; + break; + } + + /* We have mandatory mipi config blocks. Initialize as generic panel */ +- i915->vbt.dsi.panel_id = MIPI_DSI_GENERIC_PANEL_ID; ++ panel->vbt.dsi.panel_id = MIPI_DSI_GENERIC_PANEL_ID; + } + + /* Find the sequence block and size for the given panel. */ +@@ -1732,13 +1756,14 @@ static int goto_next_sequence_v3(const u8 *data, int index, int total) + * Get len of pre-fixed deassert fragment from a v1 init OTP sequence, + * skip all delay + gpio operands and stop at the first DSI packet op. + */ +-static int get_init_otp_deassert_fragment_len(struct drm_i915_private *i915) ++static int get_init_otp_deassert_fragment_len(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { +- const u8 *data = i915->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP]; ++ const u8 *data = panel->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP]; + int index, len; + + if (drm_WARN_ON(&i915->drm, +- !data || i915->vbt.dsi.seq_version != 1)) ++ !data || panel->vbt.dsi.seq_version != 1)) + return 0; + + /* index = 1 to skip sequence byte */ +@@ -1766,7 +1791,8 @@ static int get_init_otp_deassert_fragment_len(struct drm_i915_private *i915) + * these devices we split the init OTP sequence into a deassert sequence and + * the actual init OTP part. 
+ */ +-static void fixup_mipi_sequences(struct drm_i915_private *i915) ++static void fixup_mipi_sequences(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { + u8 *init_otp; + int len; +@@ -1776,18 +1802,18 @@ static void fixup_mipi_sequences(struct drm_i915_private *i915) + return; + + /* Limit this to v1 vid-mode sequences */ +- if (i915->vbt.dsi.config->is_cmd_mode || +- i915->vbt.dsi.seq_version != 1) ++ if (panel->vbt.dsi.config->is_cmd_mode || ++ panel->vbt.dsi.seq_version != 1) + return; + + /* Only do this if there are otp and assert seqs and no deassert seq */ +- if (!i915->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] || +- !i915->vbt.dsi.sequence[MIPI_SEQ_ASSERT_RESET] || +- i915->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET]) ++ if (!panel->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] || ++ !panel->vbt.dsi.sequence[MIPI_SEQ_ASSERT_RESET] || ++ panel->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET]) + return; + + /* The deassert-sequence ends at the first DSI packet */ +- len = get_init_otp_deassert_fragment_len(i915); ++ len = get_init_otp_deassert_fragment_len(i915, panel); + if (!len) + return; + +@@ -1795,25 +1821,26 @@ static void fixup_mipi_sequences(struct drm_i915_private *i915) + "Using init OTP fragment to deassert reset\n"); + + /* Copy the fragment, update seq byte and terminate it */ +- init_otp = (u8 *)i915->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP]; +- i915->vbt.dsi.deassert_seq = kmemdup(init_otp, len + 1, GFP_KERNEL); +- if (!i915->vbt.dsi.deassert_seq) ++ init_otp = (u8 *)panel->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP]; ++ panel->vbt.dsi.deassert_seq = kmemdup(init_otp, len + 1, GFP_KERNEL); ++ if (!panel->vbt.dsi.deassert_seq) + return; +- i915->vbt.dsi.deassert_seq[0] = MIPI_SEQ_DEASSERT_RESET; +- i915->vbt.dsi.deassert_seq[len] = MIPI_SEQ_ELEM_END; ++ panel->vbt.dsi.deassert_seq[0] = MIPI_SEQ_DEASSERT_RESET; ++ panel->vbt.dsi.deassert_seq[len] = MIPI_SEQ_ELEM_END; + /* Use the copy for deassert */ +- i915->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET] = +- i915->vbt.dsi.deassert_seq; ++ panel->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET] = ++ panel->vbt.dsi.deassert_seq; + /* Replace the last byte of the fragment with init OTP seq byte */ + init_otp[len - 1] = MIPI_SEQ_INIT_OTP; + /* And make MIPI_MIPI_SEQ_INIT_OTP point to it */ +- i915->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] = init_otp + len - 1; ++ panel->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] = init_otp + len - 1; + } + + static void +-parse_mipi_sequence(struct drm_i915_private *i915) ++parse_mipi_sequence(struct drm_i915_private *i915, ++ struct intel_panel *panel) + { +- int panel_type = i915->vbt.panel_type; ++ int panel_type = panel->vbt.panel_type; + const struct bdb_mipi_sequence *sequence; + const u8 *seq_data; + u32 seq_size; +@@ -1821,7 +1848,7 @@ parse_mipi_sequence(struct drm_i915_private *i915) + int index = 0; + + /* Only our generic panel driver uses the sequence block. 
*/ +- if (i915->vbt.dsi.panel_id != MIPI_DSI_GENERIC_PANEL_ID) ++ if (panel->vbt.dsi.panel_id != MIPI_DSI_GENERIC_PANEL_ID) + return; + + sequence = find_section(i915, BDB_MIPI_SEQUENCE); +@@ -1867,7 +1894,7 @@ parse_mipi_sequence(struct drm_i915_private *i915) + drm_dbg_kms(&i915->drm, + "Unsupported sequence %u\n", seq_id); + +- i915->vbt.dsi.sequence[seq_id] = data + index; ++ panel->vbt.dsi.sequence[seq_id] = data + index; + + if (sequence->version >= 3) + index = goto_next_sequence_v3(data, index, seq_size); +@@ -1880,18 +1907,18 @@ parse_mipi_sequence(struct drm_i915_private *i915) + } + } + +- i915->vbt.dsi.data = data; +- i915->vbt.dsi.size = seq_size; +- i915->vbt.dsi.seq_version = sequence->version; ++ panel->vbt.dsi.data = data; ++ panel->vbt.dsi.size = seq_size; ++ panel->vbt.dsi.seq_version = sequence->version; + +- fixup_mipi_sequences(i915); ++ fixup_mipi_sequences(i915, panel); + + drm_dbg(&i915->drm, "MIPI related VBT parsing complete\n"); + return; + + err: + kfree(data); +- memset(i915->vbt.dsi.sequence, 0, sizeof(i915->vbt.dsi.sequence)); ++ memset(panel->vbt.dsi.sequence, 0, sizeof(panel->vbt.dsi.sequence)); + } + + static void +@@ -2645,15 +2672,6 @@ init_vbt_defaults(struct drm_i915_private *i915) + { + i915->vbt.crt_ddc_pin = GMBUS_PIN_VGADDC; + +- /* Default to having backlight */ +- i915->vbt.backlight.present = true; +- +- /* LFP panel data */ +- i915->vbt.lvds_dither = 1; +- +- /* SDVO panel data */ +- i915->vbt.sdvo_lvds_vbt_mode = NULL; +- + /* general features */ + i915->vbt.int_tv_support = 1; + i915->vbt.int_crt_support = 1; +@@ -2673,6 +2691,17 @@ init_vbt_defaults(struct drm_i915_private *i915) + i915->vbt.lvds_ssc_freq); + } + ++/* Common defaults which may be overridden by VBT. */ ++static void ++init_vbt_panel_defaults(struct intel_panel *panel) ++{ ++ /* Default to having backlight */ ++ panel->vbt.backlight.present = true; ++ ++ /* LFP panel data */ ++ panel->vbt.lvds_dither = true; ++} ++ + /* Defaults to initialize only if there is no VBT. 
*/ + static void + init_vbt_missing_defaults(struct drm_i915_private *i915) +@@ -2959,17 +2988,7 @@ void intel_bios_init(struct drm_i915_private *i915) + /* Grab useful general definitions */ + parse_general_features(i915); + parse_general_definitions(i915); +- parse_panel_options(i915); +- parse_generic_dtd(i915); +- parse_lfp_data(i915); +- parse_lfp_backlight(i915); +- parse_sdvo_panel_data(i915); + parse_driver_features(i915); +- parse_power_conservation_features(i915); +- parse_edp(i915); +- parse_psr(i915); +- parse_mipi_config(i915); +- parse_mipi_sequence(i915); + + /* Depends on child device list */ + parse_compression_parameters(i915); +@@ -2988,6 +3007,24 @@ out: + kfree(oprom_vbt); + } + ++void intel_bios_init_panel(struct drm_i915_private *i915, ++ struct intel_panel *panel) ++{ ++ init_vbt_panel_defaults(panel); ++ ++ parse_panel_options(i915, panel); ++ parse_generic_dtd(i915, panel); ++ parse_lfp_data(i915, panel); ++ parse_lfp_backlight(i915, panel); ++ parse_sdvo_panel_data(i915, panel); ++ parse_panel_driver_features(i915, panel); ++ parse_power_conservation_features(i915, panel); ++ parse_edp(i915, panel); ++ parse_psr(i915, panel); ++ parse_mipi_config(i915, panel); ++ parse_mipi_sequence(i915, panel); ++} ++ + /** + * intel_bios_driver_remove - Free any resources allocated by intel_bios_init() + * @i915: i915 device instance +@@ -3007,19 +3044,22 @@ void intel_bios_driver_remove(struct drm_i915_private *i915) + list_del(&entry->node); + kfree(entry); + } ++} + +- kfree(i915->vbt.sdvo_lvds_vbt_mode); +- i915->vbt.sdvo_lvds_vbt_mode = NULL; +- kfree(i915->vbt.lfp_lvds_vbt_mode); +- i915->vbt.lfp_lvds_vbt_mode = NULL; +- kfree(i915->vbt.dsi.data); +- i915->vbt.dsi.data = NULL; +- kfree(i915->vbt.dsi.pps); +- i915->vbt.dsi.pps = NULL; +- kfree(i915->vbt.dsi.config); +- i915->vbt.dsi.config = NULL; +- kfree(i915->vbt.dsi.deassert_seq); +- i915->vbt.dsi.deassert_seq = NULL; ++void intel_bios_fini_panel(struct intel_panel *panel) ++{ ++ kfree(panel->vbt.sdvo_lvds_vbt_mode); ++ panel->vbt.sdvo_lvds_vbt_mode = NULL; ++ kfree(panel->vbt.lfp_lvds_vbt_mode); ++ panel->vbt.lfp_lvds_vbt_mode = NULL; ++ kfree(panel->vbt.dsi.data); ++ panel->vbt.dsi.data = NULL; ++ kfree(panel->vbt.dsi.pps); ++ panel->vbt.dsi.pps = NULL; ++ kfree(panel->vbt.dsi.config); ++ panel->vbt.dsi.config = NULL; ++ kfree(panel->vbt.dsi.deassert_seq); ++ panel->vbt.dsi.deassert_seq = NULL; + } + + /** +diff --git a/drivers/gpu/drm/i915/display/intel_bios.h b/drivers/gpu/drm/i915/display/intel_bios.h +index 4709c4d298059..86129f015718d 100644 +--- a/drivers/gpu/drm/i915/display/intel_bios.h ++++ b/drivers/gpu/drm/i915/display/intel_bios.h +@@ -36,6 +36,7 @@ struct drm_i915_private; + struct intel_bios_encoder_data; + struct intel_crtc_state; + struct intel_encoder; ++struct intel_panel; + enum port; + + enum intel_backlight_type { +@@ -230,6 +231,9 @@ struct mipi_pps_data { + } __packed; + + void intel_bios_init(struct drm_i915_private *dev_priv); ++void intel_bios_init_panel(struct drm_i915_private *dev_priv, ++ struct intel_panel *panel); ++void intel_bios_fini_panel(struct intel_panel *panel); + void intel_bios_driver_remove(struct drm_i915_private *dev_priv); + bool intel_bios_is_valid_vbt(const void *buf, size_t size); + bool intel_bios_is_tv_present(struct drm_i915_private *dev_priv); +diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c +index 9e6fa59eabba7..333871cf3a2c5 100644 +--- a/drivers/gpu/drm/i915/display/intel_ddi.c ++++ 
b/drivers/gpu/drm/i915/display/intel_ddi.c +@@ -3433,26 +3433,8 @@ static void intel_ddi_get_config(struct intel_encoder *encoder, + pipe_config->has_audio = + intel_ddi_is_audio_enabled(dev_priv, cpu_transcoder); + +- if (encoder->type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp.bpp && +- pipe_config->pipe_bpp > dev_priv->vbt.edp.bpp) { +- /* +- * This is a big fat ugly hack. +- * +- * Some machines in UEFI boot mode provide us a VBT that has 18 +- * bpp and 1.62 GHz link bandwidth for eDP, which for reasons +- * unknown we fail to light up. Yet the same BIOS boots up with +- * 24 bpp and 2.7 GHz link. Use the same bpp as the BIOS uses as +- * max, not what it tells us to use. +- * +- * Note: This will still be broken if the eDP panel is not lit +- * up by the BIOS, and thus we can't get the mode at module +- * load. +- */ +- drm_dbg_kms(&dev_priv->drm, +- "pipe has %d bpp for eDP panel, overriding BIOS-provided max %d bpp\n", +- pipe_config->pipe_bpp, dev_priv->vbt.edp.bpp); +- dev_priv->vbt.edp.bpp = pipe_config->pipe_bpp; +- } ++ if (encoder->type == INTEL_OUTPUT_EDP) ++ intel_edp_fixup_vbt_bpp(encoder, pipe_config->pipe_bpp); + + ddi_dotclock_get(pipe_config); + +diff --git a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c +index 85f58dd3df722..b490acd0ab691 100644 +--- a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c ++++ b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c +@@ -1062,17 +1062,18 @@ bool is_hobl_buf_trans(const struct intel_ddi_buf_trans *table) + + static bool use_edp_hobl(struct intel_encoder *encoder) + { +- struct drm_i915_private *i915 = to_i915(encoder->base.dev); + struct intel_dp *intel_dp = enc_to_intel_dp(encoder); ++ struct intel_connector *connector = intel_dp->attached_connector; + +- return i915->vbt.edp.hobl && !intel_dp->hobl_failed; ++ return connector->panel.vbt.edp.hobl && !intel_dp->hobl_failed; + } + + static bool use_edp_low_vswing(struct intel_encoder *encoder) + { +- struct drm_i915_private *i915 = to_i915(encoder->base.dev); ++ struct intel_dp *intel_dp = enc_to_intel_dp(encoder); ++ struct intel_connector *connector = intel_dp->attached_connector; + +- return i915->vbt.edp.low_vswing; ++ return connector->panel.vbt.edp.low_vswing; + } + + static const struct intel_ddi_buf_trans * +diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h +index 408152f9f46a4..e2561c5d4953c 100644 +--- a/drivers/gpu/drm/i915/display/intel_display_types.h ++++ b/drivers/gpu/drm/i915/display/intel_display_types.h +@@ -279,6 +279,73 @@ struct intel_panel_bl_funcs { + u32 (*hz_to_pwm)(struct intel_connector *connector, u32 hz); + }; + ++enum drrs_type { ++ DRRS_TYPE_NONE, ++ DRRS_TYPE_STATIC, ++ DRRS_TYPE_SEAMLESS, ++}; ++ ++struct intel_vbt_panel_data { ++ struct drm_display_mode *lfp_lvds_vbt_mode; /* if any */ ++ struct drm_display_mode *sdvo_lvds_vbt_mode; /* if any */ ++ ++ /* Feature bits */ ++ unsigned int panel_type:4; ++ unsigned int lvds_dither:1; ++ unsigned int bios_lvds_val; /* initial [PCH_]LVDS reg val in VBIOS */ ++ ++ u8 seamless_drrs_min_refresh_rate; ++ enum drrs_type drrs_type; ++ ++ struct { ++ int rate; ++ int lanes; ++ int preemphasis; ++ int vswing; ++ int bpp; ++ struct edp_power_seq pps; ++ u8 drrs_msa_timing_delay; ++ bool low_vswing; ++ bool initialized; ++ bool hobl; ++ } edp; ++ ++ struct { ++ bool enable; ++ bool full_link; ++ bool require_aux_wakeup; ++ int idle_frames; ++ int tp1_wakeup_time_us; ++ int 
tp2_tp3_wakeup_time_us; ++ int psr2_tp2_tp3_wakeup_time_us; ++ } psr; ++ ++ struct { ++ u16 pwm_freq_hz; ++ u16 brightness_precision_bits; ++ bool present; ++ bool active_low_pwm; ++ u8 min_brightness; /* min_brightness/255 of max */ ++ u8 controller; /* brightness controller number */ ++ enum intel_backlight_type type; ++ } backlight; ++ ++ /* MIPI DSI */ ++ struct { ++ u16 panel_id; ++ struct mipi_config *config; ++ struct mipi_pps_data *pps; ++ u16 bl_ports; ++ u16 cabc_ports; ++ u8 seq_version; ++ u32 size; ++ u8 *data; ++ const u8 *sequence[MIPI_SEQ_MAX]; ++ u8 *deassert_seq; /* Used by fixup_mipi_sequences() */ ++ enum drm_panel_orientation orientation; ++ } dsi; ++}; ++ + struct intel_panel { + struct list_head fixed_modes; + +@@ -318,6 +385,8 @@ struct intel_panel { + const struct intel_panel_bl_funcs *pwm_funcs; + void (*power)(struct intel_connector *, bool enable); + } backlight; ++ ++ struct intel_vbt_panel_data vbt; + }; + + struct intel_digital_port; +diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c +index fe8b6b72970a2..0efec6023fbe8 100644 +--- a/drivers/gpu/drm/i915/display/intel_dp.c ++++ b/drivers/gpu/drm/i915/display/intel_dp.c +@@ -1246,11 +1246,12 @@ static int intel_dp_max_bpp(struct intel_dp *intel_dp, + if (intel_dp_is_edp(intel_dp)) { + /* Get bpp from vbt only for panels that dont have bpp in edid */ + if (intel_connector->base.display_info.bpc == 0 && +- dev_priv->vbt.edp.bpp && dev_priv->vbt.edp.bpp < bpp) { ++ intel_connector->panel.vbt.edp.bpp && ++ intel_connector->panel.vbt.edp.bpp < bpp) { + drm_dbg_kms(&dev_priv->drm, + "clamping bpp for eDP panel to BIOS-provided %i\n", +- dev_priv->vbt.edp.bpp); +- bpp = dev_priv->vbt.edp.bpp; ++ intel_connector->panel.vbt.edp.bpp); ++ bpp = intel_connector->panel.vbt.edp.bpp; + } + } + +@@ -1907,7 +1908,7 @@ intel_dp_drrs_compute_config(struct intel_connector *connector, + } + + if (IS_IRONLAKE(i915) || IS_SANDYBRIDGE(i915) || IS_IVYBRIDGE(i915)) +- pipe_config->msa_timing_delay = i915->vbt.edp.drrs_msa_timing_delay; ++ pipe_config->msa_timing_delay = connector->panel.vbt.edp.drrs_msa_timing_delay; + + pipe_config->has_drrs = true; + +@@ -2737,6 +2738,33 @@ static void intel_edp_mso_mode_fixup(struct intel_connector *connector, + DRM_MODE_ARG(mode)); + } + ++void intel_edp_fixup_vbt_bpp(struct intel_encoder *encoder, int pipe_bpp) ++{ ++ struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); ++ struct intel_dp *intel_dp = enc_to_intel_dp(encoder); ++ struct intel_connector *connector = intel_dp->attached_connector; ++ ++ if (connector->panel.vbt.edp.bpp && pipe_bpp > connector->panel.vbt.edp.bpp) { ++ /* ++ * This is a big fat ugly hack. ++ * ++ * Some machines in UEFI boot mode provide us a VBT that has 18 ++ * bpp and 1.62 GHz link bandwidth for eDP, which for reasons ++ * unknown we fail to light up. Yet the same BIOS boots up with ++ * 24 bpp and 2.7 GHz link. Use the same bpp as the BIOS uses as ++ * max, not what it tells us to use. ++ * ++ * Note: This will still be broken if the eDP panel is not lit ++ * up by the BIOS, and thus we can't get the mode at module ++ * load. 
++ */ ++ drm_dbg_kms(&dev_priv->drm, ++ "pipe has %d bpp for eDP panel, overriding BIOS-provided max %d bpp\n", ++ pipe_bpp, connector->panel.vbt.edp.bpp); ++ connector->panel.vbt.edp.bpp = pipe_bpp; ++ } ++} ++ + static void intel_edp_mso_init(struct intel_dp *intel_dp) + { + struct drm_i915_private *i915 = dp_to_i915(intel_dp); +@@ -5212,8 +5240,10 @@ static bool intel_edp_init_connector(struct intel_dp *intel_dp, + } + intel_connector->edid = edid; + ++ intel_bios_init_panel(dev_priv, &intel_connector->panel); ++ + intel_panel_add_edid_fixed_modes(intel_connector, +- dev_priv->vbt.drrs_type != DRRS_TYPE_NONE); ++ intel_connector->panel.vbt.drrs_type != DRRS_TYPE_NONE); + + /* MSO requires information from the EDID */ + intel_edp_mso_init(intel_dp); +diff --git a/drivers/gpu/drm/i915/display/intel_dp.h b/drivers/gpu/drm/i915/display/intel_dp.h +index d457e17bdc57e..a54902c713a34 100644 +--- a/drivers/gpu/drm/i915/display/intel_dp.h ++++ b/drivers/gpu/drm/i915/display/intel_dp.h +@@ -29,6 +29,7 @@ struct link_config_limits { + int min_bpp, max_bpp; + }; + ++void intel_edp_fixup_vbt_bpp(struct intel_encoder *encoder, int pipe_bpp); + void intel_dp_adjust_compliance_config(struct intel_dp *intel_dp, + struct intel_crtc_state *pipe_config, + struct link_config_limits *limits); +@@ -63,6 +64,7 @@ enum irqreturn intel_dp_hpd_pulse(struct intel_digital_port *dig_port, + void intel_edp_backlight_on(const struct intel_crtc_state *crtc_state, + const struct drm_connector_state *conn_state); + void intel_edp_backlight_off(const struct drm_connector_state *conn_state); ++void intel_edp_fixup_vbt_bpp(struct intel_encoder *encoder, int pipe_bpp); + void intel_dp_mst_suspend(struct drm_i915_private *dev_priv); + void intel_dp_mst_resume(struct drm_i915_private *dev_priv); + int intel_dp_max_link_rate(struct intel_dp *intel_dp); +diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c +index fb6cf30ee6281..c92d5bb2326a3 100644 +--- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c ++++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c +@@ -370,7 +370,7 @@ static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector, + int ret; + + ret = drm_edp_backlight_init(&intel_dp->aux, &panel->backlight.edp.vesa.info, +- i915->vbt.backlight.pwm_freq_hz, intel_dp->edp_dpcd, ++ panel->vbt.backlight.pwm_freq_hz, intel_dp->edp_dpcd, + ¤t_level, ¤t_mode); + if (ret < 0) + return ret; +@@ -454,7 +454,7 @@ int intel_dp_aux_init_backlight_funcs(struct intel_connector *connector) + case INTEL_DP_AUX_BACKLIGHT_OFF: + return -ENODEV; + case INTEL_DP_AUX_BACKLIGHT_AUTO: +- switch (i915->vbt.backlight.type) { ++ switch (panel->vbt.backlight.type) { + case INTEL_BACKLIGHT_VESA_EDP_AUX_INTERFACE: + try_vesa_interface = true; + break; +@@ -466,7 +466,7 @@ int intel_dp_aux_init_backlight_funcs(struct intel_connector *connector) + } + break; + case INTEL_DP_AUX_BACKLIGHT_ON: +- if (i915->vbt.backlight.type != INTEL_BACKLIGHT_VESA_EDP_AUX_INTERFACE) ++ if (panel->vbt.backlight.type != INTEL_BACKLIGHT_VESA_EDP_AUX_INTERFACE) + try_intel_interface = true; + + try_vesa_interface = true; +diff --git a/drivers/gpu/drm/i915/display/intel_drrs.c b/drivers/gpu/drm/i915/display/intel_drrs.c +index 166caf293f7bc..7da4a9cbe4ba4 100644 +--- a/drivers/gpu/drm/i915/display/intel_drrs.c ++++ b/drivers/gpu/drm/i915/display/intel_drrs.c +@@ -217,9 +217,6 @@ static void intel_drrs_frontbuffer_update(struct drm_i915_private *dev_priv, + { + struct 
intel_crtc *crtc; + +- if (dev_priv->vbt.drrs_type != DRRS_TYPE_SEAMLESS) +- return; +- + for_each_intel_crtc(&dev_priv->drm, crtc) { + unsigned int frontbuffer_bits; + +diff --git a/drivers/gpu/drm/i915/display/intel_dsi.c b/drivers/gpu/drm/i915/display/intel_dsi.c +index 389a8c24cdc1e..35e121cd226c5 100644 +--- a/drivers/gpu/drm/i915/display/intel_dsi.c ++++ b/drivers/gpu/drm/i915/display/intel_dsi.c +@@ -102,7 +102,7 @@ intel_dsi_get_panel_orientation(struct intel_connector *connector) + struct drm_i915_private *dev_priv = to_i915(connector->base.dev); + enum drm_panel_orientation orientation; + +- orientation = dev_priv->vbt.dsi.orientation; ++ orientation = connector->panel.vbt.dsi.orientation; + if (orientation != DRM_MODE_PANEL_ORIENTATION_UNKNOWN) + return orientation; + +diff --git a/drivers/gpu/drm/i915/display/intel_dsi_dcs_backlight.c b/drivers/gpu/drm/i915/display/intel_dsi_dcs_backlight.c +index 7d234429e71ef..1bc7118c56a2a 100644 +--- a/drivers/gpu/drm/i915/display/intel_dsi_dcs_backlight.c ++++ b/drivers/gpu/drm/i915/display/intel_dsi_dcs_backlight.c +@@ -160,12 +160,10 @@ static void dcs_enable_backlight(const struct intel_crtc_state *crtc_state, + static int dcs_setup_backlight(struct intel_connector *connector, + enum pipe unused) + { +- struct drm_device *dev = connector->base.dev; +- struct drm_i915_private *dev_priv = to_i915(dev); + struct intel_panel *panel = &connector->panel; + +- if (dev_priv->vbt.backlight.brightness_precision_bits > 8) +- panel->backlight.max = (1 << dev_priv->vbt.backlight.brightness_precision_bits) - 1; ++ if (panel->vbt.backlight.brightness_precision_bits > 8) ++ panel->backlight.max = (1 << panel->vbt.backlight.brightness_precision_bits) - 1; + else + panel->backlight.max = PANEL_PWM_MAX_VALUE; + +@@ -185,11 +183,10 @@ static const struct intel_panel_bl_funcs dcs_bl_funcs = { + int intel_dsi_dcs_init_backlight_funcs(struct intel_connector *intel_connector) + { + struct drm_device *dev = intel_connector->base.dev; +- struct drm_i915_private *dev_priv = to_i915(dev); + struct intel_encoder *encoder = intel_attached_encoder(intel_connector); + struct intel_panel *panel = &intel_connector->panel; + +- if (dev_priv->vbt.backlight.type != INTEL_BACKLIGHT_DSI_DCS) ++ if (panel->vbt.backlight.type != INTEL_BACKLIGHT_DSI_DCS) + return -ENODEV; + + if (drm_WARN_ON(dev, encoder->type != INTEL_OUTPUT_DSI)) +diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +index dd24aef925f2e..75e8cc4337c93 100644 +--- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c ++++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +@@ -240,9 +240,10 @@ static const u8 *mipi_exec_delay(struct intel_dsi *intel_dsi, const u8 *data) + return data; + } + +-static void vlv_exec_gpio(struct drm_i915_private *dev_priv, ++static void vlv_exec_gpio(struct intel_connector *connector, + u8 gpio_source, u8 gpio_index, bool value) + { ++ struct drm_i915_private *dev_priv = to_i915(connector->base.dev); + struct gpio_map *map; + u16 pconf0, padval; + u32 tmp; +@@ -256,7 +257,7 @@ static void vlv_exec_gpio(struct drm_i915_private *dev_priv, + + map = &vlv_gpio_table[gpio_index]; + +- if (dev_priv->vbt.dsi.seq_version >= 3) { ++ if (connector->panel.vbt.dsi.seq_version >= 3) { + /* XXX: this assumes vlv_gpio_table only has NC GPIOs. 
*/ + port = IOSF_PORT_GPIO_NC; + } else { +@@ -287,14 +288,15 @@ static void vlv_exec_gpio(struct drm_i915_private *dev_priv, + vlv_iosf_sb_put(dev_priv, BIT(VLV_IOSF_SB_GPIO)); + } + +-static void chv_exec_gpio(struct drm_i915_private *dev_priv, ++static void chv_exec_gpio(struct intel_connector *connector, + u8 gpio_source, u8 gpio_index, bool value) + { ++ struct drm_i915_private *dev_priv = to_i915(connector->base.dev); + u16 cfg0, cfg1; + u16 family_num; + u8 port; + +- if (dev_priv->vbt.dsi.seq_version >= 3) { ++ if (connector->panel.vbt.dsi.seq_version >= 3) { + if (gpio_index >= CHV_GPIO_IDX_START_SE) { + /* XXX: it's unclear whether 255->57 is part of SE. */ + gpio_index -= CHV_GPIO_IDX_START_SE; +@@ -340,9 +342,10 @@ static void chv_exec_gpio(struct drm_i915_private *dev_priv, + vlv_iosf_sb_put(dev_priv, BIT(VLV_IOSF_SB_GPIO)); + } + +-static void bxt_exec_gpio(struct drm_i915_private *dev_priv, ++static void bxt_exec_gpio(struct intel_connector *connector, + u8 gpio_source, u8 gpio_index, bool value) + { ++ struct drm_i915_private *dev_priv = to_i915(connector->base.dev); + /* XXX: this table is a quick ugly hack. */ + static struct gpio_desc *bxt_gpio_table[U8_MAX + 1]; + struct gpio_desc *gpio_desc = bxt_gpio_table[gpio_index]; +@@ -366,9 +369,11 @@ static void bxt_exec_gpio(struct drm_i915_private *dev_priv, + gpiod_set_value(gpio_desc, value); + } + +-static void icl_exec_gpio(struct drm_i915_private *dev_priv, ++static void icl_exec_gpio(struct intel_connector *connector, + u8 gpio_source, u8 gpio_index, bool value) + { ++ struct drm_i915_private *dev_priv = to_i915(connector->base.dev); ++ + drm_dbg_kms(&dev_priv->drm, "Skipping ICL GPIO element execution\n"); + } + +@@ -376,18 +381,19 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data) + { + struct drm_device *dev = intel_dsi->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); ++ struct intel_connector *connector = intel_dsi->attached_connector; + u8 gpio_source, gpio_index = 0, gpio_number; + bool value; + + drm_dbg_kms(&dev_priv->drm, "\n"); + +- if (dev_priv->vbt.dsi.seq_version >= 3) ++ if (connector->panel.vbt.dsi.seq_version >= 3) + gpio_index = *data++; + + gpio_number = *data++; + + /* gpio source in sequence v2 only */ +- if (dev_priv->vbt.dsi.seq_version == 2) ++ if (connector->panel.vbt.dsi.seq_version == 2) + gpio_source = (*data >> 1) & 3; + else + gpio_source = 0; +@@ -396,13 +402,13 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data) + value = *data++ & 1; + + if (DISPLAY_VER(dev_priv) >= 11) +- icl_exec_gpio(dev_priv, gpio_source, gpio_index, value); ++ icl_exec_gpio(connector, gpio_source, gpio_index, value); + else if (IS_VALLEYVIEW(dev_priv)) +- vlv_exec_gpio(dev_priv, gpio_source, gpio_number, value); ++ vlv_exec_gpio(connector, gpio_source, gpio_number, value); + else if (IS_CHERRYVIEW(dev_priv)) +- chv_exec_gpio(dev_priv, gpio_source, gpio_number, value); ++ chv_exec_gpio(connector, gpio_source, gpio_number, value); + else +- bxt_exec_gpio(dev_priv, gpio_source, gpio_index, value); ++ bxt_exec_gpio(connector, gpio_source, gpio_index, value); + + return data; + } +@@ -585,14 +591,15 @@ static void intel_dsi_vbt_exec(struct intel_dsi *intel_dsi, + enum mipi_seq seq_id) + { + struct drm_i915_private *dev_priv = to_i915(intel_dsi->base.base.dev); ++ struct intel_connector *connector = intel_dsi->attached_connector; + const u8 *data; + fn_mipi_elem_exec mipi_elem_exec; + + if (drm_WARN_ON(&dev_priv->drm, +- seq_id >= 
ARRAY_SIZE(dev_priv->vbt.dsi.sequence))) ++ seq_id >= ARRAY_SIZE(connector->panel.vbt.dsi.sequence))) + return; + +- data = dev_priv->vbt.dsi.sequence[seq_id]; ++ data = connector->panel.vbt.dsi.sequence[seq_id]; + if (!data) + return; + +@@ -605,7 +612,7 @@ static void intel_dsi_vbt_exec(struct intel_dsi *intel_dsi, + data++; + + /* Skip Size of Sequence. */ +- if (dev_priv->vbt.dsi.seq_version >= 3) ++ if (connector->panel.vbt.dsi.seq_version >= 3) + data += 4; + + while (1) { +@@ -621,7 +628,7 @@ static void intel_dsi_vbt_exec(struct intel_dsi *intel_dsi, + mipi_elem_exec = NULL; + + /* Size of Operation. */ +- if (dev_priv->vbt.dsi.seq_version >= 3) ++ if (connector->panel.vbt.dsi.seq_version >= 3) + operation_size = *data++; + + if (mipi_elem_exec) { +@@ -669,10 +676,10 @@ void intel_dsi_vbt_exec_sequence(struct intel_dsi *intel_dsi, + + void intel_dsi_msleep(struct intel_dsi *intel_dsi, int msec) + { +- struct drm_i915_private *dev_priv = to_i915(intel_dsi->base.base.dev); ++ struct intel_connector *connector = intel_dsi->attached_connector; + + /* For v3 VBTs in vid-mode the delays are part of the VBT sequences */ +- if (is_vid_mode(intel_dsi) && dev_priv->vbt.dsi.seq_version >= 3) ++ if (is_vid_mode(intel_dsi) && connector->panel.vbt.dsi.seq_version >= 3) + return; + + msleep(msec); +@@ -734,9 +741,10 @@ bool intel_dsi_vbt_init(struct intel_dsi *intel_dsi, u16 panel_id) + { + struct drm_device *dev = intel_dsi->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); +- struct mipi_config *mipi_config = dev_priv->vbt.dsi.config; +- struct mipi_pps_data *pps = dev_priv->vbt.dsi.pps; +- struct drm_display_mode *mode = dev_priv->vbt.lfp_lvds_vbt_mode; ++ struct intel_connector *connector = intel_dsi->attached_connector; ++ struct mipi_config *mipi_config = connector->panel.vbt.dsi.config; ++ struct mipi_pps_data *pps = connector->panel.vbt.dsi.pps; ++ struct drm_display_mode *mode = connector->panel.vbt.lfp_lvds_vbt_mode; + u16 burst_mode_ratio; + enum port port; + +@@ -872,7 +880,8 @@ void intel_dsi_vbt_gpio_init(struct intel_dsi *intel_dsi, bool panel_is_on) + { + struct drm_device *dev = intel_dsi->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); +- struct mipi_config *mipi_config = dev_priv->vbt.dsi.config; ++ struct intel_connector *connector = intel_dsi->attached_connector; ++ struct mipi_config *mipi_config = connector->panel.vbt.dsi.config; + enum gpiod_flags flags = panel_is_on ? 
GPIOD_OUT_HIGH : GPIOD_OUT_LOW; + bool want_backlight_gpio = false; + bool want_panel_gpio = false; +@@ -927,7 +936,8 @@ void intel_dsi_vbt_gpio_cleanup(struct intel_dsi *intel_dsi) + { + struct drm_device *dev = intel_dsi->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); +- struct mipi_config *mipi_config = dev_priv->vbt.dsi.config; ++ struct intel_connector *connector = intel_dsi->attached_connector; ++ struct mipi_config *mipi_config = connector->panel.vbt.dsi.config; + + if (intel_dsi->gpio_panel) { + gpiod_put(intel_dsi->gpio_panel); +diff --git a/drivers/gpu/drm/i915/display/intel_lvds.c b/drivers/gpu/drm/i915/display/intel_lvds.c +index e8478161f8b9b..9f250a70519aa 100644 +--- a/drivers/gpu/drm/i915/display/intel_lvds.c ++++ b/drivers/gpu/drm/i915/display/intel_lvds.c +@@ -809,7 +809,7 @@ static bool compute_is_dual_link_lvds(struct intel_lvds_encoder *lvds_encoder) + else + val &= ~(LVDS_DETECTED | LVDS_PIPE_SEL_MASK); + if (val == 0) +- val = dev_priv->vbt.bios_lvds_val; ++ val = connector->panel.vbt.bios_lvds_val; + + return (val & LVDS_CLKB_POWER_MASK) == LVDS_CLKB_POWER_UP; + } +@@ -967,9 +967,11 @@ void intel_lvds_init(struct drm_i915_private *dev_priv) + } + intel_connector->edid = edid; + ++ intel_bios_init_panel(dev_priv, &intel_connector->panel); ++ + /* Try EDID first */ + intel_panel_add_edid_fixed_modes(intel_connector, +- dev_priv->vbt.drrs_type != DRRS_TYPE_NONE); ++ intel_connector->panel.vbt.drrs_type != DRRS_TYPE_NONE); + + /* Failed to get EDID, what about VBT? */ + if (!intel_panel_preferred_fixed_mode(intel_connector)) +diff --git a/drivers/gpu/drm/i915/display/intel_panel.c b/drivers/gpu/drm/i915/display/intel_panel.c +index d1d1b59102d69..d055e41185582 100644 +--- a/drivers/gpu/drm/i915/display/intel_panel.c ++++ b/drivers/gpu/drm/i915/display/intel_panel.c +@@ -75,9 +75,8 @@ const struct drm_display_mode * + intel_panel_downclock_mode(struct intel_connector *connector, + const struct drm_display_mode *adjusted_mode) + { +- struct drm_i915_private *i915 = to_i915(connector->base.dev); + const struct drm_display_mode *fixed_mode, *best_mode = NULL; +- int min_vrefresh = i915->vbt.seamless_drrs_min_refresh_rate; ++ int min_vrefresh = connector->panel.vbt.seamless_drrs_min_refresh_rate; + int max_vrefresh = drm_mode_vrefresh(adjusted_mode); + + /* pick the fixed_mode with the lowest refresh rate */ +@@ -113,13 +112,11 @@ int intel_panel_get_modes(struct intel_connector *connector) + + enum drrs_type intel_panel_drrs_type(struct intel_connector *connector) + { +- struct drm_i915_private *i915 = to_i915(connector->base.dev); +- + if (list_empty(&connector->panel.fixed_modes) || + list_is_singular(&connector->panel.fixed_modes)) + return DRRS_TYPE_NONE; + +- return i915->vbt.drrs_type; ++ return connector->panel.vbt.drrs_type; + } + + int intel_panel_compute_config(struct intel_connector *connector, +@@ -260,7 +257,7 @@ void intel_panel_add_vbt_lfp_fixed_mode(struct intel_connector *connector) + struct drm_i915_private *i915 = to_i915(connector->base.dev); + const struct drm_display_mode *mode; + +- mode = i915->vbt.lfp_lvds_vbt_mode; ++ mode = connector->panel.vbt.lfp_lvds_vbt_mode; + if (!mode) + return; + +@@ -274,7 +271,7 @@ void intel_panel_add_vbt_sdvo_fixed_mode(struct intel_connector *connector) + struct drm_i915_private *i915 = to_i915(connector->base.dev); + const struct drm_display_mode *mode; + +- mode = i915->vbt.sdvo_lvds_vbt_mode; ++ mode = connector->panel.vbt.sdvo_lvds_vbt_mode; + if (!mode) + return; + +@@ -639,6 +636,8 @@ void 
intel_panel_fini(struct intel_connector *connector) + + intel_backlight_destroy(panel); + ++ intel_bios_fini_panel(panel); ++ + list_for_each_entry_safe(fixed_mode, next, &panel->fixed_modes, head) { + list_del(&fixed_mode->head); + drm_mode_destroy(connector->base.dev, fixed_mode); +diff --git a/drivers/gpu/drm/i915/display/intel_pps.c b/drivers/gpu/drm/i915/display/intel_pps.c +index 5a598dd060391..a226e4e5c5698 100644 +--- a/drivers/gpu/drm/i915/display/intel_pps.c ++++ b/drivers/gpu/drm/i915/display/intel_pps.c +@@ -209,7 +209,8 @@ static int + bxt_power_sequencer_idx(struct intel_dp *intel_dp) + { + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); +- int backlight_controller = dev_priv->vbt.backlight.controller; ++ struct intel_connector *connector = intel_dp->attached_connector; ++ int backlight_controller = connector->panel.vbt.backlight.controller; + + lockdep_assert_held(&dev_priv->pps_mutex); + +@@ -1159,53 +1160,84 @@ intel_pps_verify_state(struct intel_dp *intel_dp) + } + } + +-static void pps_init_delays(struct intel_dp *intel_dp) ++static void pps_init_delays_cur(struct intel_dp *intel_dp, ++ struct edp_power_seq *cur) + { + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); +- struct edp_power_seq cur, vbt, spec, +- *final = &intel_dp->pps.pps_delays; + + lockdep_assert_held(&dev_priv->pps_mutex); + +- /* already initialized? */ +- if (final->t11_t12 != 0) +- return; ++ intel_pps_readout_hw_state(intel_dp, cur); ++ ++ intel_pps_dump_state(intel_dp, "cur", cur); ++} + +- intel_pps_readout_hw_state(intel_dp, &cur); ++static void pps_init_delays_vbt(struct intel_dp *intel_dp, ++ struct edp_power_seq *vbt) ++{ ++ struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); ++ struct intel_connector *connector = intel_dp->attached_connector; + +- intel_pps_dump_state(intel_dp, "cur", &cur); ++ *vbt = connector->panel.vbt.edp.pps; + +- vbt = dev_priv->vbt.edp.pps; + /* On Toshiba Satellite P50-C-18C system the VBT T12 delay + * of 500ms appears to be too short. Ocassionally the panel + * just fails to power back on. Increasing the delay to 800ms + * seems sufficient to avoid this problem. + */ + if (dev_priv->quirks & QUIRK_INCREASE_T12_DELAY) { +- vbt.t11_t12 = max_t(u16, vbt.t11_t12, 1300 * 10); ++ vbt->t11_t12 = max_t(u16, vbt->t11_t12, 1300 * 10); + drm_dbg_kms(&dev_priv->drm, + "Increasing T12 panel delay as per the quirk to %d\n", +- vbt.t11_t12); ++ vbt->t11_t12); + } ++ + /* T11_T12 delay is special and actually in units of 100ms, but zero + * based in the hw (so we need to add 100 ms). But the sw vbt + * table multiplies it with 1000 to make it in units of 100usec, + * too. */ +- vbt.t11_t12 += 100 * 10; ++ vbt->t11_t12 += 100 * 10; ++ ++ intel_pps_dump_state(intel_dp, "vbt", vbt); ++} ++ ++static void pps_init_delays_spec(struct intel_dp *intel_dp, ++ struct edp_power_seq *spec) ++{ ++ struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); ++ ++ lockdep_assert_held(&dev_priv->pps_mutex); + + /* Upper limits from eDP 1.3 spec. Note that we use the clunky units of + * our hw here, which are all in 100usec. 
*/ +- spec.t1_t3 = 210 * 10; +- spec.t8 = 50 * 10; /* no limit for t8, use t7 instead */ +- spec.t9 = 50 * 10; /* no limit for t9, make it symmetric with t8 */ +- spec.t10 = 500 * 10; ++ spec->t1_t3 = 210 * 10; ++ spec->t8 = 50 * 10; /* no limit for t8, use t7 instead */ ++ spec->t9 = 50 * 10; /* no limit for t9, make it symmetric with t8 */ ++ spec->t10 = 500 * 10; + /* This one is special and actually in units of 100ms, but zero + * based in the hw (so we need to add 100 ms). But the sw vbt + * table multiplies it with 1000 to make it in units of 100usec, + * too. */ +- spec.t11_t12 = (510 + 100) * 10; ++ spec->t11_t12 = (510 + 100) * 10; ++ ++ intel_pps_dump_state(intel_dp, "spec", spec); ++} ++ ++static void pps_init_delays(struct intel_dp *intel_dp) ++{ ++ struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); ++ struct edp_power_seq cur, vbt, spec, ++ *final = &intel_dp->pps.pps_delays; ++ ++ lockdep_assert_held(&dev_priv->pps_mutex); ++ ++ /* already initialized? */ ++ if (final->t11_t12 != 0) ++ return; + +- intel_pps_dump_state(intel_dp, "vbt", &vbt); ++ pps_init_delays_cur(intel_dp, &cur); ++ pps_init_delays_vbt(intel_dp, &vbt); ++ pps_init_delays_spec(intel_dp, &spec); + + /* Use the max of the register settings and vbt. If both are + * unset, fall back to the spec limits. */ +diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c +index 06db407e2749f..8f09203e0cf03 100644 +--- a/drivers/gpu/drm/i915/display/intel_psr.c ++++ b/drivers/gpu/drm/i915/display/intel_psr.c +@@ -86,10 +86,13 @@ + + static bool psr_global_enabled(struct intel_dp *intel_dp) + { ++ struct intel_connector *connector = intel_dp->attached_connector; + struct drm_i915_private *i915 = dp_to_i915(intel_dp); + + switch (intel_dp->psr.debug & I915_PSR_DEBUG_MODE_MASK) { + case I915_PSR_DEBUG_DEFAULT: ++ if (i915->params.enable_psr == -1) ++ return connector->panel.vbt.psr.enable; + return i915->params.enable_psr; + case I915_PSR_DEBUG_DISABLE: + return false; +@@ -399,6 +402,7 @@ static void intel_psr_enable_sink(struct intel_dp *intel_dp) + + static u32 intel_psr1_get_tp_time(struct intel_dp *intel_dp) + { ++ struct intel_connector *connector = intel_dp->attached_connector; + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + u32 val = 0; + +@@ -411,20 +415,20 @@ static u32 intel_psr1_get_tp_time(struct intel_dp *intel_dp) + goto check_tp3_sel; + } + +- if (dev_priv->vbt.psr.tp1_wakeup_time_us == 0) ++ if (connector->panel.vbt.psr.tp1_wakeup_time_us == 0) + val |= EDP_PSR_TP1_TIME_0us; +- else if (dev_priv->vbt.psr.tp1_wakeup_time_us <= 100) ++ else if (connector->panel.vbt.psr.tp1_wakeup_time_us <= 100) + val |= EDP_PSR_TP1_TIME_100us; +- else if (dev_priv->vbt.psr.tp1_wakeup_time_us <= 500) ++ else if (connector->panel.vbt.psr.tp1_wakeup_time_us <= 500) + val |= EDP_PSR_TP1_TIME_500us; + else + val |= EDP_PSR_TP1_TIME_2500us; + +- if (dev_priv->vbt.psr.tp2_tp3_wakeup_time_us == 0) ++ if (connector->panel.vbt.psr.tp2_tp3_wakeup_time_us == 0) + val |= EDP_PSR_TP2_TP3_TIME_0us; +- else if (dev_priv->vbt.psr.tp2_tp3_wakeup_time_us <= 100) ++ else if (connector->panel.vbt.psr.tp2_tp3_wakeup_time_us <= 100) + val |= EDP_PSR_TP2_TP3_TIME_100us; +- else if (dev_priv->vbt.psr.tp2_tp3_wakeup_time_us <= 500) ++ else if (connector->panel.vbt.psr.tp2_tp3_wakeup_time_us <= 500) + val |= EDP_PSR_TP2_TP3_TIME_500us; + else + val |= EDP_PSR_TP2_TP3_TIME_2500us; +@@ -441,13 +445,14 @@ check_tp3_sel: + + static u8 psr_compute_idle_frames(struct intel_dp *intel_dp) + { ++ 
struct intel_connector *connector = intel_dp->attached_connector; + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + int idle_frames; + + /* Let's use 6 as the minimum to cover all known cases including the + * off-by-one issue that HW has in some cases. + */ +- idle_frames = max(6, dev_priv->vbt.psr.idle_frames); ++ idle_frames = max(6, connector->panel.vbt.psr.idle_frames); + idle_frames = max(idle_frames, intel_dp->psr.sink_sync_latency + 1); + + if (drm_WARN_ON(&dev_priv->drm, idle_frames > 0xf)) +@@ -483,18 +488,19 @@ static void hsw_activate_psr1(struct intel_dp *intel_dp) + + static u32 intel_psr2_get_tp_time(struct intel_dp *intel_dp) + { ++ struct intel_connector *connector = intel_dp->attached_connector; + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + u32 val = 0; + + if (dev_priv->params.psr_safest_params) + return EDP_PSR2_TP2_TIME_2500us; + +- if (dev_priv->vbt.psr.psr2_tp2_tp3_wakeup_time_us >= 0 && +- dev_priv->vbt.psr.psr2_tp2_tp3_wakeup_time_us <= 50) ++ if (connector->panel.vbt.psr.psr2_tp2_tp3_wakeup_time_us >= 0 && ++ connector->panel.vbt.psr.psr2_tp2_tp3_wakeup_time_us <= 50) + val |= EDP_PSR2_TP2_TIME_50us; +- else if (dev_priv->vbt.psr.psr2_tp2_tp3_wakeup_time_us <= 100) ++ else if (connector->panel.vbt.psr.psr2_tp2_tp3_wakeup_time_us <= 100) + val |= EDP_PSR2_TP2_TIME_100us; +- else if (dev_priv->vbt.psr.psr2_tp2_tp3_wakeup_time_us <= 500) ++ else if (connector->panel.vbt.psr.psr2_tp2_tp3_wakeup_time_us <= 500) + val |= EDP_PSR2_TP2_TIME_500us; + else + val |= EDP_PSR2_TP2_TIME_2500us; +@@ -2344,6 +2350,7 @@ unlock: + */ + void intel_psr_init(struct intel_dp *intel_dp) + { ++ struct intel_connector *connector = intel_dp->attached_connector; + struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp); + struct drm_i915_private *dev_priv = dp_to_i915(intel_dp); + +@@ -2367,14 +2374,10 @@ void intel_psr_init(struct intel_dp *intel_dp) + + intel_dp->psr.source_support = true; + +- if (dev_priv->params.enable_psr == -1) +- if (!dev_priv->vbt.psr.enable) +- dev_priv->params.enable_psr = 0; +- + /* Set link_standby x link_off defaults */ + if (DISPLAY_VER(dev_priv) < 12) + /* For new platforms up to TGL let's respect VBT back again */ +- intel_dp->psr.link_standby = dev_priv->vbt.psr.full_link; ++ intel_dp->psr.link_standby = connector->panel.vbt.psr.full_link; + + INIT_WORK(&intel_dp->psr.work, intel_psr_work); + INIT_DELAYED_WORK(&intel_dp->psr.dc3co_work, tgl_dc3co_disable_work); +diff --git a/drivers/gpu/drm/i915/display/intel_sdvo.c b/drivers/gpu/drm/i915/display/intel_sdvo.c +index d81855d57cdc9..14a64bd61176d 100644 +--- a/drivers/gpu/drm/i915/display/intel_sdvo.c ++++ b/drivers/gpu/drm/i915/display/intel_sdvo.c +@@ -2869,6 +2869,7 @@ static bool + intel_sdvo_lvds_init(struct intel_sdvo *intel_sdvo, int device) + { + struct drm_encoder *encoder = &intel_sdvo->base.base; ++ struct drm_i915_private *i915 = to_i915(encoder->dev); + struct drm_connector *connector; + struct intel_connector *intel_connector; + struct intel_sdvo_connector *intel_sdvo_connector; +@@ -2900,6 +2901,8 @@ intel_sdvo_lvds_init(struct intel_sdvo *intel_sdvo, int device) + if (!intel_sdvo_create_enhance_property(intel_sdvo, intel_sdvo_connector)) + goto err; + ++ intel_bios_init_panel(i915, &intel_connector->panel); ++ + /* + * Fetch modes from VBT. For SDVO prefer the VBT mode since some + * SDVO->LVDS transcoders can't cope with the EDID mode. 
+diff --git a/drivers/gpu/drm/i915/display/vlv_dsi.c b/drivers/gpu/drm/i915/display/vlv_dsi.c +index 1954f07f0d3ec..02f75e95b2ec1 100644 +--- a/drivers/gpu/drm/i915/display/vlv_dsi.c ++++ b/drivers/gpu/drm/i915/display/vlv_dsi.c +@@ -782,6 +782,7 @@ static void intel_dsi_pre_enable(struct intel_atomic_state *state, + { + struct intel_dsi *intel_dsi = enc_to_intel_dsi(encoder); + struct intel_crtc *crtc = to_intel_crtc(pipe_config->uapi.crtc); ++ struct intel_connector *connector = to_intel_connector(conn_state->connector); + struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); + enum pipe pipe = crtc->pipe; + enum port port; +@@ -838,7 +839,7 @@ static void intel_dsi_pre_enable(struct intel_atomic_state *state, + * the delay in that case. If there is no deassert-seq, then an + * unconditional msleep is used to give the panel time to power-on. + */ +- if (dev_priv->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET]) { ++ if (connector->panel.vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET]) { + intel_dsi_msleep(intel_dsi, intel_dsi->panel_on_delay); + intel_dsi_vbt_exec_sequence(intel_dsi, MIPI_SEQ_DEASSERT_RESET); + } else { +@@ -1690,7 +1691,8 @@ static void vlv_dphy_param_init(struct intel_dsi *intel_dsi) + { + struct drm_device *dev = intel_dsi->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); +- struct mipi_config *mipi_config = dev_priv->vbt.dsi.config; ++ struct intel_connector *connector = intel_dsi->attached_connector; ++ struct mipi_config *mipi_config = connector->panel.vbt.dsi.config; + u32 tlpx_ns, extra_byte_count, tlpx_ui; + u32 ui_num, ui_den; + u32 prepare_cnt, exit_zero_cnt, clk_zero_cnt, trail_cnt; +@@ -1924,13 +1926,22 @@ void vlv_dsi_init(struct drm_i915_private *dev_priv) + + intel_dsi->panel_power_off_time = ktime_get_boottime(); + +- if (dev_priv->vbt.dsi.config->dual_link) ++ intel_bios_init_panel(dev_priv, &intel_connector->panel); ++ ++ if (intel_connector->panel.vbt.dsi.config->dual_link) + intel_dsi->ports = BIT(PORT_A) | BIT(PORT_C); + else + intel_dsi->ports = BIT(port); + +- intel_dsi->dcs_backlight_ports = dev_priv->vbt.dsi.bl_ports; +- intel_dsi->dcs_cabc_ports = dev_priv->vbt.dsi.cabc_ports; ++ if (drm_WARN_ON(&dev_priv->drm, intel_connector->panel.vbt.dsi.bl_ports & ~intel_dsi->ports)) ++ intel_connector->panel.vbt.dsi.bl_ports &= intel_dsi->ports; ++ ++ intel_dsi->dcs_backlight_ports = intel_connector->panel.vbt.dsi.bl_ports; ++ ++ if (drm_WARN_ON(&dev_priv->drm, intel_connector->panel.vbt.dsi.cabc_ports & ~intel_dsi->ports)) ++ intel_connector->panel.vbt.dsi.cabc_ports &= intel_dsi->ports; ++ ++ intel_dsi->dcs_cabc_ports = intel_connector->panel.vbt.dsi.cabc_ports; + + /* Create a DSI host (and a device) for each port. 
*/ + for_each_dsi_port(port, intel_dsi->ports) { +diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c +index 321af109d484f..8da42af0256ab 100644 +--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c ++++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c +@@ -1269,6 +1269,10 @@ static void i915_gem_context_release_work(struct work_struct *work) + trace_i915_context_free(ctx); + GEM_BUG_ON(!i915_gem_context_is_closed(ctx)); + ++ spin_lock(&ctx->i915->gem.contexts.lock); ++ list_del(&ctx->link); ++ spin_unlock(&ctx->i915->gem.contexts.lock); ++ + if (ctx->syncobj) + drm_syncobj_put(ctx->syncobj); + +@@ -1514,10 +1518,6 @@ static void context_close(struct i915_gem_context *ctx) + + ctx->file_priv = ERR_PTR(-EBADF); + +- spin_lock(&ctx->i915->gem.contexts.lock); +- list_del(&ctx->link); +- spin_unlock(&ctx->i915->gem.contexts.lock); +- + client = ctx->client; + if (client) { + spin_lock(&client->ctx_lock); +diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h +index 5184d70d48382..554d79bc0312d 100644 +--- a/drivers/gpu/drm/i915/i915_drv.h ++++ b/drivers/gpu/drm/i915/i915_drv.h +@@ -194,12 +194,6 @@ struct drm_i915_display_funcs { + + #define I915_COLOR_UNEVICTABLE (-1) /* a non-vma sharing the address space */ + +-enum drrs_type { +- DRRS_TYPE_NONE, +- DRRS_TYPE_STATIC, +- DRRS_TYPE_SEAMLESS, +-}; +- + #define QUIRK_LVDS_SSC_DISABLE (1<<1) + #define QUIRK_INVERT_BRIGHTNESS (1<<2) + #define QUIRK_BACKLIGHT_PRESENT (1<<3) +@@ -308,76 +302,19 @@ struct intel_vbt_data { + /* bdb version */ + u16 version; + +- struct drm_display_mode *lfp_lvds_vbt_mode; /* if any */ +- struct drm_display_mode *sdvo_lvds_vbt_mode; /* if any */ +- + /* Feature bits */ + unsigned int int_tv_support:1; +- unsigned int lvds_dither:1; + unsigned int int_crt_support:1; + unsigned int lvds_use_ssc:1; + unsigned int int_lvds_support:1; + unsigned int display_clock_mode:1; + unsigned int fdi_rx_polarity_inverted:1; +- unsigned int panel_type:4; + int lvds_ssc_freq; +- unsigned int bios_lvds_val; /* initial [PCH_]LVDS reg val in VBIOS */ + enum drm_panel_orientation orientation; + + bool override_afc_startup; + u8 override_afc_startup_val; + +- u8 seamless_drrs_min_refresh_rate; +- enum drrs_type drrs_type; +- +- struct { +- int rate; +- int lanes; +- int preemphasis; +- int vswing; +- int bpp; +- struct edp_power_seq pps; +- u8 drrs_msa_timing_delay; +- bool low_vswing; +- bool initialized; +- bool hobl; +- } edp; +- +- struct { +- bool enable; +- bool full_link; +- bool require_aux_wakeup; +- int idle_frames; +- int tp1_wakeup_time_us; +- int tp2_tp3_wakeup_time_us; +- int psr2_tp2_tp3_wakeup_time_us; +- } psr; +- +- struct { +- u16 pwm_freq_hz; +- u16 brightness_precision_bits; +- bool present; +- bool active_low_pwm; +- u8 min_brightness; /* min_brightness/255 of max */ +- u8 controller; /* brightness controller number */ +- enum intel_backlight_type type; +- } backlight; +- +- /* MIPI DSI */ +- struct { +- u16 panel_id; +- struct mipi_config *config; +- struct mipi_pps_data *pps; +- u16 bl_ports; +- u16 cabc_ports; +- u8 seq_version; +- u32 size; +- u8 *data; +- const u8 *sequence[MIPI_SEQ_MAX]; +- u8 *deassert_seq; /* Used by fixup_mipi_sequences() */ +- enum drm_panel_orientation orientation; +- } dsi; +- + int crt_ddc_pin; + + struct list_head display_devices; +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 702e5b89be226..b605d0ceaefad 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ 
b/drivers/gpu/drm/i915/i915_gem.c +@@ -1191,7 +1191,8 @@ void i915_gem_driver_release(struct drm_i915_private *dev_priv) + + intel_uc_cleanup_firmwares(&to_gt(dev_priv)->uc); + +- i915_gem_drain_freed_objects(dev_priv); ++ /* Flush any outstanding work, including i915_gem_context.release_work. */ ++ i915_gem_drain_workqueue(dev_priv); + + drm_WARN_ON(&dev_priv->drm, !list_empty(&dev_priv->gem.contexts.list)); + } +diff --git a/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.c b/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.c +index 5d7504a72b11c..e244aa408d9d4 100644 +--- a/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.c ++++ b/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.c +@@ -151,7 +151,7 @@ static void mtk_dither_config(struct device *dev, unsigned int w, + { + struct mtk_ddp_comp_dev *priv = dev_get_drvdata(dev); + +- mtk_ddp_write(cmdq_pkt, h << 16 | w, &priv->cmdq_reg, priv->regs, DISP_REG_DITHER_SIZE); ++ mtk_ddp_write(cmdq_pkt, w << 16 | h, &priv->cmdq_reg, priv->regs, DISP_REG_DITHER_SIZE); + mtk_ddp_write(cmdq_pkt, DITHER_RELAY_MODE, &priv->cmdq_reg, priv->regs, + DISP_REG_DITHER_CFG); + mtk_dither_set_common(priv->regs, &priv->cmdq_reg, bpc, DISP_REG_DITHER_CFG, +diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c +index af2f123e9a9a9..9a3b86c29b503 100644 +--- a/drivers/gpu/drm/mediatek/mtk_dsi.c ++++ b/drivers/gpu/drm/mediatek/mtk_dsi.c +@@ -685,6 +685,16 @@ static void mtk_dsi_poweroff(struct mtk_dsi *dsi) + if (--dsi->refcount != 0) + return; + ++ /* ++ * mtk_dsi_stop() and mtk_dsi_start() is asymmetric, since ++ * mtk_dsi_stop() should be called after mtk_drm_crtc_atomic_disable(), ++ * which needs irq for vblank, and mtk_dsi_stop() will disable irq. ++ * mtk_dsi_start() needs to be called in mtk_output_dsi_enable(), ++ * after dsi is fully set. ++ */ ++ mtk_dsi_stop(dsi); ++ ++ mtk_dsi_switch_to_cmd_mode(dsi, VM_DONE_INT_FLAG, 500); + mtk_dsi_reset_engine(dsi); + mtk_dsi_lane0_ulp_mode_enter(dsi); + mtk_dsi_clk_ulp_mode_enter(dsi); +@@ -735,17 +745,6 @@ static void mtk_output_dsi_disable(struct mtk_dsi *dsi) + if (!dsi->enabled) + return; + +- /* +- * mtk_dsi_stop() and mtk_dsi_start() is asymmetric, since +- * mtk_dsi_stop() should be called after mtk_drm_crtc_atomic_disable(), +- * which needs irq for vblank, and mtk_dsi_stop() will disable irq. +- * mtk_dsi_start() needs to be called in mtk_output_dsi_enable(), +- * after dsi is fully set. 
+- */ +- mtk_dsi_stop(dsi); +- +- mtk_dsi_switch_to_cmd_mode(dsi, VM_DONE_INT_FLAG, 500); +- + dsi->enabled = false; + } + +@@ -808,10 +807,13 @@ static void mtk_dsi_bridge_atomic_post_disable(struct drm_bridge *bridge, + + static const struct drm_bridge_funcs mtk_dsi_bridge_funcs = { + .attach = mtk_dsi_bridge_attach, ++ .atomic_destroy_state = drm_atomic_helper_bridge_destroy_state, + .atomic_disable = mtk_dsi_bridge_atomic_disable, ++ .atomic_duplicate_state = drm_atomic_helper_bridge_duplicate_state, + .atomic_enable = mtk_dsi_bridge_atomic_enable, + .atomic_pre_enable = mtk_dsi_bridge_atomic_pre_enable, + .atomic_post_disable = mtk_dsi_bridge_atomic_post_disable, ++ .atomic_reset = drm_atomic_helper_bridge_reset, + .mode_set = mtk_dsi_bridge_mode_set, + }; + +diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c +index 4a2e580a2f7b7..0e001ce8a40fd 100644 +--- a/drivers/gpu/drm/panel/panel-simple.c ++++ b/drivers/gpu/drm/panel/panel-simple.c +@@ -2136,7 +2136,7 @@ static const struct panel_desc innolux_g121i1_l01 = { + .enable = 200, + .disable = 20, + }, +- .bus_format = MEDIA_BUS_FMT_RGB888_1X7X4_SPWG, ++ .bus_format = MEDIA_BUS_FMT_RGB666_1X7X3_SPWG, + .connector_type = DRM_MODE_CONNECTOR_LVDS, + }; + +diff --git a/drivers/gpu/drm/rockchip/cdn-dp-core.c b/drivers/gpu/drm/rockchip/cdn-dp-core.c +index c204e9b95c1f7..518ee13b1d6f4 100644 +--- a/drivers/gpu/drm/rockchip/cdn-dp-core.c ++++ b/drivers/gpu/drm/rockchip/cdn-dp-core.c +@@ -283,8 +283,9 @@ static int cdn_dp_connector_get_modes(struct drm_connector *connector) + return ret; + } + +-static int cdn_dp_connector_mode_valid(struct drm_connector *connector, +- struct drm_display_mode *mode) ++static enum drm_mode_status ++cdn_dp_connector_mode_valid(struct drm_connector *connector, ++ struct drm_display_mode *mode) + { + struct cdn_dp_device *dp = connector_to_dp(connector); + struct drm_display_info *display_info = &dp->connector.display_info; +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 547ae334e5cd8..027029efb0088 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -2309,7 +2309,7 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, + bool fb_overlap_ok) + { + struct resource *iter, *shadow; +- resource_size_t range_min, range_max, start; ++ resource_size_t range_min, range_max, start, end; + const char *dev_n = dev_name(&device_obj->device); + int retval; + +@@ -2344,6 +2344,14 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, + range_max = iter->end; + start = (range_min + align - 1) & ~(align - 1); + for (; start + size - 1 <= range_max; start += align) { ++ end = start + size - 1; ++ ++ /* Skip the whole fb_mmio region if not fb_overlap_ok */ ++ if (!fb_overlap_ok && fb_mmio && ++ (((start >= fb_mmio->start) && (start <= fb_mmio->end)) || ++ ((end >= fb_mmio->start) && (end <= fb_mmio->end)))) ++ continue; ++ + shadow = __request_region(iter, start, size, NULL, + IORESOURCE_BUSY); + if (!shadow) +diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c +index e47fa34656717..3082183bd66a4 100644 +--- a/drivers/i2c/busses/i2c-imx.c ++++ b/drivers/i2c/busses/i2c-imx.c +@@ -1583,7 +1583,7 @@ static int i2c_imx_remove(struct platform_device *pdev) + if (i2c_imx->dma) + i2c_imx_dma_free(i2c_imx); + +- if (ret == 0) { ++ if (ret >= 0) { + /* setup chip registers to defaults */ + imx_i2c_write_reg(0, i2c_imx, IMX_I2C_IADR); + imx_i2c_write_reg(0, i2c_imx, IMX_I2C_IFDR); +diff --git 
a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c +index 8716032f030a0..ad5efd7497d1c 100644 +--- a/drivers/i2c/busses/i2c-mlxbf.c ++++ b/drivers/i2c/busses/i2c-mlxbf.c +@@ -6,6 +6,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -63,13 +64,14 @@ + */ + #define MLXBF_I2C_TYU_PLL_OUT_FREQ (400 * 1000 * 1000) + /* Reference clock for Bluefield - 156 MHz. */ +-#define MLXBF_I2C_PLL_IN_FREQ (156 * 1000 * 1000) ++#define MLXBF_I2C_PLL_IN_FREQ 156250000ULL + + /* Constant used to determine the PLL frequency. */ +-#define MLNXBF_I2C_COREPLL_CONST 16384 ++#define MLNXBF_I2C_COREPLL_CONST 16384ULL ++ ++#define MLXBF_I2C_FREQUENCY_1GHZ 1000000000ULL + + /* PLL registers. */ +-#define MLXBF_I2C_CORE_PLL_REG0 0x0 + #define MLXBF_I2C_CORE_PLL_REG1 0x4 + #define MLXBF_I2C_CORE_PLL_REG2 0x8 + +@@ -181,22 +183,15 @@ + #define MLXBF_I2C_COREPLL_FREQ MLXBF_I2C_TYU_PLL_OUT_FREQ + + /* Core PLL TYU configuration. */ +-#define MLXBF_I2C_COREPLL_CORE_F_TYU_MASK GENMASK(12, 0) +-#define MLXBF_I2C_COREPLL_CORE_OD_TYU_MASK GENMASK(3, 0) +-#define MLXBF_I2C_COREPLL_CORE_R_TYU_MASK GENMASK(5, 0) +- +-#define MLXBF_I2C_COREPLL_CORE_F_TYU_SHIFT 3 +-#define MLXBF_I2C_COREPLL_CORE_OD_TYU_SHIFT 16 +-#define MLXBF_I2C_COREPLL_CORE_R_TYU_SHIFT 20 ++#define MLXBF_I2C_COREPLL_CORE_F_TYU_MASK GENMASK(15, 3) ++#define MLXBF_I2C_COREPLL_CORE_OD_TYU_MASK GENMASK(19, 16) ++#define MLXBF_I2C_COREPLL_CORE_R_TYU_MASK GENMASK(25, 20) + + /* Core PLL YU configuration. */ + #define MLXBF_I2C_COREPLL_CORE_F_YU_MASK GENMASK(25, 0) + #define MLXBF_I2C_COREPLL_CORE_OD_YU_MASK GENMASK(3, 0) +-#define MLXBF_I2C_COREPLL_CORE_R_YU_MASK GENMASK(5, 0) ++#define MLXBF_I2C_COREPLL_CORE_R_YU_MASK GENMASK(31, 26) + +-#define MLXBF_I2C_COREPLL_CORE_F_YU_SHIFT 0 +-#define MLXBF_I2C_COREPLL_CORE_OD_YU_SHIFT 1 +-#define MLXBF_I2C_COREPLL_CORE_R_YU_SHIFT 26 + + /* Core PLL frequency. */ + static u64 mlxbf_i2c_corepll_frequency; +@@ -479,8 +474,6 @@ static struct mutex mlxbf_i2c_bus_lock; + #define MLXBF_I2C_MASK_8 GENMASK(7, 0) + #define MLXBF_I2C_MASK_16 GENMASK(15, 0) + +-#define MLXBF_I2C_FREQUENCY_1GHZ 1000000000 +- + /* + * Function to poll a set of bits at a specific address; it checks whether + * the bits are equal to zero when eq_zero is set to 'true', and not equal +@@ -669,7 +662,7 @@ static int mlxbf_i2c_smbus_enable(struct mlxbf_i2c_priv *priv, u8 slave, + /* Clear status bits. */ + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_MASTER_STATUS); + /* Set the cause data. */ +- writel(~0x0, priv->smbus->io + MLXBF_I2C_CAUSE_OR_CLEAR); ++ writel(~0x0, priv->mst_cause->io + MLXBF_I2C_CAUSE_OR_CLEAR); + /* Zero PEC byte. */ + writel(0x0, priv->smbus->io + MLXBF_I2C_SMBUS_MASTER_PEC); + /* Zero byte count. 
*/ +@@ -738,6 +731,9 @@ mlxbf_i2c_smbus_start_transaction(struct mlxbf_i2c_priv *priv, + if (flags & MLXBF_I2C_F_WRITE) { + write_en = 1; + write_len += operation->length; ++ if (data_idx + operation->length > ++ MLXBF_I2C_MASTER_DATA_DESC_SIZE) ++ return -ENOBUFS; + memcpy(data_desc + data_idx, + operation->buffer, operation->length); + data_idx += operation->length; +@@ -1407,24 +1403,19 @@ static int mlxbf_i2c_init_master(struct platform_device *pdev, + return 0; + } + +-static u64 mlxbf_calculate_freq_from_tyu(struct mlxbf_i2c_resource *corepll_res) ++static u64 mlxbf_i2c_calculate_freq_from_tyu(struct mlxbf_i2c_resource *corepll_res) + { +- u64 core_frequency, pad_frequency; ++ u64 core_frequency; + u8 core_od, core_r; + u32 corepll_val; + u16 core_f; + +- pad_frequency = MLXBF_I2C_PLL_IN_FREQ; +- + corepll_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG1); + + /* Get Core PLL configuration bits. */ +- core_f = rol32(corepll_val, MLXBF_I2C_COREPLL_CORE_F_TYU_SHIFT) & +- MLXBF_I2C_COREPLL_CORE_F_TYU_MASK; +- core_od = rol32(corepll_val, MLXBF_I2C_COREPLL_CORE_OD_TYU_SHIFT) & +- MLXBF_I2C_COREPLL_CORE_OD_TYU_MASK; +- core_r = rol32(corepll_val, MLXBF_I2C_COREPLL_CORE_R_TYU_SHIFT) & +- MLXBF_I2C_COREPLL_CORE_R_TYU_MASK; ++ core_f = FIELD_GET(MLXBF_I2C_COREPLL_CORE_F_TYU_MASK, corepll_val); ++ core_od = FIELD_GET(MLXBF_I2C_COREPLL_CORE_OD_TYU_MASK, corepll_val); ++ core_r = FIELD_GET(MLXBF_I2C_COREPLL_CORE_R_TYU_MASK, corepll_val); + + /* + * Compute PLL output frequency as follow: +@@ -1436,31 +1427,26 @@ static u64 mlxbf_calculate_freq_from_tyu(struct mlxbf_i2c_resource *corepll_res) + * Where PLL_OUT_FREQ and PLL_IN_FREQ refer to CoreFrequency + * and PadFrequency, respectively. + */ +- core_frequency = pad_frequency * (++core_f); ++ core_frequency = MLXBF_I2C_PLL_IN_FREQ * (++core_f); + core_frequency /= (++core_r) * (++core_od); + + return core_frequency; + } + +-static u64 mlxbf_calculate_freq_from_yu(struct mlxbf_i2c_resource *corepll_res) ++static u64 mlxbf_i2c_calculate_freq_from_yu(struct mlxbf_i2c_resource *corepll_res) + { + u32 corepll_reg1_val, corepll_reg2_val; +- u64 corepll_frequency, pad_frequency; ++ u64 corepll_frequency; + u8 core_od, core_r; + u32 core_f; + +- pad_frequency = MLXBF_I2C_PLL_IN_FREQ; +- + corepll_reg1_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG1); + corepll_reg2_val = readl(corepll_res->io + MLXBF_I2C_CORE_PLL_REG2); + + /* Get Core PLL configuration bits */ +- core_f = rol32(corepll_reg1_val, MLXBF_I2C_COREPLL_CORE_F_YU_SHIFT) & +- MLXBF_I2C_COREPLL_CORE_F_YU_MASK; +- core_r = rol32(corepll_reg1_val, MLXBF_I2C_COREPLL_CORE_R_YU_SHIFT) & +- MLXBF_I2C_COREPLL_CORE_R_YU_MASK; +- core_od = rol32(corepll_reg2_val, MLXBF_I2C_COREPLL_CORE_OD_YU_SHIFT) & +- MLXBF_I2C_COREPLL_CORE_OD_YU_MASK; ++ core_f = FIELD_GET(MLXBF_I2C_COREPLL_CORE_F_YU_MASK, corepll_reg1_val); ++ core_r = FIELD_GET(MLXBF_I2C_COREPLL_CORE_R_YU_MASK, corepll_reg1_val); ++ core_od = FIELD_GET(MLXBF_I2C_COREPLL_CORE_OD_YU_MASK, corepll_reg2_val); + + /* + * Compute PLL output frequency as follow: +@@ -1472,7 +1458,7 @@ static u64 mlxbf_calculate_freq_from_yu(struct mlxbf_i2c_resource *corepll_res) + * Where PLL_OUT_FREQ and PLL_IN_FREQ refer to CoreFrequency + * and PadFrequency, respectively. 
+ */ +- corepll_frequency = (pad_frequency * core_f) / MLNXBF_I2C_COREPLL_CONST; ++ corepll_frequency = (MLXBF_I2C_PLL_IN_FREQ * core_f) / MLNXBF_I2C_COREPLL_CONST; + corepll_frequency /= (++core_r) * (++core_od); + + return corepll_frequency; +@@ -2180,14 +2166,14 @@ static struct mlxbf_i2c_chip_info mlxbf_i2c_chip[] = { + [1] = &mlxbf_i2c_corepll_res[MLXBF_I2C_CHIP_TYPE_1], + [2] = &mlxbf_i2c_gpio_res[MLXBF_I2C_CHIP_TYPE_1] + }, +- .calculate_freq = mlxbf_calculate_freq_from_tyu ++ .calculate_freq = mlxbf_i2c_calculate_freq_from_tyu + }, + [MLXBF_I2C_CHIP_TYPE_2] = { + .type = MLXBF_I2C_CHIP_TYPE_2, + .shared_res = { + [0] = &mlxbf_i2c_corepll_res[MLXBF_I2C_CHIP_TYPE_2] + }, +- .calculate_freq = mlxbf_calculate_freq_from_yu ++ .calculate_freq = mlxbf_i2c_calculate_freq_from_yu + } + }; + +diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c +index 774507b54b57b..313904be5f3bd 100644 +--- a/drivers/i2c/i2c-mux.c ++++ b/drivers/i2c/i2c-mux.c +@@ -243,9 +243,10 @@ struct i2c_mux_core *i2c_mux_alloc(struct i2c_adapter *parent, + int (*deselect)(struct i2c_mux_core *, u32)) + { + struct i2c_mux_core *muxc; ++ size_t mux_size; + +- muxc = devm_kzalloc(dev, struct_size(muxc, adapter, max_adapters) +- + sizeof_priv, GFP_KERNEL); ++ mux_size = struct_size(muxc, adapter, max_adapters); ++ muxc = devm_kzalloc(dev, size_add(mux_size, sizeof_priv), GFP_KERNEL); + if (!muxc) + return NULL; + if (sizeof_priv) +diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c +index 861a239d905a4..3ed15e8ca6775 100644 +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -419,7 +419,7 @@ static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) + { + unsigned long fl_sagaw, sl_sagaw; + +- fl_sagaw = BIT(2) | (cap_fl1gp_support(iommu->cap) ? BIT(3) : 0); ++ fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0); + sl_sagaw = cap_sagaw(iommu->cap); + + /* Second level only. 
*/ +diff --git a/drivers/media/usb/b2c2/flexcop-usb.c b/drivers/media/usb/b2c2/flexcop-usb.c +index 7835bb0f32fc3..e012b21c4fd7a 100644 +--- a/drivers/media/usb/b2c2/flexcop-usb.c ++++ b/drivers/media/usb/b2c2/flexcop-usb.c +@@ -511,7 +511,7 @@ static int flexcop_usb_init(struct flexcop_usb *fc_usb) + + if (fc_usb->uintf->cur_altsetting->desc.bNumEndpoints < 1) + return -ENODEV; +- if (!usb_endpoint_is_isoc_in(&fc_usb->uintf->cur_altsetting->endpoint[1].desc)) ++ if (!usb_endpoint_is_isoc_in(&fc_usb->uintf->cur_altsetting->endpoint[0].desc)) + return -ENODEV; + + switch (fc_usb->udev->speed) { +diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c +index f8fdf88fb240c..ecbc46714e681 100644 +--- a/drivers/memstick/core/ms_block.c ++++ b/drivers/memstick/core/ms_block.c +@@ -2188,7 +2188,6 @@ static void msb_remove(struct memstick_dev *card) + + /* Remove the disk */ + del_gendisk(msb->disk); +- blk_cleanup_queue(msb->queue); + blk_mq_free_tag_set(&msb->tag_set); + msb->queue = NULL; + +diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c +index 725ba74ded308..72e91c06c618b 100644 +--- a/drivers/memstick/core/mspro_block.c ++++ b/drivers/memstick/core/mspro_block.c +@@ -1294,7 +1294,6 @@ static void mspro_block_remove(struct memstick_dev *card) + del_gendisk(msb->disk); + dev_dbg(&card->dev, "mspro block remove\n"); + +- blk_cleanup_queue(msb->queue); + blk_mq_free_tag_set(&msb->tag_set); + msb->queue = NULL; + +diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c +index 912a398a9a764..2f89ae55c1773 100644 +--- a/drivers/mmc/core/block.c ++++ b/drivers/mmc/core/block.c +@@ -2509,7 +2509,6 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, + return md; + + err_cleanup_queue: +- blk_cleanup_queue(md->disk->queue); + blk_mq_free_tag_set(&md->queue.tag_set); + err_kfree: + kfree(md); +diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c +index fa5324ceeebe4..f824cfdab75ac 100644 +--- a/drivers/mmc/core/queue.c ++++ b/drivers/mmc/core/queue.c +@@ -494,7 +494,6 @@ void mmc_cleanup_queue(struct mmc_queue *mq) + if (blk_queue_quiesced(q)) + blk_mq_unquiesce_queue(q); + +- blk_cleanup_queue(q); + blk_mq_free_tag_set(&mq->tag_set); + + /* +diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c +index 1f0120cbe9e80..8ad095c19f271 100644 +--- a/drivers/net/bonding/bond_3ad.c ++++ b/drivers/net/bonding/bond_3ad.c +@@ -87,8 +87,9 @@ static const u8 null_mac_addr[ETH_ALEN + 2] __long_aligned = { + static u16 ad_ticks_per_sec; + static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000; + +-static const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = +- MULTICAST_LACPDU_ADDR; ++const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = { ++ 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 ++}; + + /* ================= main 802.3ad protocol functions ================== */ + static int ad_lacpdu_send(struct port *port); +diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c +index bff0bfd10e235..ab7cb48f8dfdd 100644 +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -865,12 +865,8 @@ static void bond_hw_addr_flush(struct net_device *bond_dev, + dev_uc_unsync(slave_dev, bond_dev); + dev_mc_unsync(slave_dev, bond_dev); + +- if (BOND_MODE(bond) == BOND_MODE_8023AD) { +- /* del lacpdu mc addr from mc list */ +- u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR; +- +- dev_mc_del(slave_dev, lacpdu_multicast); +- } ++ if 
(BOND_MODE(bond) == BOND_MODE_8023AD) ++ dev_mc_del(slave_dev, lacpdu_mcast_addr); + } + + /*--------------------------- Active slave change ---------------------------*/ +@@ -890,7 +886,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, + if (bond->dev->flags & IFF_ALLMULTI) + dev_set_allmulti(old_active->dev, -1); + +- bond_hw_addr_flush(bond->dev, old_active->dev); ++ if (bond->dev->flags & IFF_UP) ++ bond_hw_addr_flush(bond->dev, old_active->dev); + } + + if (new_active) { +@@ -901,10 +898,12 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, + if (bond->dev->flags & IFF_ALLMULTI) + dev_set_allmulti(new_active->dev, 1); + +- netif_addr_lock_bh(bond->dev); +- dev_uc_sync(new_active->dev, bond->dev); +- dev_mc_sync(new_active->dev, bond->dev); +- netif_addr_unlock_bh(bond->dev); ++ if (bond->dev->flags & IFF_UP) { ++ netif_addr_lock_bh(bond->dev); ++ dev_uc_sync(new_active->dev, bond->dev); ++ dev_mc_sync(new_active->dev, bond->dev); ++ netif_addr_unlock_bh(bond->dev); ++ } + } + } + +@@ -2139,16 +2138,14 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + } + } + +- netif_addr_lock_bh(bond_dev); +- dev_mc_sync_multiple(slave_dev, bond_dev); +- dev_uc_sync_multiple(slave_dev, bond_dev); +- netif_addr_unlock_bh(bond_dev); +- +- if (BOND_MODE(bond) == BOND_MODE_8023AD) { +- /* add lacpdu mc addr to mc list */ +- u8 lacpdu_multicast[ETH_ALEN] = MULTICAST_LACPDU_ADDR; ++ if (bond_dev->flags & IFF_UP) { ++ netif_addr_lock_bh(bond_dev); ++ dev_mc_sync_multiple(slave_dev, bond_dev); ++ dev_uc_sync_multiple(slave_dev, bond_dev); ++ netif_addr_unlock_bh(bond_dev); + +- dev_mc_add(slave_dev, lacpdu_multicast); ++ if (BOND_MODE(bond) == BOND_MODE_8023AD) ++ dev_mc_add(slave_dev, lacpdu_mcast_addr); + } + } + +@@ -2420,7 +2417,8 @@ static int __bond_release_one(struct net_device *bond_dev, + if (old_flags & IFF_ALLMULTI) + dev_set_allmulti(slave_dev, -1); + +- bond_hw_addr_flush(bond_dev, slave_dev); ++ if (old_flags & IFF_UP) ++ bond_hw_addr_flush(bond_dev, slave_dev); + } + + slave_disable_netpoll(slave); +@@ -4157,6 +4155,12 @@ static int bond_open(struct net_device *bond_dev) + struct list_head *iter; + struct slave *slave; + ++ if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) { ++ bond->rr_tx_counter = alloc_percpu(u32); ++ if (!bond->rr_tx_counter) ++ return -ENOMEM; ++ } ++ + /* reset slave->backup and slave->inactive */ + if (bond_has_slaves(bond)) { + bond_for_each_slave(bond, slave, iter) { +@@ -4194,6 +4198,9 @@ static int bond_open(struct net_device *bond_dev) + /* register to receive LACPDUs */ + bond->recv_probe = bond_3ad_lacpdu_recv; + bond_3ad_initiate_agg_selection(bond, 1); ++ ++ bond_for_each_slave(bond, slave, iter) ++ dev_mc_add(slave->dev, lacpdu_mcast_addr); + } + + if (bond_mode_can_use_xmit_hash(bond)) +@@ -4205,6 +4212,7 @@ static int bond_open(struct net_device *bond_dev) + static int bond_close(struct net_device *bond_dev) + { + struct bonding *bond = netdev_priv(bond_dev); ++ struct slave *slave; + + bond_work_cancel_all(bond); + bond->send_peer_notif = 0; +@@ -4212,6 +4220,19 @@ static int bond_close(struct net_device *bond_dev) + bond_alb_deinitialize(bond); + bond->recv_probe = NULL; + ++ if (bond_uses_primary(bond)) { ++ rcu_read_lock(); ++ slave = rcu_dereference(bond->curr_active_slave); ++ if (slave) ++ bond_hw_addr_flush(bond_dev, slave->dev); ++ rcu_read_unlock(); ++ } else { ++ struct list_head *iter; ++ ++ bond_for_each_slave(bond, slave, iter) ++ 
bond_hw_addr_flush(bond_dev, slave->dev); ++ } ++ + return 0; + } + +@@ -6195,15 +6216,6 @@ static int bond_init(struct net_device *bond_dev) + if (!bond->wq) + return -ENOMEM; + +- if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN) { +- bond->rr_tx_counter = alloc_percpu(u32); +- if (!bond->rr_tx_counter) { +- destroy_workqueue(bond->wq); +- bond->wq = NULL; +- return -ENOMEM; +- } +- } +- + spin_lock_init(&bond->stats_lock); + netdev_lockdep_set_classes(bond_dev); + +diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c +index d060088047f16..131467d37a45b 100644 +--- a/drivers/net/can/flexcan/flexcan-core.c ++++ b/drivers/net/can/flexcan/flexcan-core.c +@@ -941,11 +941,6 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, + u32 reg_ctrl, reg_id, reg_iflag1; + int i; + +- if (unlikely(drop)) { +- skb = ERR_PTR(-ENOBUFS); +- goto mark_as_read; +- } +- + mb = flexcan_get_mb(priv, n); + + if (priv->devtype_data.quirks & FLEXCAN_QUIRK_USE_RX_MAILBOX) { +@@ -974,6 +969,11 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, + reg_ctrl = priv->read(&mb->can_ctrl); + } + ++ if (unlikely(drop)) { ++ skb = ERR_PTR(-ENOBUFS); ++ goto mark_as_read; ++ } ++ + if (reg_ctrl & FLEXCAN_MB_CNT_EDL) + skb = alloc_canfd_skb(offload->dev, &cfd); + else +diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c +index d3a658b444b5f..092cd51b3926e 100644 +--- a/drivers/net/can/usb/gs_usb.c ++++ b/drivers/net/can/usb/gs_usb.c +@@ -824,6 +824,7 @@ static int gs_can_open(struct net_device *netdev) + flags |= GS_CAN_MODE_TRIPLE_SAMPLE; + + /* finally start device */ ++ dev->can.state = CAN_STATE_ERROR_ACTIVE; + dm->mode = cpu_to_le32(GS_CAN_MODE_START); + dm->flags = cpu_to_le32(flags); + rc = usb_control_msg(interface_to_usbdev(dev->iface), +@@ -835,13 +836,12 @@ static int gs_can_open(struct net_device *netdev) + if (rc < 0) { + netdev_err(netdev, "Couldn't start device (err=%d)\n", rc); + kfree(dm); ++ dev->can.state = CAN_STATE_STOPPED; + return rc; + } + + kfree(dm); + +- dev->can.state = CAN_STATE_ERROR_ACTIVE; +- + parent->active_channels++; + if (!(dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY)) + netif_start_queue(netdev); +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 964354536f9ce..111a952f880ee 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -662,7 +662,6 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) + + for (i = 0; i < nr_pkts; i++) { + struct bnxt_sw_tx_bd *tx_buf; +- bool compl_deferred = false; + struct sk_buff *skb; + int j, last; + +@@ -671,6 +670,8 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) + skb = tx_buf->skb; + tx_buf->skb = NULL; + ++ tx_bytes += skb->len; ++ + if (tx_buf->is_push) { + tx_buf->is_push = 0; + goto next_tx_int; +@@ -691,8 +692,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) + } + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) { + if (bp->flags & BNXT_FLAG_CHIP_P5) { ++ /* PTP worker takes ownership of the skb */ + if (!bnxt_get_tx_ts_p5(bp, skb)) +- compl_deferred = true; ++ skb = NULL; + else + atomic_inc(&bp->ptp_cfg->tx_avail); + } +@@ -701,9 +703,7 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) + next_tx_int: + cons = NEXT_TX(cons); + +- tx_bytes += skb->len; +- if (!compl_deferred) +- 
dev_kfree_skb_any(skb); ++ dev_kfree_skb_any(skb); + } + + netdev_tx_completed_queue(txq, nr_pkts, tx_bytes); +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +index 7f3c0875b6f58..8e316367f6ced 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +@@ -317,9 +317,9 @@ void bnxt_ptp_cfg_tstamp_filters(struct bnxt *bp) + + if (!(bp->fw_cap & BNXT_FW_CAP_RX_ALL_PKT_TS) && (ptp->tstamp_filters & + (PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_ENABLE | +- PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE))) { ++ PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_DISABLE))) { + ptp->tstamp_filters &= ~(PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_ENABLE | +- PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE); ++ PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_DISABLE); + netdev_warn(bp->dev, "Unsupported FW for all RX pkts timestamp filter\n"); + } + +diff --git a/drivers/net/ethernet/freescale/enetc/Makefile b/drivers/net/ethernet/freescale/enetc/Makefile +index a139f2e9d59f0..e0e8dfd137930 100644 +--- a/drivers/net/ethernet/freescale/enetc/Makefile ++++ b/drivers/net/ethernet/freescale/enetc/Makefile +@@ -9,7 +9,6 @@ fsl-enetc-$(CONFIG_FSL_ENETC_QOS) += enetc_qos.o + + obj-$(CONFIG_FSL_ENETC_VF) += fsl-enetc-vf.o + fsl-enetc-vf-y := enetc_vf.o $(common-objs) +-fsl-enetc-vf-$(CONFIG_FSL_ENETC_QOS) += enetc_qos.o + + obj-$(CONFIG_FSL_ENETC_IERB) += fsl-enetc-ierb.o + fsl-enetc-ierb-y := enetc_ierb.o +diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c +index 4470a4a3e4c3e..9f5b921039bd4 100644 +--- a/drivers/net/ethernet/freescale/enetc/enetc.c ++++ b/drivers/net/ethernet/freescale/enetc/enetc.c +@@ -2432,7 +2432,7 @@ int enetc_close(struct net_device *ndev) + return 0; + } + +-static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) ++int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) + { + struct enetc_ndev_priv *priv = netdev_priv(ndev); + struct tc_mqprio_qopt *mqprio = type_data; +@@ -2486,25 +2486,6 @@ static int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data) + return 0; + } + +-int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, +- void *type_data) +-{ +- switch (type) { +- case TC_SETUP_QDISC_MQPRIO: +- return enetc_setup_tc_mqprio(ndev, type_data); +- case TC_SETUP_QDISC_TAPRIO: +- return enetc_setup_tc_taprio(ndev, type_data); +- case TC_SETUP_QDISC_CBS: +- return enetc_setup_tc_cbs(ndev, type_data); +- case TC_SETUP_QDISC_ETF: +- return enetc_setup_tc_txtime(ndev, type_data); +- case TC_SETUP_BLOCK: +- return enetc_setup_tc_psfp(ndev, type_data); +- default: +- return -EOPNOTSUPP; +- } +-} +- + static int enetc_setup_xdp_prog(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) + { +@@ -2600,29 +2581,6 @@ static int enetc_set_rss(struct net_device *ndev, int en) + return 0; + } + +-static int enetc_set_psfp(struct net_device *ndev, int en) +-{ +- struct enetc_ndev_priv *priv = netdev_priv(ndev); +- int err; +- +- if (en) { +- err = enetc_psfp_enable(priv); +- if (err) +- return err; +- +- priv->active_offloads |= ENETC_F_QCI; +- return 0; +- } +- +- err = enetc_psfp_disable(priv); +- if (err) +- return err; +- +- priv->active_offloads &= ~ENETC_F_QCI; +- +- return 0; +-} +- + static void enetc_enable_rxvlan(struct net_device *ndev, bool en) + { + struct enetc_ndev_priv *priv = netdev_priv(ndev); +@@ -2641,11 +2599,9 @@ static void 
enetc_enable_txvlan(struct net_device *ndev, bool en) + enetc_bdr_enable_txvlan(&priv->si->hw, i, en); + } + +-int enetc_set_features(struct net_device *ndev, +- netdev_features_t features) ++void enetc_set_features(struct net_device *ndev, netdev_features_t features) + { + netdev_features_t changed = ndev->features ^ features; +- int err = 0; + + if (changed & NETIF_F_RXHASH) + enetc_set_rss(ndev, !!(features & NETIF_F_RXHASH)); +@@ -2657,11 +2613,6 @@ int enetc_set_features(struct net_device *ndev, + if (changed & NETIF_F_HW_VLAN_CTAG_TX) + enetc_enable_txvlan(ndev, + !!(features & NETIF_F_HW_VLAN_CTAG_TX)); +- +- if (changed & NETIF_F_HW_TC) +- err = enetc_set_psfp(ndev, !!(features & NETIF_F_HW_TC)); +- +- return err; + } + + #ifdef CONFIG_FSL_ENETC_PTP_CLOCK +diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h +index 29922c20531f0..2cfe6944ebd32 100644 +--- a/drivers/net/ethernet/freescale/enetc/enetc.h ++++ b/drivers/net/ethernet/freescale/enetc/enetc.h +@@ -393,11 +393,9 @@ void enetc_start(struct net_device *ndev); + void enetc_stop(struct net_device *ndev); + netdev_tx_t enetc_xmit(struct sk_buff *skb, struct net_device *ndev); + struct net_device_stats *enetc_get_stats(struct net_device *ndev); +-int enetc_set_features(struct net_device *ndev, +- netdev_features_t features); ++void enetc_set_features(struct net_device *ndev, netdev_features_t features); + int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd); +-int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, +- void *type_data); ++int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data); + int enetc_setup_bpf(struct net_device *dev, struct netdev_bpf *xdp); + int enetc_xdp_xmit(struct net_device *ndev, int num_frames, + struct xdp_frame **frames, u32 flags); +@@ -465,6 +463,7 @@ int enetc_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + int enetc_setup_tc_psfp(struct net_device *ndev, void *type_data); + int enetc_psfp_init(struct enetc_ndev_priv *priv); + int enetc_psfp_clean(struct enetc_ndev_priv *priv); ++int enetc_set_psfp(struct net_device *ndev, bool en); + + static inline void enetc_get_max_cap(struct enetc_ndev_priv *priv) + { +@@ -540,4 +539,9 @@ static inline int enetc_psfp_disable(struct enetc_ndev_priv *priv) + { + return 0; + } ++ ++static inline int enetc_set_psfp(struct net_device *ndev, bool en) ++{ ++ return 0; ++} + #endif +diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c +index c4a0e836d4f09..bb7750222691d 100644 +--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c ++++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c +@@ -709,6 +709,13 @@ static int enetc_pf_set_features(struct net_device *ndev, + { + netdev_features_t changed = ndev->features ^ features; + struct enetc_ndev_priv *priv = netdev_priv(ndev); ++ int err; ++ ++ if (changed & NETIF_F_HW_TC) { ++ err = enetc_set_psfp(ndev, !!(features & NETIF_F_HW_TC)); ++ if (err) ++ return err; ++ } + + if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) { + struct enetc_pf *pf = enetc_si_priv(priv->si); +@@ -722,7 +729,28 @@ static int enetc_pf_set_features(struct net_device *ndev, + if (changed & NETIF_F_LOOPBACK) + enetc_set_loopback(ndev, !!(features & NETIF_F_LOOPBACK)); + +- return enetc_set_features(ndev, features); ++ enetc_set_features(ndev, features); ++ ++ return 0; ++} ++ ++static int enetc_pf_setup_tc(struct net_device *ndev, enum tc_setup_type type, ++ void *type_data) ++{ ++ switch 
(type) { ++ case TC_SETUP_QDISC_MQPRIO: ++ return enetc_setup_tc_mqprio(ndev, type_data); ++ case TC_SETUP_QDISC_TAPRIO: ++ return enetc_setup_tc_taprio(ndev, type_data); ++ case TC_SETUP_QDISC_CBS: ++ return enetc_setup_tc_cbs(ndev, type_data); ++ case TC_SETUP_QDISC_ETF: ++ return enetc_setup_tc_txtime(ndev, type_data); ++ case TC_SETUP_BLOCK: ++ return enetc_setup_tc_psfp(ndev, type_data); ++ default: ++ return -EOPNOTSUPP; ++ } + } + + static const struct net_device_ops enetc_ndev_ops = { +@@ -739,7 +767,7 @@ static const struct net_device_ops enetc_ndev_ops = { + .ndo_set_vf_spoofchk = enetc_pf_set_vf_spoofchk, + .ndo_set_features = enetc_pf_set_features, + .ndo_eth_ioctl = enetc_ioctl, +- .ndo_setup_tc = enetc_setup_tc, ++ .ndo_setup_tc = enetc_pf_setup_tc, + .ndo_bpf = enetc_setup_bpf, + .ndo_xdp_xmit = enetc_xdp_xmit, + }; +diff --git a/drivers/net/ethernet/freescale/enetc/enetc_qos.c b/drivers/net/ethernet/freescale/enetc/enetc_qos.c +index 582a663ed0ba4..f8a2f02ce22de 100644 +--- a/drivers/net/ethernet/freescale/enetc/enetc_qos.c ++++ b/drivers/net/ethernet/freescale/enetc/enetc_qos.c +@@ -1517,6 +1517,29 @@ int enetc_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + } + } + ++int enetc_set_psfp(struct net_device *ndev, bool en) ++{ ++ struct enetc_ndev_priv *priv = netdev_priv(ndev); ++ int err; ++ ++ if (en) { ++ err = enetc_psfp_enable(priv); ++ if (err) ++ return err; ++ ++ priv->active_offloads |= ENETC_F_QCI; ++ return 0; ++ } ++ ++ err = enetc_psfp_disable(priv); ++ if (err) ++ return err; ++ ++ priv->active_offloads &= ~ENETC_F_QCI; ++ ++ return 0; ++} ++ + int enetc_psfp_init(struct enetc_ndev_priv *priv) + { + if (epsfp.psfp_sfi_bitmap) +diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c +index 17924305afa2f..dfcaac302e245 100644 +--- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c ++++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c +@@ -88,7 +88,20 @@ static int enetc_vf_set_mac_addr(struct net_device *ndev, void *addr) + static int enetc_vf_set_features(struct net_device *ndev, + netdev_features_t features) + { +- return enetc_set_features(ndev, features); ++ enetc_set_features(ndev, features); ++ ++ return 0; ++} ++ ++static int enetc_vf_setup_tc(struct net_device *ndev, enum tc_setup_type type, ++ void *type_data) ++{ ++ switch (type) { ++ case TC_SETUP_QDISC_MQPRIO: ++ return enetc_setup_tc_mqprio(ndev, type_data); ++ default: ++ return -EOPNOTSUPP; ++ } + } + + /* Probing/ Init */ +@@ -100,7 +113,7 @@ static const struct net_device_ops enetc_ndev_ops = { + .ndo_set_mac_address = enetc_vf_set_mac_addr, + .ndo_set_features = enetc_vf_set_features, + .ndo_eth_ioctl = enetc_ioctl, +- .ndo_setup_tc = enetc_setup_tc, ++ .ndo_setup_tc = enetc_vf_setup_tc, + }; + + static void enetc_vf_netdev_setup(struct enetc_si *si, struct net_device *ndev, +diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c +index 8c939628e2d85..2e6461b0ea8bc 100644 +--- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c ++++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c +@@ -157,7 +157,7 @@ static int gve_alloc_page_dqo(struct gve_priv *priv, + int err; + + err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page, +- &buf_state->addr, DMA_FROM_DEVICE, GFP_KERNEL); ++ &buf_state->addr, DMA_FROM_DEVICE, GFP_ATOMIC); + if (err) + return err; + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index 
1aaf0c5ddf6cf..57e27f2024d38 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -5785,6 +5785,26 @@ static int i40e_get_link_speed(struct i40e_vsi *vsi) + } + } + ++/** ++ * i40e_bw_bytes_to_mbits - Convert max_tx_rate from bytes to mbits ++ * @vsi: Pointer to vsi structure ++ * @max_tx_rate: max TX rate in bytes to be converted into Mbits ++ * ++ * Helper function to convert units before send to set BW limit ++ **/ ++static u64 i40e_bw_bytes_to_mbits(struct i40e_vsi *vsi, u64 max_tx_rate) ++{ ++ if (max_tx_rate < I40E_BW_MBPS_DIVISOR) { ++ dev_warn(&vsi->back->pdev->dev, ++ "Setting max tx rate to minimum usable value of 50Mbps.\n"); ++ max_tx_rate = I40E_BW_CREDIT_DIVISOR; ++ } else { ++ do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR); ++ } ++ ++ return max_tx_rate; ++} ++ + /** + * i40e_set_bw_limit - setup BW limit for Tx traffic based on max_tx_rate + * @vsi: VSI to be configured +@@ -5807,10 +5827,10 @@ int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate) + max_tx_rate, seid); + return -EINVAL; + } +- if (max_tx_rate && max_tx_rate < 50) { ++ if (max_tx_rate && max_tx_rate < I40E_BW_CREDIT_DIVISOR) { + dev_warn(&pf->pdev->dev, + "Setting max tx rate to minimum usable value of 50Mbps.\n"); +- max_tx_rate = 50; ++ max_tx_rate = I40E_BW_CREDIT_DIVISOR; + } + + /* Tx rate credits are in values of 50Mbps, 0 is disabled */ +@@ -8101,9 +8121,9 @@ config_tc: + + if (i40e_is_tc_mqprio_enabled(pf)) { + if (vsi->mqprio_qopt.max_rate[0]) { +- u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; ++ u64 max_tx_rate = i40e_bw_bytes_to_mbits(vsi, ++ vsi->mqprio_qopt.max_rate[0]); + +- do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR); + ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate); + if (!ret) { + u64 credits = max_tx_rate; +@@ -10848,10 +10868,10 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) + } + + if (vsi->mqprio_qopt.max_rate[0]) { +- u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; ++ u64 max_tx_rate = i40e_bw_bytes_to_mbits(vsi, ++ vsi->mqprio_qopt.max_rate[0]); + u64 credits = 0; + +- do_div(max_tx_rate, I40E_BW_MBPS_DIVISOR); + ret = i40e_set_bw_limit(vsi, vsi->seid, max_tx_rate); + if (ret) + goto end_unlock; +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index 86b0f21287dc8..67fbaaad39859 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -2038,6 +2038,25 @@ static void i40e_del_qch(struct i40e_vf *vf) + } + } + ++/** ++ * i40e_vc_get_max_frame_size ++ * @vf: pointer to the VF ++ * ++ * Max frame size is determined based on the current port's max frame size and ++ * whether a port VLAN is configured on this VF. The VF is not aware whether ++ * it's in a port VLAN so the PF needs to account for this in max frame size ++ * checks and sending the max frame size to the VF. 
++ **/ ++static u16 i40e_vc_get_max_frame_size(struct i40e_vf *vf) ++{ ++ u16 max_frame_size = vf->pf->hw.phy.link_info.max_frame_size; ++ ++ if (vf->port_vlan_id) ++ max_frame_size -= VLAN_HLEN; ++ ++ return max_frame_size; ++} ++ + /** + * i40e_vc_get_vf_resources_msg + * @vf: pointer to the VF info +@@ -2139,6 +2158,7 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) + vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf; + vfres->rss_key_size = I40E_HKEY_ARRAY_SIZE; + vfres->rss_lut_size = I40E_VF_HLUT_ARRAY_SIZE; ++ vfres->max_mtu = i40e_vc_get_max_frame_size(vf); + + if (vf->lan_vsi_idx) { + vfres->vsi_res[0].vsi_id = vf->lan_vsi_id; +diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c +index 06d18797d25a2..18b6a702a1d6d 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c +@@ -114,8 +114,11 @@ u32 iavf_get_tx_pending(struct iavf_ring *ring, bool in_sw) + { + u32 head, tail; + ++ /* underlying hardware might not allow access and/or always return ++ * 0 for the head/tail registers so just use the cached values ++ */ + head = ring->next_to_clean; +- tail = readl(ring->tail); ++ tail = ring->next_to_use; + + if (head != tail) + return (head < tail) ? +@@ -1390,7 +1393,7 @@ static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring, + #endif + struct sk_buff *skb; + +- if (!rx_buffer) ++ if (!rx_buffer || !size) + return NULL; + /* prefetch first cache line of first page */ + va = page_address(rx_buffer->page) + rx_buffer->page_offset; +@@ -1548,7 +1551,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) + /* exit if we failed to retrieve a buffer */ + if (!skb) { + rx_ring->rx_stats.alloc_buff_failed++; +- if (rx_buffer) ++ if (rx_buffer && size) + rx_buffer->pagecnt_bias++; + break; + } +diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +index 1603e99bae4af..498797a0a0a95 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +@@ -273,11 +273,14 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) + void iavf_configure_queues(struct iavf_adapter *adapter) + { + struct virtchnl_vsi_queue_config_info *vqci; +- struct virtchnl_queue_pair_info *vqpi; ++ int i, max_frame = adapter->vf_res->max_mtu; + int pairs = adapter->num_active_queues; +- int i, max_frame = IAVF_MAX_RXBUFFER; ++ struct virtchnl_queue_pair_info *vqpi; + size_t len; + ++ if (max_frame > IAVF_MAX_RXBUFFER || !max_frame) ++ max_frame = IAVF_MAX_RXBUFFER; ++ + if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { + /* bail because we already have a command pending */ + dev_err(&adapter->pdev->dev, "Cannot configure queues, command %d pending\n", +diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c +index 6c4e1d45235ef..1169fd7811b09 100644 +--- a/drivers/net/ethernet/intel/ice/ice_lib.c ++++ b/drivers/net/ethernet/intel/ice/ice_lib.c +@@ -911,7 +911,7 @@ static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt) + */ + static int ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) + { +- u16 offset = 0, qmap = 0, tx_count = 0, pow = 0; ++ u16 offset = 0, qmap = 0, tx_count = 0, rx_count = 0, pow = 0; + u16 num_txq_per_tc, num_rxq_per_tc; + u16 qcount_tx = vsi->alloc_txq; + u16 qcount_rx = vsi->alloc_rxq; +@@ -978,23 +978,25 @@ static int ice_vsi_setup_q_map(struct 
ice_vsi *vsi, struct ice_vsi_ctx *ctxt) + * at least 1) + */ + if (offset) +- vsi->num_rxq = offset; ++ rx_count = offset; + else +- vsi->num_rxq = num_rxq_per_tc; ++ rx_count = num_rxq_per_tc; + +- if (vsi->num_rxq > vsi->alloc_rxq) { ++ if (rx_count > vsi->alloc_rxq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n", +- vsi->num_rxq, vsi->alloc_rxq); ++ rx_count, vsi->alloc_rxq); + return -EINVAL; + } + +- vsi->num_txq = tx_count; +- if (vsi->num_txq > vsi->alloc_txq) { ++ if (tx_count > vsi->alloc_txq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n", +- vsi->num_txq, vsi->alloc_txq); ++ tx_count, vsi->alloc_txq); + return -EINVAL; + } + ++ vsi->num_txq = tx_count; ++ vsi->num_rxq = rx_count; ++ + if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) { + dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); + /* since there is a chance that num_rxq could have been changed +@@ -3487,6 +3489,7 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, + u16 pow, offset = 0, qcount_tx = 0, qcount_rx = 0, qmap; + u16 tc0_offset = vsi->mqprio_qopt.qopt.offset[0]; + int tc0_qcount = vsi->mqprio_qopt.qopt.count[0]; ++ u16 new_txq, new_rxq; + u8 netdev_tc = 0; + int i; + +@@ -3527,21 +3530,24 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt, + } + } + +- /* Set actual Tx/Rx queue pairs */ +- vsi->num_txq = offset + qcount_tx; +- if (vsi->num_txq > vsi->alloc_txq) { ++ new_txq = offset + qcount_tx; ++ if (new_txq > vsi->alloc_txq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n", +- vsi->num_txq, vsi->alloc_txq); ++ new_txq, vsi->alloc_txq); + return -EINVAL; + } + +- vsi->num_rxq = offset + qcount_rx; +- if (vsi->num_rxq > vsi->alloc_rxq) { ++ new_rxq = offset + qcount_rx; ++ if (new_rxq > vsi->alloc_rxq) { + dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n", +- vsi->num_rxq, vsi->alloc_rxq); ++ new_rxq, vsi->alloc_rxq); + return -EINVAL; + } + ++ /* Set actual Tx/Rx queue pairs */ ++ vsi->num_txq = new_txq; ++ vsi->num_rxq = new_rxq; ++ + /* Setup queue TC[0].qmap for given VSI context */ + ctxt->info.tc_mapping[0] = cpu_to_le16(qmap); + ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]); +@@ -3573,6 +3579,7 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) + { + u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; + struct ice_pf *pf = vsi->back; ++ struct ice_tc_cfg old_tc_cfg; + struct ice_vsi_ctx *ctx; + struct device *dev; + int i, ret = 0; +@@ -3597,6 +3604,7 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) + max_txqs[i] = vsi->num_txq; + } + ++ memcpy(&old_tc_cfg, &vsi->tc_cfg, sizeof(old_tc_cfg)); + vsi->tc_cfg.ena_tc = ena_tc; + vsi->tc_cfg.numtc = num_tc; + +@@ -3613,8 +3621,10 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) + else + ret = ice_vsi_setup_q_map(vsi, ctx); + +- if (ret) ++ if (ret) { ++ memcpy(&vsi->tc_cfg, &old_tc_cfg, sizeof(vsi->tc_cfg)); + goto out; ++ } + + /* must to indicate which section of VSI context are being modified */ + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID); +diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c +index 4c6bb7482b362..48befe1e2872c 100644 +--- a/drivers/net/ethernet/intel/ice/ice_main.c ++++ b/drivers/net/ethernet/intel/ice/ice_main.c 
+@@ -2399,8 +2399,6 @@ int ice_schedule_reset(struct ice_pf *pf, enum ice_reset_req reset) + return -EBUSY; + } + +- ice_unplug_aux_dev(pf); +- + switch (reset) { + case ICE_RESET_PFR: + set_bit(ICE_PFR_REQ, pf->state); +@@ -6629,7 +6627,7 @@ static void ice_napi_disable_all(struct ice_vsi *vsi) + */ + int ice_down(struct ice_vsi *vsi) + { +- int i, tx_err, rx_err, link_err = 0, vlan_err = 0; ++ int i, tx_err, rx_err, vlan_err = 0; + + WARN_ON(!test_bit(ICE_VSI_DOWN, vsi->state)); + +@@ -6663,20 +6661,13 @@ int ice_down(struct ice_vsi *vsi) + + ice_napi_disable_all(vsi); + +- if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { +- link_err = ice_force_phys_link_state(vsi, false); +- if (link_err) +- netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", +- vsi->vsi_num, link_err); +- } +- + ice_for_each_txq(vsi, i) + ice_clean_tx_ring(vsi->tx_rings[i]); + + ice_for_each_rxq(vsi, i) + ice_clean_rx_ring(vsi->rx_rings[i]); + +- if (tx_err || rx_err || link_err || vlan_err) { ++ if (tx_err || rx_err || vlan_err) { + netdev_err(vsi->netdev, "Failed to close VSI 0x%04X on switch 0x%04X\n", + vsi->vsi_num, vsi->vsw->sw_id); + return -EIO; +@@ -6838,6 +6829,8 @@ int ice_vsi_open(struct ice_vsi *vsi) + if (err) + goto err_setup_rx; + ++ ice_vsi_cfg_netdev_tc(vsi, vsi->tc_cfg.ena_tc); ++ + if (vsi->type == ICE_VSI_PF) { + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_tx_queues(vsi->netdev, vsi->num_txq); +@@ -8876,6 +8869,16 @@ int ice_stop(struct net_device *netdev) + return -EBUSY; + } + ++ if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { ++ int link_err = ice_force_phys_link_state(vsi, false); ++ ++ if (link_err) { ++ netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", ++ vsi->vsi_num, link_err); ++ return -EIO; ++ } ++ } ++ + ice_vsi_close(vsi); + + return 0; +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c +index 836dce8407124..97453d1dfafed 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.c ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c +@@ -610,7 +610,7 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + if (test_bit(ICE_VSI_DOWN, vsi->state)) + return -ENETDOWN; + +- if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq) ++ if (!ice_is_xdp_ena_vsi(vsi)) + return -ENXIO; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) +@@ -621,6 +621,9 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + xdp_ring = vsi->xdp_rings[queue_index]; + spin_lock(&xdp_ring->tx_lock); + } else { ++ /* Generally, should not happen */ ++ if (unlikely(queue_index >= vsi->num_xdp_txq)) ++ return -ENXIO; + xdp_ring = vsi->xdp_rings[queue_index]; + } + +diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c +index 85155cd9405c5..4aeb927c37153 100644 +--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c ++++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_mdio.c +@@ -179,6 +179,9 @@ static int mlxbf_gige_mdio_read(struct mii_bus *bus, int phy_add, int phy_reg) + /* Only return ad bits of the gw register */ + ret &= MLXBF_GIGE_MDIO_GW_AD_MASK; + ++ /* The MDIO lock is set on read. 
To release it, clear gw register */ ++ writel(0, priv->mdio_io + MLXBF_GIGE_MDIO_GW_OFFSET); ++ + return ret; + } + +@@ -203,6 +206,9 @@ static int mlxbf_gige_mdio_write(struct mii_bus *bus, int phy_add, + temp, !(temp & MLXBF_GIGE_MDIO_GW_BUSY_MASK), + 5, 1000000); + ++ /* The MDIO lock is set on read. To release it, clear gw register */ ++ writel(0, priv->mdio_io + MLXBF_GIGE_MDIO_GW_OFFSET); ++ + return ret; + } + +diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c +index 49b85ca578b01..9820efce72ffe 100644 +--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c ++++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c +@@ -370,6 +370,11 @@ static void mana_gd_process_eq_events(void *arg) + break; + } + ++ /* Per GDMA spec, rmb is necessary after checking owner_bits, before ++ * reading eqe. ++ */ ++ rmb(); ++ + mana_gd_process_eqe(eq); + + eq->head++; +@@ -1107,6 +1112,11 @@ static int mana_gd_read_cqe(struct gdma_queue *cq, struct gdma_comp *comp) + if (WARN_ON_ONCE(owner_bits != new_bits)) + return -1; + ++ /* Per GDMA spec, rmb is necessary after checking owner_bits, before ++ * reading completion info ++ */ ++ rmb(); ++ + comp->wq_num = cqe->cqe_info.wq_num; + comp->is_sq = cqe->cqe_info.is_sq; + memcpy(comp->cqe_data, cqe->cqe_data, GDMA_COMP_DATA_SIZE); +diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c +index b357ac4c56c59..7e32b04eb0c75 100644 +--- a/drivers/net/ethernet/renesas/ravb_main.c ++++ b/drivers/net/ethernet/renesas/ravb_main.c +@@ -1449,6 +1449,8 @@ static int ravb_phy_init(struct net_device *ndev) + phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_100baseT_Half_BIT); + } + ++ /* Indicate that the MAC is responsible for managing PHY PM */ ++ phydev->mac_managed_pm = true; + phy_attached_info(phydev); + + return 0; +diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c +index 67ade78fb7671..7fd8828d3a846 100644 +--- a/drivers/net/ethernet/renesas/sh_eth.c ++++ b/drivers/net/ethernet/renesas/sh_eth.c +@@ -2029,6 +2029,8 @@ static int sh_eth_phy_init(struct net_device *ndev) + if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) + phy_set_max_speed(phydev, SPEED_100); + ++ /* Indicate that the MAC is responsible for managing PHY PM */ ++ phydev->mac_managed_pm = true; + phy_attached_info(phydev); + + return 0; +diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c +index 032b8c0bd7889..5b4d661ab9867 100644 +--- a/drivers/net/ethernet/sfc/efx_channels.c ++++ b/drivers/net/ethernet/sfc/efx_channels.c +@@ -319,7 +319,7 @@ int efx_probe_interrupts(struct efx_nic *efx) + efx->n_channels = 1 + (efx_separate_tx_channels ? 1 : 0); + efx->n_rx_channels = 1; + efx->n_tx_channels = 1; +- efx->tx_channel_offset = 1; ++ efx->tx_channel_offset = efx_separate_tx_channels ? 1 : 0; + efx->n_xdp_channels = 0; + efx->xdp_channel_offset = efx->n_channels; + efx->legacy_irq = efx->pci_dev->irq; +diff --git a/drivers/net/ethernet/sfc/siena/efx_channels.c b/drivers/net/ethernet/sfc/siena/efx_channels.c +index 017212a40df38..f54ebd0072868 100644 +--- a/drivers/net/ethernet/sfc/siena/efx_channels.c ++++ b/drivers/net/ethernet/sfc/siena/efx_channels.c +@@ -320,7 +320,7 @@ int efx_siena_probe_interrupts(struct efx_nic *efx) + efx->n_channels = 1 + (efx_siena_separate_tx_channels ? 
1 : 0); + efx->n_rx_channels = 1; + efx->n_tx_channels = 1; +- efx->tx_channel_offset = 1; ++ efx->tx_channel_offset = efx_siena_separate_tx_channels ? 1 : 0; + efx->n_xdp_channels = 0; + efx->xdp_channel_offset = efx->n_channels; + efx->legacy_irq = efx->pci_dev->irq; +diff --git a/drivers/net/ethernet/sfc/siena/tx.c b/drivers/net/ethernet/sfc/siena/tx.c +index e166dcb9b99ce..91e87594ed1ea 100644 +--- a/drivers/net/ethernet/sfc/siena/tx.c ++++ b/drivers/net/ethernet/sfc/siena/tx.c +@@ -336,7 +336,7 @@ netdev_tx_t efx_siena_hard_start_xmit(struct sk_buff *skb, + * previous packets out. + */ + if (!netdev_xmit_more()) +- efx_tx_send_pending(tx_queue->channel); ++ efx_tx_send_pending(efx_get_tx_channel(efx, index)); + return NETDEV_TX_OK; + } + +diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c +index 138bca6113415..80ed7f760bd30 100644 +--- a/drivers/net/ethernet/sfc/tx.c ++++ b/drivers/net/ethernet/sfc/tx.c +@@ -549,7 +549,7 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, + * previous packets out. + */ + if (!netdev_xmit_more()) +- efx_tx_send_pending(tx_queue->channel); ++ efx_tx_send_pending(efx_get_tx_channel(efx, index)); + return NETDEV_TX_OK; + } + +diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c +index 8594ee839628b..88aa0d310aeef 100644 +--- a/drivers/net/ethernet/sun/sunhme.c ++++ b/drivers/net/ethernet/sun/sunhme.c +@@ -2020,9 +2020,9 @@ static void happy_meal_rx(struct happy_meal *hp, struct net_device *dev) + + skb_reserve(copy_skb, 2); + skb_put(copy_skb, len); +- dma_sync_single_for_cpu(hp->dma_dev, dma_addr, len, DMA_FROM_DEVICE); ++ dma_sync_single_for_cpu(hp->dma_dev, dma_addr, len + 2, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, copy_skb->data, len); +- dma_sync_single_for_device(hp->dma_dev, dma_addr, len, DMA_FROM_DEVICE); ++ dma_sync_single_for_device(hp->dma_dev, dma_addr, len + 2, DMA_FROM_DEVICE); + /* Reuse original ring buffer. 
*/ + hme_write_rxd(hp, this, + (RXFLAG_OWN|((RX_BUF_ALLOC_SIZE-RX_OFFSET)<<16)), +diff --git a/drivers/net/ipa/ipa_qmi.c b/drivers/net/ipa/ipa_qmi.c +index ec010cf2e816a..6f874f99b910c 100644 +--- a/drivers/net/ipa/ipa_qmi.c ++++ b/drivers/net/ipa/ipa_qmi.c +@@ -308,12 +308,12 @@ init_modem_driver_req(struct ipa_qmi *ipa_qmi) + mem = ipa_mem_find(ipa, IPA_MEM_V4_ROUTE); + req.v4_route_tbl_info_valid = 1; + req.v4_route_tbl_info.start = ipa->mem_offset + mem->offset; +- req.v4_route_tbl_info.count = mem->size / sizeof(__le64); ++ req.v4_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1; + + mem = ipa_mem_find(ipa, IPA_MEM_V6_ROUTE); + req.v6_route_tbl_info_valid = 1; + req.v6_route_tbl_info.start = ipa->mem_offset + mem->offset; +- req.v6_route_tbl_info.count = mem->size / sizeof(__le64); ++ req.v6_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1; + + mem = ipa_mem_find(ipa, IPA_MEM_V4_FILTER); + req.v4_filter_tbl_start_valid = 1; +@@ -352,7 +352,7 @@ init_modem_driver_req(struct ipa_qmi *ipa_qmi) + req.v4_hash_route_tbl_info_valid = 1; + req.v4_hash_route_tbl_info.start = + ipa->mem_offset + mem->offset; +- req.v4_hash_route_tbl_info.count = mem->size / sizeof(__le64); ++ req.v4_hash_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1; + } + + mem = ipa_mem_find(ipa, IPA_MEM_V6_ROUTE_HASHED); +@@ -360,7 +360,7 @@ init_modem_driver_req(struct ipa_qmi *ipa_qmi) + req.v6_hash_route_tbl_info_valid = 1; + req.v6_hash_route_tbl_info.start = + ipa->mem_offset + mem->offset; +- req.v6_hash_route_tbl_info.count = mem->size / sizeof(__le64); ++ req.v6_hash_route_tbl_info.end = IPA_ROUTE_MODEM_COUNT - 1; + } + + mem = ipa_mem_find(ipa, IPA_MEM_V4_FILTER_HASHED); +diff --git a/drivers/net/ipa/ipa_qmi_msg.c b/drivers/net/ipa/ipa_qmi_msg.c +index 6838e8065072b..75d3fc0092e92 100644 +--- a/drivers/net/ipa/ipa_qmi_msg.c ++++ b/drivers/net/ipa/ipa_qmi_msg.c +@@ -311,7 +311,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = { + .tlv_type = 0x12, + .offset = offsetof(struct ipa_init_modem_driver_req, + v4_route_tbl_info), +- .ei_array = ipa_mem_array_ei, ++ .ei_array = ipa_mem_bounds_ei, + }, + { + .data_type = QMI_OPT_FLAG, +@@ -332,7 +332,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = { + .tlv_type = 0x13, + .offset = offsetof(struct ipa_init_modem_driver_req, + v6_route_tbl_info), +- .ei_array = ipa_mem_array_ei, ++ .ei_array = ipa_mem_bounds_ei, + }, + { + .data_type = QMI_OPT_FLAG, +@@ -496,7 +496,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = { + .tlv_type = 0x1b, + .offset = offsetof(struct ipa_init_modem_driver_req, + v4_hash_route_tbl_info), +- .ei_array = ipa_mem_array_ei, ++ .ei_array = ipa_mem_bounds_ei, + }, + { + .data_type = QMI_OPT_FLAG, +@@ -517,7 +517,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = { + .tlv_type = 0x1c, + .offset = offsetof(struct ipa_init_modem_driver_req, + v6_hash_route_tbl_info), +- .ei_array = ipa_mem_array_ei, ++ .ei_array = ipa_mem_bounds_ei, + }, + { + .data_type = QMI_OPT_FLAG, +diff --git a/drivers/net/ipa/ipa_qmi_msg.h b/drivers/net/ipa/ipa_qmi_msg.h +index 495e85abe50bd..9651aa59b5968 100644 +--- a/drivers/net/ipa/ipa_qmi_msg.h ++++ b/drivers/net/ipa/ipa_qmi_msg.h +@@ -86,9 +86,11 @@ enum ipa_platform_type { + IPA_QMI_PLATFORM_TYPE_MSM_QNX_V01 = 0x5, /* QNX MSM */ + }; + +-/* This defines the start and end offset of a range of memory. Both +- * fields are offsets relative to the start of IPA shared memory. +- * The end value is the last addressable byte *within* the range. 
++/* This defines the start and end offset of a range of memory. The start ++ * value is a byte offset relative to the start of IPA shared memory. The ++ * end value is the last addressable unit *within* the range. Typically ++ * the end value is in units of bytes, however it can also be a maximum ++ * array index value. + */ + struct ipa_mem_bounds { + u32 start; +@@ -129,18 +131,19 @@ struct ipa_init_modem_driver_req { + u8 hdr_tbl_info_valid; + struct ipa_mem_bounds hdr_tbl_info; + +- /* Routing table information. These define the location and size of +- * non-hashable IPv4 and IPv6 filter tables. The start values are +- * offsets relative to the start of IPA shared memory. ++ /* Routing table information. These define the location and maximum ++ * *index* (not byte) for the modem portion of non-hashable IPv4 and ++ * IPv6 routing tables. The start values are byte offsets relative ++ * to the start of IPA shared memory. + */ + u8 v4_route_tbl_info_valid; +- struct ipa_mem_array v4_route_tbl_info; ++ struct ipa_mem_bounds v4_route_tbl_info; + u8 v6_route_tbl_info_valid; +- struct ipa_mem_array v6_route_tbl_info; ++ struct ipa_mem_bounds v6_route_tbl_info; + + /* Filter table information. These define the location of the + * non-hashable IPv4 and IPv6 filter tables. The start values are +- * offsets relative to the start of IPA shared memory. ++ * byte offsets relative to the start of IPA shared memory. + */ + u8 v4_filter_tbl_start_valid; + u32 v4_filter_tbl_start; +@@ -181,18 +184,20 @@ struct ipa_init_modem_driver_req { + u8 zip_tbl_info_valid; + struct ipa_mem_bounds zip_tbl_info; + +- /* Routing table information. These define the location and size +- * of hashable IPv4 and IPv6 filter tables. The start values are +- * offsets relative to the start of IPA shared memory. ++ /* Routing table information. These define the location and maximum ++ * *index* (not byte) for the modem portion of hashable IPv4 and IPv6 ++ * routing tables (if supported by hardware). The start values are ++ * byte offsets relative to the start of IPA shared memory. + */ + u8 v4_hash_route_tbl_info_valid; +- struct ipa_mem_array v4_hash_route_tbl_info; ++ struct ipa_mem_bounds v4_hash_route_tbl_info; + u8 v6_hash_route_tbl_info_valid; +- struct ipa_mem_array v6_hash_route_tbl_info; ++ struct ipa_mem_bounds v6_hash_route_tbl_info; + + /* Filter table information. These define the location and size +- * of hashable IPv4 and IPv6 filter tables. The start values are +- * offsets relative to the start of IPA shared memory. ++ * of hashable IPv4 and IPv6 filter tables (if supported by hardware). ++ * The start values are byte offsets relative to the start of IPA ++ * shared memory. 
+ */ + u8 v4_hash_filter_tbl_start_valid; + u32 v4_hash_filter_tbl_start; +diff --git a/drivers/net/ipa/ipa_table.c b/drivers/net/ipa/ipa_table.c +index 2f5a58bfc529a..69efe672ca528 100644 +--- a/drivers/net/ipa/ipa_table.c ++++ b/drivers/net/ipa/ipa_table.c +@@ -108,8 +108,6 @@ + + /* Assignment of route table entries to the modem and AP */ + #define IPA_ROUTE_MODEM_MIN 0 +-#define IPA_ROUTE_MODEM_COUNT 8 +- + #define IPA_ROUTE_AP_MIN IPA_ROUTE_MODEM_COUNT + #define IPA_ROUTE_AP_COUNT \ + (IPA_ROUTE_COUNT_MAX - IPA_ROUTE_MODEM_COUNT) +diff --git a/drivers/net/ipa/ipa_table.h b/drivers/net/ipa/ipa_table.h +index b6a9a0d79d68e..1538e2e1732fe 100644 +--- a/drivers/net/ipa/ipa_table.h ++++ b/drivers/net/ipa/ipa_table.h +@@ -13,6 +13,9 @@ struct ipa; + /* The maximum number of filter table entries (IPv4, IPv6; hashed or not) */ + #define IPA_FILTER_COUNT_MAX 14 + ++/* The number of route table entries allotted to the modem */ ++#define IPA_ROUTE_MODEM_COUNT 8 ++ + /* The maximum number of route table entries (IPv4, IPv6; hashed or not) */ + #define IPA_ROUTE_COUNT_MAX 15 + +diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c +index 6ffb27419e64b..c58123e136896 100644 +--- a/drivers/net/ipvlan/ipvlan_core.c ++++ b/drivers/net/ipvlan/ipvlan_core.c +@@ -495,7 +495,6 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) + + static int ipvlan_process_outbound(struct sk_buff *skb) + { +- struct ethhdr *ethh = eth_hdr(skb); + int ret = NET_XMIT_DROP; + + /* The ipvlan is a pseudo-L2 device, so the packets that we receive +@@ -505,6 +504,8 @@ static int ipvlan_process_outbound(struct sk_buff *skb) + if (skb_mac_header_was_set(skb)) { + /* In this mode we dont care about + * multicast and broadcast traffic */ ++ struct ethhdr *ethh = eth_hdr(skb); ++ + if (is_multicast_ether_addr(ethh->h_dest)) { + pr_debug_ratelimited( + "Dropped {multi|broad}cast of type=[%x]\n", +@@ -589,7 +590,7 @@ out: + static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) + { + const struct ipvl_dev *ipvlan = netdev_priv(dev); +- struct ethhdr *eth = eth_hdr(skb); ++ struct ethhdr *eth = skb_eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; +@@ -619,6 +620,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) + return dev_forward_skb(ipvlan->phy_dev, skb); + + } else if (is_multicast_ether_addr(eth->h_dest)) { ++ skb_reset_mac_header(skb); + ipvlan_skb_crossing_ns(skb, NULL); + ipvlan_multicast_enqueue(ipvlan->port, skb, true); + return NET_XMIT_SUCCESS; +diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c +index 9e3c815a070f1..796e9c7857d09 100644 +--- a/drivers/net/mdio/of_mdio.c ++++ b/drivers/net/mdio/of_mdio.c +@@ -231,6 +231,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) + return 0; + + unregister: ++ of_node_put(child); + mdiobus_unregister(mdio); + return rc; + } +diff --git a/drivers/net/netdevsim/hwstats.c b/drivers/net/netdevsim/hwstats.c +index 605a38e16db05..0e58aa7f0374e 100644 +--- a/drivers/net/netdevsim/hwstats.c ++++ b/drivers/net/netdevsim/hwstats.c +@@ -433,11 +433,11 @@ int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev) + goto err_remove_hwstats_recursive; + } + +- debugfs_create_file("enable_ifindex", 0600, hwstats->l3_ddir, hwstats, ++ debugfs_create_file("enable_ifindex", 0200, hwstats->l3_ddir, hwstats, + &nsim_dev_hwstats_l3_enable_fops.fops); +- debugfs_create_file("disable_ifindex", 0600, hwstats->l3_ddir, hwstats, ++ 
debugfs_create_file("disable_ifindex", 0200, hwstats->l3_ddir, hwstats, + &nsim_dev_hwstats_l3_disable_fops.fops); +- debugfs_create_file("fail_next_enable", 0600, hwstats->l3_ddir, hwstats, ++ debugfs_create_file("fail_next_enable", 0200, hwstats->l3_ddir, hwstats, + &nsim_dev_hwstats_l3_fail_fops.fops); + + INIT_DELAYED_WORK(&hwstats->traffic_dw, +diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c +index c7047f5d7a9b0..8bc0957a0f6d3 100644 +--- a/drivers/net/phy/aquantia_main.c ++++ b/drivers/net/phy/aquantia_main.c +@@ -90,6 +90,9 @@ + #define VEND1_GLOBAL_FW_ID_MAJOR GENMASK(15, 8) + #define VEND1_GLOBAL_FW_ID_MINOR GENMASK(7, 0) + ++#define VEND1_GLOBAL_GEN_STAT2 0xc831 ++#define VEND1_GLOBAL_GEN_STAT2_OP_IN_PROG BIT(15) ++ + #define VEND1_GLOBAL_RSVD_STAT1 0xc885 + #define VEND1_GLOBAL_RSVD_STAT1_FW_BUILD_ID GENMASK(7, 4) + #define VEND1_GLOBAL_RSVD_STAT1_PROV_ID GENMASK(3, 0) +@@ -124,6 +127,12 @@ + #define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL2 BIT(1) + #define VEND1_GLOBAL_INT_VEND_MASK_GLOBAL3 BIT(0) + ++/* Sleep and timeout for checking if the Processor-Intensive ++ * MDIO operation is finished ++ */ ++#define AQR107_OP_IN_PROG_SLEEP 1000 ++#define AQR107_OP_IN_PROG_TIMEOUT 100000 ++ + struct aqr107_hw_stat { + const char *name; + int reg; +@@ -596,16 +605,52 @@ static void aqr107_link_change_notify(struct phy_device *phydev) + phydev_info(phydev, "Aquantia 1000Base-T2 mode active\n"); + } + ++static int aqr107_wait_processor_intensive_op(struct phy_device *phydev) ++{ ++ int val, err; ++ ++ /* The datasheet notes to wait at least 1ms after issuing a ++ * processor intensive operation before checking. ++ * We cannot use the 'sleep_before_read' parameter of read_poll_timeout ++ * because that just determines the maximum time slept, not the minimum. 
++ */ ++ usleep_range(1000, 5000); ++ ++ err = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, ++ VEND1_GLOBAL_GEN_STAT2, val, ++ !(val & VEND1_GLOBAL_GEN_STAT2_OP_IN_PROG), ++ AQR107_OP_IN_PROG_SLEEP, ++ AQR107_OP_IN_PROG_TIMEOUT, false); ++ if (err) { ++ phydev_err(phydev, "timeout: processor-intensive MDIO operation\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ + static int aqr107_suspend(struct phy_device *phydev) + { +- return phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1, +- MDIO_CTRL1_LPOWER); ++ int err; ++ ++ err = phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1, ++ MDIO_CTRL1_LPOWER); ++ if (err) ++ return err; ++ ++ return aqr107_wait_processor_intensive_op(phydev); + } + + static int aqr107_resume(struct phy_device *phydev) + { +- return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1, +- MDIO_CTRL1_LPOWER); ++ int err; ++ ++ err = phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, MDIO_CTRL1, ++ MDIO_CTRL1_LPOWER); ++ if (err) ++ return err; ++ ++ return aqr107_wait_processor_intensive_op(phydev); + } + + static int aqr107_probe(struct phy_device *phydev) +diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c +index 34483a4bd688a..e8e1101911b2f 100644 +--- a/drivers/net/phy/micrel.c ++++ b/drivers/net/phy/micrel.c +@@ -2662,16 +2662,19 @@ static int lan8804_config_init(struct phy_device *phydev) + static irqreturn_t lan8814_handle_interrupt(struct phy_device *phydev) + { + int irq_status, tsu_irq_status; ++ int ret = IRQ_NONE; + + irq_status = phy_read(phydev, LAN8814_INTS); +- if (irq_status > 0 && (irq_status & LAN8814_INT_LINK)) +- phy_trigger_machine(phydev); +- + if (irq_status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + ++ if (irq_status & LAN8814_INT_LINK) { ++ phy_trigger_machine(phydev); ++ ret = IRQ_HANDLED; ++ } ++ + while (1) { + tsu_irq_status = lanphy_read_page_reg(phydev, 4, + LAN8814_INTR_STS_REG); +@@ -2680,12 +2683,15 @@ static irqreturn_t lan8814_handle_interrupt(struct phy_device *phydev) + (tsu_irq_status & (LAN8814_INTR_STS_REG_1588_TSU0_ | + LAN8814_INTR_STS_REG_1588_TSU1_ | + LAN8814_INTR_STS_REG_1588_TSU2_ | +- LAN8814_INTR_STS_REG_1588_TSU3_))) ++ LAN8814_INTR_STS_REG_1588_TSU3_))) { + lan8814_handle_ptp_interrupt(phydev); +- else ++ ret = IRQ_HANDLED; ++ } else { + break; ++ } + } +- return IRQ_HANDLED; ++ ++ return ret; + } + + static int lan8814_ack_interrupt(struct phy_device *phydev) +diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c +index b07dde6f0abf2..b9899913d2467 100644 +--- a/drivers/net/team/team.c ++++ b/drivers/net/team/team.c +@@ -1275,10 +1275,12 @@ static int team_port_add(struct team *team, struct net_device *port_dev, + } + } + +- netif_addr_lock_bh(dev); +- dev_uc_sync_multiple(port_dev, dev); +- dev_mc_sync_multiple(port_dev, dev); +- netif_addr_unlock_bh(dev); ++ if (dev->flags & IFF_UP) { ++ netif_addr_lock_bh(dev); ++ dev_uc_sync_multiple(port_dev, dev); ++ dev_mc_sync_multiple(port_dev, dev); ++ netif_addr_unlock_bh(dev); ++ } + + port->index = -1; + list_add_tail_rcu(&port->list, &team->port_list); +@@ -1349,8 +1351,10 @@ static int team_port_del(struct team *team, struct net_device *port_dev) + netdev_rx_handler_unregister(port_dev); + team_port_disable_netpoll(port); + vlan_vids_del_by_dev(port_dev, dev); +- dev_uc_unsync(port_dev, dev); +- dev_mc_unsync(port_dev, dev); ++ if (dev->flags & IFF_UP) { ++ dev_uc_unsync(port_dev, dev); ++ dev_mc_unsync(port_dev, dev); ++ } + dev_close(port_dev); + team_port_leave(team, port); + +@@ -1700,6 +1704,14 @@ static int team_open(struct 
net_device *dev) + + static int team_close(struct net_device *dev) + { ++ struct team *team = netdev_priv(dev); ++ struct team_port *port; ++ ++ list_for_each_entry(port, &team->port_list, list) { ++ dev_uc_unsync(port->dev, dev); ++ dev_mc_unsync(port->dev, dev); ++ } ++ + return 0; + } + +diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c +index d0f3b6d7f4089..5c804bcabfe6b 100644 +--- a/drivers/net/wireguard/netlink.c ++++ b/drivers/net/wireguard/netlink.c +@@ -436,14 +436,13 @@ static int set_peer(struct wg_device *wg, struct nlattr **attrs) + if (attrs[WGPEER_A_ENDPOINT]) { + struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]); + size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]); ++ struct endpoint endpoint = { { { 0 } } }; + +- if ((len == sizeof(struct sockaddr_in) && +- addr->sa_family == AF_INET) || +- (len == sizeof(struct sockaddr_in6) && +- addr->sa_family == AF_INET6)) { +- struct endpoint endpoint = { { { 0 } } }; +- +- memcpy(&endpoint.addr, addr, len); ++ if (len == sizeof(struct sockaddr_in) && addr->sa_family == AF_INET) { ++ endpoint.addr4 = *(struct sockaddr_in *)addr; ++ wg_socket_set_peer_endpoint(peer, &endpoint); ++ } else if (len == sizeof(struct sockaddr_in6) && addr->sa_family == AF_INET6) { ++ endpoint.addr6 = *(struct sockaddr_in6 *)addr; + wg_socket_set_peer_endpoint(peer, &endpoint); + } + } +diff --git a/drivers/net/wireguard/selftest/ratelimiter.c b/drivers/net/wireguard/selftest/ratelimiter.c +index ba87d294604fe..d4bb40a695ab6 100644 +--- a/drivers/net/wireguard/selftest/ratelimiter.c ++++ b/drivers/net/wireguard/selftest/ratelimiter.c +@@ -6,29 +6,28 @@ + #ifdef DEBUG + + #include +-#include + + static const struct { + bool result; +- u64 nsec_to_sleep_before; ++ unsigned int msec_to_sleep_before; + } expected_results[] __initconst = { + [0 ... 
PACKETS_BURSTABLE - 1] = { true, 0 }, + [PACKETS_BURSTABLE] = { false, 0 }, +- [PACKETS_BURSTABLE + 1] = { true, NSEC_PER_SEC / PACKETS_PER_SECOND }, ++ [PACKETS_BURSTABLE + 1] = { true, MSEC_PER_SEC / PACKETS_PER_SECOND }, + [PACKETS_BURSTABLE + 2] = { false, 0 }, +- [PACKETS_BURSTABLE + 3] = { true, (NSEC_PER_SEC / PACKETS_PER_SECOND) * 2 }, ++ [PACKETS_BURSTABLE + 3] = { true, (MSEC_PER_SEC / PACKETS_PER_SECOND) * 2 }, + [PACKETS_BURSTABLE + 4] = { true, 0 }, + [PACKETS_BURSTABLE + 5] = { false, 0 } + }; + + static __init unsigned int maximum_jiffies_at_index(int index) + { +- u64 total_nsecs = 2 * NSEC_PER_SEC / PACKETS_PER_SECOND / 3; ++ unsigned int total_msecs = 2 * MSEC_PER_SEC / PACKETS_PER_SECOND / 3; + int i; + + for (i = 0; i <= index; ++i) +- total_nsecs += expected_results[i].nsec_to_sleep_before; +- return nsecs_to_jiffies(total_nsecs); ++ total_msecs += expected_results[i].msec_to_sleep_before; ++ return msecs_to_jiffies(total_msecs); + } + + static __init int timings_test(struct sk_buff *skb4, struct iphdr *hdr4, +@@ -43,12 +42,8 @@ static __init int timings_test(struct sk_buff *skb4, struct iphdr *hdr4, + loop_start_time = jiffies; + + for (i = 0; i < ARRAY_SIZE(expected_results); ++i) { +- if (expected_results[i].nsec_to_sleep_before) { +- ktime_t timeout = ktime_add(ktime_add_ns(ktime_get_coarse_boottime(), TICK_NSEC * 4 / 3), +- ns_to_ktime(expected_results[i].nsec_to_sleep_before)); +- set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_hrtimeout_range_clock(&timeout, 0, HRTIMER_MODE_ABS, CLOCK_BOOTTIME); +- } ++ if (expected_results[i].msec_to_sleep_before) ++ msleep(expected_results[i].msec_to_sleep_before); + + if (time_is_before_jiffies(loop_start_time + + maximum_jiffies_at_index(i))) +@@ -132,7 +127,7 @@ bool __init wg_ratelimiter_selftest(void) + if (IS_ENABLED(CONFIG_KASAN) || IS_ENABLED(CONFIG_UBSAN)) + return true; + +- BUILD_BUG_ON(NSEC_PER_SEC % PACKETS_PER_SECOND != 0); ++ BUILD_BUG_ON(MSEC_PER_SEC % PACKETS_PER_SECOND != 0); + + if (wg_ratelimiter_init()) + goto out; +@@ -172,7 +167,7 @@ bool __init wg_ratelimiter_selftest(void) + ++test; + #endif + +- for (trials = TRIALS_BEFORE_GIVING_UP;;) { ++ for (trials = TRIALS_BEFORE_GIVING_UP; IS_ENABLED(DEBUG_RATELIMITER_TIMINGS);) { + int test_count = 0, ret; + + ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count); +diff --git a/drivers/net/wireless/intel/iwlwifi/Kconfig b/drivers/net/wireless/intel/iwlwifi/Kconfig +index a647a406b87be..b20409f8c13ab 100644 +--- a/drivers/net/wireless/intel/iwlwifi/Kconfig ++++ b/drivers/net/wireless/intel/iwlwifi/Kconfig +@@ -140,6 +140,7 @@ config IWLMEI + depends on INTEL_MEI + depends on PM + depends on CFG80211 ++ depends on BROKEN + help + Enables the iwlmei kernel module. 
+ +diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +index 9e832b27170fe..a4eb025f504f3 100644 +--- a/drivers/net/wireless/mediatek/mt76/mt7615/mac.c ++++ b/drivers/net/wireless/mediatek/mt76/mt7615/mac.c +@@ -1138,7 +1138,7 @@ u32 mt7615_mac_get_sta_tid_sn(struct mt7615_dev *dev, int wcid, u8 tid) + offset %= 32; + + val = mt76_rr(dev, addr); +- val >>= (tid % 32); ++ val >>= offset; + + if (offset > 20) { + addr += 4; +diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c +index 629d10fcf53b2..b9f1a8e9f88cb 100644 +--- a/drivers/nvdimm/pmem.c ++++ b/drivers/nvdimm/pmem.c +@@ -45,7 +45,7 @@ static struct nd_region *to_region(struct pmem_device *pmem) + return to_nd_region(to_dev(pmem)->parent); + } + +-static phys_addr_t to_phys(struct pmem_device *pmem, phys_addr_t offset) ++static phys_addr_t pmem_to_phys(struct pmem_device *pmem, phys_addr_t offset) + { + return pmem->phys_addr + offset; + } +@@ -63,7 +63,7 @@ static phys_addr_t to_offset(struct pmem_device *pmem, sector_t sector) + static void pmem_mkpage_present(struct pmem_device *pmem, phys_addr_t offset, + unsigned int len) + { +- phys_addr_t phys = to_phys(pmem, offset); ++ phys_addr_t phys = pmem_to_phys(pmem, offset); + unsigned long pfn_start, pfn_end, pfn; + + /* only pmem in the linear map supports HWPoison */ +@@ -97,7 +97,7 @@ static void pmem_clear_bb(struct pmem_device *pmem, sector_t sector, long blks) + static long __pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) + { +- phys_addr_t phys = to_phys(pmem, offset); ++ phys_addr_t phys = pmem_to_phys(pmem, offset); + long cleared = nvdimm_clear_poison(to_dev(pmem), phys, len); + + if (cleared > 0) { +diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c +index d702d7d60235d..2d23b7d41f7e6 100644 +--- a/drivers/nvme/host/apple.c ++++ b/drivers/nvme/host/apple.c +@@ -1502,7 +1502,7 @@ static int apple_nvme_probe(struct platform_device *pdev) + + if (!blk_get_queue(anv->ctrl.admin_q)) { + nvme_start_admin_queue(&anv->ctrl); +- blk_cleanup_queue(anv->ctrl.admin_q); ++ blk_mq_destroy_queue(anv->ctrl.admin_q); + anv->ctrl.admin_q = NULL; + ret = -ENODEV; + goto put_dev; +diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c +index 2f965356f3453..6d76fc608b741 100644 +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -4105,7 +4105,6 @@ static void nvme_ns_remove(struct nvme_ns *ns) + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); + del_gendisk(ns->disk); +- blk_cleanup_queue(ns->queue); + + down_write(&ns->ctrl->namespaces_rwsem); + list_del_init(&ns->list); +diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c +index 4aff83b1b0c05..9a5ce70d7f215 100644 +--- a/drivers/nvme/host/fc.c ++++ b/drivers/nvme/host/fc.c +@@ -2392,7 +2392,7 @@ nvme_fc_ctrl_free(struct kref *ref) + unsigned long flags; + + if (ctrl->ctrl.tagset) { +- blk_cleanup_queue(ctrl->ctrl.connect_q); ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + +@@ -2402,8 +2402,8 @@ nvme_fc_ctrl_free(struct kref *ref) + spin_unlock_irqrestore(&ctrl->rport->lock, flags); + + nvme_start_admin_queue(&ctrl->ctrl); +- blk_cleanup_queue(ctrl->ctrl.admin_q); +- blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + + kfree(ctrl->queues); +@@ -2953,7 +2953,7 @@ 
nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) + out_delete_hw_queues: + nvme_fc_delete_hw_io_queues(ctrl); + out_cleanup_blk_queue: +- blk_cleanup_queue(ctrl->ctrl.connect_q); ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); + out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_fc_free_io_queues(ctrl); +@@ -3642,9 +3642,9 @@ fail_ctrl: + return ERR_PTR(-EIO); + + out_cleanup_admin_q: +- blk_cleanup_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); + out_cleanup_fabrics_q: +- blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); + out_free_admin_tag_set: + blk_mq_free_tag_set(&ctrl->admin_tag_set); + out_free_queues: +diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c +index 9f6614f7dbeb1..3516678d37541 100644 +--- a/drivers/nvme/host/pci.c ++++ b/drivers/nvme/host/pci.c +@@ -1760,7 +1760,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) + * queue to flush these to completion. + */ + nvme_start_admin_queue(&dev->ctrl); +- blk_cleanup_queue(dev->ctrl.admin_q); ++ blk_mq_destroy_queue(dev->ctrl.admin_q); + blk_mq_free_tag_set(&dev->admin_tagset); + } + } +diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c +index 46c2dcf72f7ea..240024dd5d857 100644 +--- a/drivers/nvme/host/rdma.c ++++ b/drivers/nvme/host/rdma.c +@@ -840,8 +840,8 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) + { + if (remove) { +- blk_cleanup_queue(ctrl->ctrl.admin_q); +- blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); + } + if (ctrl->async_event_sqe.data) { +@@ -935,10 +935,10 @@ out_stop_queue: + nvme_cancel_admin_tagset(&ctrl->ctrl); + out_cleanup_queue: + if (new) +- blk_cleanup_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); + out_cleanup_fabrics_q: + if (new) +- blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); + out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); +@@ -957,7 +957,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) + { + if (remove) { +- blk_cleanup_queue(ctrl->ctrl.connect_q); ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(ctrl->ctrl.tagset); + } + nvme_rdma_free_io_queues(ctrl); +@@ -1012,7 +1012,7 @@ out_wait_freeze_timed_out: + out_cleanup_connect_q: + nvme_cancel_tagset(&ctrl->ctrl); + if (new) +- blk_cleanup_queue(ctrl->ctrl.connect_q); ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); + out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->ctrl.tagset); +diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c +index daa0e160e1212..d7e5bbdb9b75a 100644 +--- a/drivers/nvme/host/tcp.c ++++ b/drivers/nvme/host/tcp.c +@@ -1881,7 +1881,7 @@ static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) + { + nvme_tcp_stop_io_queues(ctrl); + if (remove) { +- blk_cleanup_queue(ctrl->connect_q); ++ blk_mq_destroy_queue(ctrl->connect_q); + blk_mq_free_tag_set(ctrl->tagset); + } + nvme_tcp_free_io_queues(ctrl); +@@ -1936,7 +1936,7 @@ out_wait_freeze_timed_out: + out_cleanup_connect_q: + nvme_cancel_tagset(ctrl); + if (new) +- blk_cleanup_queue(ctrl->connect_q); ++ blk_mq_destroy_queue(ctrl->connect_q); + out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->tagset); +@@ -1949,8 +1949,8 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) 
+ { + nvme_tcp_stop_queue(ctrl, 0); + if (remove) { +- blk_cleanup_queue(ctrl->admin_q); +- blk_cleanup_queue(ctrl->fabrics_q); ++ blk_mq_destroy_queue(ctrl->admin_q); ++ blk_mq_destroy_queue(ctrl->fabrics_q); + blk_mq_free_tag_set(ctrl->admin_tagset); + } + nvme_tcp_free_admin_queue(ctrl); +@@ -2008,10 +2008,10 @@ out_stop_queue: + nvme_cancel_admin_tagset(ctrl); + out_cleanup_queue: + if (new) +- blk_cleanup_queue(ctrl->admin_q); ++ blk_mq_destroy_queue(ctrl->admin_q); + out_cleanup_fabrics_q: + if (new) +- blk_cleanup_queue(ctrl->fabrics_q); ++ blk_mq_destroy_queue(ctrl->fabrics_q); + out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->admin_tagset); +diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c +index 59024af2da2e3..0f5c77e22a0a9 100644 +--- a/drivers/nvme/target/loop.c ++++ b/drivers/nvme/target/loop.c +@@ -266,8 +266,8 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) + if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) + return; + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); +- blk_cleanup_queue(ctrl->ctrl.admin_q); +- blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + } + +@@ -283,7 +283,7 @@ static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { +- blk_cleanup_queue(ctrl->ctrl.connect_q); ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); +@@ -410,9 +410,9 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) + + out_cleanup_queue: + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); +- blk_cleanup_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); + out_cleanup_fabrics_q: +- blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); + out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); + out_free_sq: +@@ -554,7 +554,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) + return 0; + + out_cleanup_connect_q: +- blk_cleanup_queue(ctrl->ctrl.connect_q); ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); + out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); + out_destroy_queues: +diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c +index 80d8309652a4d..b80a9b74662b1 100644 +--- a/drivers/perf/arm-cmn.c ++++ b/drivers/perf/arm-cmn.c +@@ -36,7 +36,7 @@ + #define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) + #define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) + +-#define CMN_CHILD_NODE_ADDR GENMASK(27, 0) ++#define CMN_CHILD_NODE_ADDR GENMASK(29, 0) + #define CMN_CHILD_NODE_EXTERNAL BIT(31) + + #define CMN_MAX_DIMENSION 12 +diff --git a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c +index a4d7d9bd100d3..67712c77d806f 100644 +--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c ++++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c +@@ -274,7 +274,6 @@ struct mvebu_a3700_comphy_lane { + int submode; + bool invert_tx; + bool invert_rx; +- bool needs_reset; + }; + + struct gbe_phy_init_data_fix { +@@ -1097,40 +1096,12 @@ mvebu_a3700_comphy_pcie_power_off(struct mvebu_a3700_comphy_lane *lane) + 0x0, PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT); + } + +-static int mvebu_a3700_comphy_reset(struct phy *phy) ++static void mvebu_a3700_comphy_usb3_power_off(struct mvebu_a3700_comphy_lane *lane) + { +- struct mvebu_a3700_comphy_lane *lane = 
phy_get_drvdata(phy); +- u16 mask, data; +- +- dev_dbg(lane->dev, "resetting lane %d\n", lane->id); +- +- /* COMPHY reset for internal logic */ +- comphy_lane_reg_set(lane, COMPHY_SFT_RESET, +- SFT_RST_NO_REG, SFT_RST_NO_REG); +- +- /* COMPHY register reset (cleared automatically) */ +- comphy_lane_reg_set(lane, COMPHY_SFT_RESET, SFT_RST, SFT_RST); +- +- /* PIPE soft and register reset */ +- data = PIPE_SOFT_RESET | PIPE_REG_RESET; +- mask = data; +- comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask); +- +- /* Release PIPE register reset */ +- comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, +- 0x0, PIPE_REG_RESET); +- +- /* Reset SB configuration register (only for lanes 0 and 1) */ +- if (lane->id == 0 || lane->id == 1) { +- u32 mask, data; +- +- data = PIN_RESET_CORE_BIT | PIN_RESET_COMPHY_BIT | +- PIN_PU_PLL_BIT | PIN_PU_RX_BIT | PIN_PU_TX_BIT; +- mask = data | PIN_PU_IVREF_BIT | PIN_TX_IDLE_BIT; +- comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask); +- } +- +- return 0; ++ /* ++ * The USB3 MAC sets the USB3 PHY to low state, so we do not ++ * need to power off USB3 PHY again. ++ */ + } + + static bool mvebu_a3700_comphy_check_mode(int lane, +@@ -1171,10 +1142,6 @@ static int mvebu_a3700_comphy_set_mode(struct phy *phy, enum phy_mode mode, + (lane->mode != mode || lane->submode != submode)) + return -EBUSY; + +- /* If changing mode, ensure reset is called */ +- if (lane->mode != PHY_MODE_INVALID && lane->mode != mode) +- lane->needs_reset = true; +- + /* Just remember the mode, ->power_on() will do the real setup */ + lane->mode = mode; + lane->submode = submode; +@@ -1185,7 +1152,6 @@ static int mvebu_a3700_comphy_set_mode(struct phy *phy, enum phy_mode mode, + static int mvebu_a3700_comphy_power_on(struct phy *phy) + { + struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy); +- int ret; + + if (!mvebu_a3700_comphy_check_mode(lane->id, lane->mode, + lane->submode)) { +@@ -1193,14 +1159,6 @@ static int mvebu_a3700_comphy_power_on(struct phy *phy) + return -EINVAL; + } + +- if (lane->needs_reset) { +- ret = mvebu_a3700_comphy_reset(phy); +- if (ret) +- return ret; +- +- lane->needs_reset = false; +- } +- + switch (lane->mode) { + case PHY_MODE_USB_HOST_SS: + dev_dbg(lane->dev, "set lane %d to USB3 host mode\n", lane->id); +@@ -1224,38 +1182,28 @@ static int mvebu_a3700_comphy_power_off(struct phy *phy) + { + struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy); + +- switch (lane->mode) { +- case PHY_MODE_USB_HOST_SS: +- /* +- * The USB3 MAC sets the USB3 PHY to low state, so we do not +- * need to power off USB3 PHY again. 
+- */ +- break; +- +- case PHY_MODE_SATA: +- mvebu_a3700_comphy_sata_power_off(lane); +- break; +- +- case PHY_MODE_ETHERNET: ++ switch (lane->id) { ++ case 0: ++ mvebu_a3700_comphy_usb3_power_off(lane); + mvebu_a3700_comphy_ethernet_power_off(lane); +- break; +- +- case PHY_MODE_PCIE: ++ return 0; ++ case 1: + mvebu_a3700_comphy_pcie_power_off(lane); +- break; +- ++ mvebu_a3700_comphy_ethernet_power_off(lane); ++ return 0; ++ case 2: ++ mvebu_a3700_comphy_usb3_power_off(lane); ++ mvebu_a3700_comphy_sata_power_off(lane); ++ return 0; + default: + dev_err(lane->dev, "invalid COMPHY mode\n"); + return -EINVAL; + } +- +- return 0; + } + + static const struct phy_ops mvebu_a3700_comphy_ops = { + .power_on = mvebu_a3700_comphy_power_on, + .power_off = mvebu_a3700_comphy_power_off, +- .reset = mvebu_a3700_comphy_reset, + .set_mode = mvebu_a3700_comphy_set_mode, + .owner = THIS_MODULE, + }; +@@ -1393,8 +1341,7 @@ static int mvebu_a3700_comphy_probe(struct platform_device *pdev) + * To avoid relying on the bootloader/firmware configuration, + * power off all comphys. + */ +- mvebu_a3700_comphy_reset(phy); +- lane->needs_reset = false; ++ mvebu_a3700_comphy_power_off(phy); + } + + provider = devm_of_phy_provider_register(&pdev->dev, +diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c +index ba6d787896606..e8489331f12b8 100644 +--- a/drivers/s390/block/dasd.c ++++ b/drivers/s390/block/dasd.c +@@ -3280,7 +3280,7 @@ static int dasd_alloc_queue(struct dasd_block *block) + static void dasd_free_queue(struct dasd_block *block) + { + if (block->request_queue) { +- blk_cleanup_queue(block->request_queue); ++ blk_mq_destroy_queue(block->request_queue); + blk_mq_free_tag_set(&block->tag_set); + block->request_queue = NULL; + } +diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c +index dc78a523a69f2..b6b938aa66158 100644 +--- a/drivers/s390/block/dasd_alias.c ++++ b/drivers/s390/block/dasd_alias.c +@@ -675,12 +675,12 @@ int dasd_alias_remove_device(struct dasd_device *device) + struct dasd_device *dasd_alias_get_start_dev(struct dasd_device *base_device) + { + struct dasd_eckd_private *alias_priv, *private = base_device->private; +- struct alias_pav_group *group = private->pavgroup; + struct alias_lcu *lcu = private->lcu; + struct dasd_device *alias_device; ++ struct alias_pav_group *group; + unsigned long flags; + +- if (!group || !lcu) ++ if (!lcu) + return NULL; + if (lcu->pav == NO_PAV || + lcu->flags & (NEED_UAC_UPDATE | UPDATE_PENDING)) +@@ -697,6 +697,11 @@ struct dasd_device *dasd_alias_get_start_dev(struct dasd_device *base_device) + } + + spin_lock_irqsave(&lcu->lock, flags); ++ group = private->pavgroup; ++ if (!group) { ++ spin_unlock_irqrestore(&lcu->lock, flags); ++ return NULL; ++ } + alias_device = group->next; + if (!alias_device) { + if (list_empty(&group->aliaslist)) { +diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c +index a7a33ebf4bbe9..5a83f0a39901b 100644 +--- a/drivers/s390/block/dasd_genhd.c ++++ b/drivers/s390/block/dasd_genhd.c +@@ -41,8 +41,8 @@ int dasd_gendisk_alloc(struct dasd_block *block) + if (base->devindex >= DASD_PER_MAJOR) + return -EBUSY; + +- gdp = __alloc_disk_node(block->request_queue, NUMA_NO_NODE, +- &dasd_bio_compl_lkclass); ++ gdp = blk_mq_alloc_disk_for_queue(block->request_queue, ++ &dasd_bio_compl_lkclass); + if (!gdp) + return -ENOMEM; + +diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c +index 8352f90d997df..ae9a107c520d0 100644 +--- a/drivers/scsi/hosts.c ++++ 
b/drivers/scsi/hosts.c +@@ -182,6 +182,15 @@ void scsi_remove_host(struct Scsi_Host *shost) + mutex_unlock(&shost->scan_mutex); + scsi_proc_host_rm(shost); + ++ /* ++ * New SCSI devices cannot be attached anymore because of the SCSI host ++ * state so drop the tag set refcnt. Wait until the tag set refcnt drops ++ * to zero because .exit_cmd_priv implementations may need the host ++ * pointer. ++ */ ++ kref_put(&shost->tagset_refcnt, scsi_mq_free_tags); ++ wait_for_completion(&shost->tagset_freed); ++ + spin_lock_irqsave(shost->host_lock, flags); + if (scsi_host_set_state(shost, SHOST_DEL)) + BUG_ON(scsi_host_set_state(shost, SHOST_DEL_RECOVERY)); +@@ -240,6 +249,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, + if (error) + goto fail; + ++ kref_init(&shost->tagset_refcnt); ++ init_completion(&shost->tagset_freed); ++ + /* + * Increase usage count temporarily here so that calling + * scsi_autopm_put_host() will trigger runtime idle if there is +@@ -312,6 +324,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, + pm_runtime_disable(&shost->shost_gendev); + pm_runtime_set_suspended(&shost->shost_gendev); + pm_runtime_put_noidle(&shost->shost_gendev); ++ kref_put(&shost->tagset_refcnt, scsi_mq_free_tags); + fail: + return error; + } +@@ -345,9 +358,6 @@ static void scsi_host_dev_release(struct device *dev) + kfree(dev_name(&shost->shost_dev)); + } + +- if (shost->tag_set.tags) +- scsi_mq_destroy_tags(shost); +- + kfree(shost->shost_data); + + ida_simple_remove(&host_index_ida, shost->host_no); +diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c +index 9a1ae52bb621d..a6d3471a61057 100644 +--- a/drivers/scsi/mpt3sas/mpt3sas_base.c ++++ b/drivers/scsi/mpt3sas/mpt3sas_base.c +@@ -2993,7 +2993,7 @@ _base_config_dma_addressing(struct MPT3SAS_ADAPTER *ioc, struct pci_dev *pdev) + + if (ioc->is_mcpu_endpoint || + sizeof(dma_addr_t) == 4 || ioc->use_32bit_dma || +- dma_get_required_mask(&pdev->dev) <= 32) ++ dma_get_required_mask(&pdev->dev) <= DMA_BIT_MASK(32)) + ioc->dma_mask = 32; + /* Set 63 bit DMA mask for all SAS3 and SAS35 controllers */ + else if (ioc->hba_mpi_version_belonged > MPI2_VERSION) +diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c +index 62666df1a59eb..4acff4e84b909 100644 +--- a/drivers/scsi/qla2xxx/qla_target.c ++++ b/drivers/scsi/qla2xxx/qla_target.c +@@ -2151,8 +2151,10 @@ static int __qlt_24xx_handle_abts(struct scsi_qla_host *vha, + + abort_cmd = ha->tgt.tgt_ops->find_cmd_by_tag(sess, + le32_to_cpu(abts->exchange_addr_to_abort)); +- if (!abort_cmd) ++ if (!abort_cmd) { ++ mempool_free(mcmd, qla_tgt_mgmt_cmd_mempool); + return -EIO; ++ } + mcmd->unpacked_lun = abort_cmd->se_cmd.orig_fe_lun; + + if (abort_cmd->qpair) { +diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c +index f5c876d03c1ad..7e990f7a9f164 100644 +--- a/drivers/scsi/scsi_lib.c ++++ b/drivers/scsi/scsi_lib.c +@@ -168,7 +168,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) + * Requeue this command. It will go before all other commands + * that are already in the queue. Schedule requeue work under + * lock such that the kblockd_schedule_work() call happens +- * before blk_cleanup_queue() finishes. ++ * before blk_mq_destroy_queue() finishes. + */ + cmd->result = 0; + +@@ -429,9 +429,9 @@ static void scsi_starved_list_run(struct Scsi_Host *shost) + * it and the queue. 
Mitigate by taking a reference to the + * queue and never touching the sdev again after we drop the + * host lock. Note: if __scsi_remove_device() invokes +- * blk_cleanup_queue() before the queue is run from this ++ * blk_mq_destroy_queue() before the queue is run from this + * function then blk_run_queue() will return immediately since +- * blk_cleanup_queue() marks the queue with QUEUE_FLAG_DYING. ++ * blk_mq_destroy_queue() marks the queue with QUEUE_FLAG_DYING. + */ + slq = sdev->request_queue; + if (!blk_get_queue(slq)) +@@ -1995,9 +1995,13 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) + return blk_mq_alloc_tag_set(tag_set); + } + +-void scsi_mq_destroy_tags(struct Scsi_Host *shost) ++void scsi_mq_free_tags(struct kref *kref) + { ++ struct Scsi_Host *shost = container_of(kref, typeof(*shost), ++ tagset_refcnt); ++ + blk_mq_free_tag_set(&shost->tag_set); ++ complete(&shost->tagset_freed); + } + + /** +diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h +index 5c4786310a31d..a0ee31d55f5f1 100644 +--- a/drivers/scsi/scsi_priv.h ++++ b/drivers/scsi/scsi_priv.h +@@ -94,7 +94,7 @@ extern void scsi_run_host_queues(struct Scsi_Host *shost); + extern void scsi_requeue_run_queue(struct work_struct *work); + extern void scsi_start_queue(struct scsi_device *sdev); + extern int scsi_mq_setup_tags(struct Scsi_Host *shost); +-extern void scsi_mq_destroy_tags(struct Scsi_Host *shost); ++extern void scsi_mq_free_tags(struct kref *kref); + extern void scsi_exit_queue(void); + extern void scsi_evt_thread(struct work_struct *work); + +diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c +index 91ac901a66826..5d27f5196de6f 100644 +--- a/drivers/scsi/scsi_scan.c ++++ b/drivers/scsi/scsi_scan.c +@@ -340,6 +340,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, + kfree(sdev); + goto out; + } ++ kref_get(&sdev->host->tagset_refcnt); + sdev->request_queue = q; + q->queuedata = sdev; + __scsi_init_queue(sdev->host, q); +diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c +index 43949798a2e47..5d61f58399dca 100644 +--- a/drivers/scsi/scsi_sysfs.c ++++ b/drivers/scsi/scsi_sysfs.c +@@ -1475,7 +1475,8 @@ void __scsi_remove_device(struct scsi_device *sdev) + scsi_device_set_state(sdev, SDEV_DEL); + mutex_unlock(&sdev->state_mutex); + +- blk_cleanup_queue(sdev->request_queue); ++ blk_mq_destroy_queue(sdev->request_queue); ++ kref_put(&sdev->host->tagset_refcnt, scsi_mq_free_tags); + cancel_work_sync(&sdev->requeue_work); + + if (sdev->host->hostt->slave_destroy) +diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c +index a1a2ac09066fd..cb587e488601c 100644 +--- a/drivers/scsi/sd.c ++++ b/drivers/scsi/sd.c +@@ -3440,8 +3440,8 @@ static int sd_probe(struct device *dev) + if (!sdkp) + goto out; + +- gd = __alloc_disk_node(sdp->request_queue, NUMA_NO_NODE, +- &sd_bio_compl_lkclass); ++ gd = blk_mq_alloc_disk_for_queue(sdp->request_queue, ++ &sd_bio_compl_lkclass); + if (!gd) + goto out_free; + +diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c +index 32d3b8274f148..a278b739d0c5f 100644 +--- a/drivers/scsi/sr.c ++++ b/drivers/scsi/sr.c +@@ -624,8 +624,8 @@ static int sr_probe(struct device *dev) + if (!cd) + goto fail; + +- disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE, +- &sr_bio_compl_lkclass); ++ disk = blk_mq_alloc_disk_for_queue(sdev->request_queue, ++ &sr_bio_compl_lkclass); + if (!disk) + goto fail_free; + mutex_init(&cd->lock); +diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c +index fff0c740c8f33..6f088dd0ba4f3 
100644 +--- a/drivers/thunderbolt/icm.c ++++ b/drivers/thunderbolt/icm.c +@@ -2527,6 +2527,7 @@ struct tb *icm_probe(struct tb_nhi *nhi) + tb->cm_ops = &icm_icl_ops; + break; + ++ case PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_2C_NHI: + case PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_4C_NHI: + icm->is_supported = icm_tgl_is_supported; + icm->get_mode = icm_ar_get_mode; +diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h +index 69083aab2736c..5091677b3f4ba 100644 +--- a/drivers/thunderbolt/nhi.h ++++ b/drivers/thunderbolt/nhi.h +@@ -55,6 +55,7 @@ extern const struct tb_nhi_ops icl_nhi_ops; + * need for the PCI quirk anymore as we will use ICM also on Apple + * hardware. + */ ++#define PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_2C_NHI 0x1134 + #define PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_4C_NHI 0x1137 + #define PCI_DEVICE_ID_INTEL_WIN_RIDGE_2C_NHI 0x157d + #define PCI_DEVICE_ID_INTEL_WIN_RIDGE_2C_BRIDGE 0x157e +diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c +index 2945c1b890880..cb83c66bd8a82 100644 +--- a/drivers/tty/serial/fsl_lpuart.c ++++ b/drivers/tty/serial/fsl_lpuart.c +@@ -2706,14 +2706,15 @@ static int lpuart_probe(struct platform_device *pdev) + lpuart_reg.cons = LPUART_CONSOLE; + handler = lpuart_int; + } +- ret = uart_add_one_port(&lpuart_reg, &sport->port); +- if (ret) +- goto failed_attach_port; + + ret = lpuart_global_reset(sport); + if (ret) + goto failed_reset; + ++ ret = uart_add_one_port(&lpuart_reg, &sport->port); ++ if (ret) ++ goto failed_attach_port; ++ + ret = uart_get_rs485_mode(&sport->port); + if (ret) + goto failed_get_rs485; +@@ -2736,9 +2737,9 @@ static int lpuart_probe(struct platform_device *pdev) + + failed_irq_request: + failed_get_rs485: +-failed_reset: + uart_remove_one_port(&lpuart_reg, &sport->port); + failed_attach_port: ++failed_reset: + lpuart_disable_clks(sport); + return ret; + } +diff --git a/drivers/tty/serial/serial-tegra.c b/drivers/tty/serial/serial-tegra.c +index d942ab152f5a4..24aa1dcc5ef7a 100644 +--- a/drivers/tty/serial/serial-tegra.c ++++ b/drivers/tty/serial/serial-tegra.c +@@ -525,7 +525,7 @@ static void tegra_uart_tx_dma_complete(void *args) + count = tup->tx_bytes_requested - state.residue; + async_tx_ack(tup->tx_dma_desc); + spin_lock_irqsave(&tup->uport.lock, flags); +- xmit->tail = (xmit->tail + count) & (UART_XMIT_SIZE - 1); ++ uart_xmit_advance(&tup->uport, count); + tup->tx_in_progress = 0; + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(&tup->uport); +@@ -613,7 +613,6 @@ static unsigned int tegra_uart_tx_empty(struct uart_port *u) + static void tegra_uart_stop_tx(struct uart_port *u) + { + struct tegra_uart_port *tup = to_tegra_uport(u); +- struct circ_buf *xmit = &tup->uport.state->xmit; + struct dma_tx_state state; + unsigned int count; + +@@ -624,7 +623,7 @@ static void tegra_uart_stop_tx(struct uart_port *u) + dmaengine_tx_status(tup->tx_dma_chan, tup->tx_cookie, &state); + count = tup->tx_bytes_requested - state.residue; + async_tx_ack(tup->tx_dma_desc); +- xmit->tail = (xmit->tail + count) & (UART_XMIT_SIZE - 1); ++ uart_xmit_advance(&tup->uport, count); + tup->tx_in_progress = 0; + } + +diff --git a/drivers/tty/serial/tegra-tcu.c b/drivers/tty/serial/tegra-tcu.c +index 4877c54c613d1..889b701ba7c62 100644 +--- a/drivers/tty/serial/tegra-tcu.c ++++ b/drivers/tty/serial/tegra-tcu.c +@@ -101,7 +101,7 @@ static void tegra_tcu_uart_start_tx(struct uart_port *port) + break; + + tegra_tcu_write(tcu, &xmit->buf[xmit->tail], count); +- xmit->tail = (xmit->tail + count) & (UART_XMIT_SIZE - 1); ++ 
uart_xmit_advance(port, count); + } + + uart_write_wakeup(port); +diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c +index 829da9cb14a86..55bb0d0422d52 100644 +--- a/drivers/ufs/core/ufshcd.c ++++ b/drivers/ufs/core/ufshcd.c +@@ -9519,7 +9519,7 @@ void ufshcd_remove(struct ufs_hba *hba) + ufs_bsg_remove(hba); + ufshpb_remove(hba); + ufs_sysfs_remove_nodes(hba->dev); +- blk_cleanup_queue(hba->tmf_queue); ++ blk_mq_destroy_queue(hba->tmf_queue); + blk_mq_free_tag_set(&hba->tmf_tag_set); + scsi_remove_host(hba->host); + /* disable interrupts */ +@@ -9815,7 +9815,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) + return 0; + + free_tmf_queue: +- blk_cleanup_queue(hba->tmf_queue); ++ blk_mq_destroy_queue(hba->tmf_queue); + free_tmf_tag_set: + blk_mq_free_tag_set(&hba->tmf_tag_set); + out_remove_scsi_host: +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index dfef85a18eb55..80b29f937c605 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -6049,7 +6049,7 @@ re_enumerate: + * + * Return: The same as for usb_reset_and_verify_device(). + * However, if a reset is already in progress (for instance, if a +- * driver doesn't have pre_ or post_reset() callbacks, and while ++ * driver doesn't have pre_reset() or post_reset() callbacks, and while + * being unbound or re-bound during the ongoing reset its disconnect() + * or probe() routine tries to perform a second, nested reset), the + * routine returns -EINPROGRESS. +diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c +index 1db9f51f98aef..08ca65ffe57b7 100644 +--- a/drivers/usb/dwc3/core.c ++++ b/drivers/usb/dwc3/core.c +@@ -1718,12 +1718,6 @@ static int dwc3_probe(struct platform_device *pdev) + + dwc3_get_properties(dwc); + +- if (!dwc->sysdev_is_parent) { +- ret = dma_set_mask_and_coherent(dwc->sysdev, DMA_BIT_MASK(64)); +- if (ret) +- return ret; +- } +- + dwc->reset = devm_reset_control_array_get_optional_shared(dev); + if (IS_ERR(dwc->reset)) + return PTR_ERR(dwc->reset); +@@ -1789,6 +1783,13 @@ static int dwc3_probe(struct platform_device *pdev) + platform_set_drvdata(pdev, dwc); + dwc3_cache_hwparams(dwc); + ++ if (!dwc->sysdev_is_parent && ++ DWC3_GHWPARAMS0_AWIDTH(dwc->hwparams.hwparams0) == 64) { ++ ret = dma_set_mask_and_coherent(dwc->sysdev, DMA_BIT_MASK(64)); ++ if (ret) ++ goto disable_clks; ++ } ++ + spin_lock_init(&dwc->lock); + mutex_init(&dwc->mutex); + +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index a5e8374a8d710..697683e3fbffa 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -256,6 +256,7 @@ static void option_instat_callback(struct urb *urb); + #define QUECTEL_PRODUCT_EM060K 0x030b + #define QUECTEL_PRODUCT_EM12 0x0512 + #define QUECTEL_PRODUCT_RM500Q 0x0800 ++#define QUECTEL_PRODUCT_RM520N 0x0801 + #define QUECTEL_PRODUCT_EC200S_CN 0x6002 + #define QUECTEL_PRODUCT_EC200T 0x6026 + #define QUECTEL_PRODUCT_RM500K 0x7001 +@@ -1138,6 +1139,8 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0xff, 0xff), + .driver_info = NUMEP2 }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG95, 0xff, 0, 0) }, ++ { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, 0x0203, 0xff), /* BG95-M3 */ ++ .driver_info = ZLP }, + { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_BG96), + .driver_info = RSVD(4) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0xff, 
0xff), +@@ -1159,6 +1162,9 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), + .driver_info = ZLP }, ++ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM520N, 0xff, 0xff, 0x30) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM520N, 0xff, 0, 0x40) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM520N, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200S_CN, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500K, 0xff, 0x00, 0x00) }, +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c +index d5f3f763717ea..d4b2519257962 100644 +--- a/drivers/xen/xenbus/xenbus_client.c ++++ b/drivers/xen/xenbus/xenbus_client.c +@@ -382,9 +382,10 @@ int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, + unsigned long ring_size = nr_pages * XEN_PAGE_SIZE; + grant_ref_t gref_head; + unsigned int i; ++ void *addr; + int ret; + +- *vaddr = alloc_pages_exact(ring_size, gfp | __GFP_ZERO); ++ addr = *vaddr = alloc_pages_exact(ring_size, gfp | __GFP_ZERO); + if (!*vaddr) { + ret = -ENOMEM; + goto err; +@@ -401,13 +402,15 @@ int xenbus_setup_ring(struct xenbus_device *dev, gfp_t gfp, void **vaddr, + unsigned long gfn; + + if (is_vmalloc_addr(*vaddr)) +- gfn = pfn_to_gfn(vmalloc_to_pfn(vaddr[i])); ++ gfn = pfn_to_gfn(vmalloc_to_pfn(addr)); + else +- gfn = virt_to_gfn(vaddr[i]); ++ gfn = virt_to_gfn(addr); + + grefs[i] = gnttab_claim_grant_reference(&gref_head); + gnttab_grant_foreign_access_ref(grefs[i], dev->otherend_id, + gfn, 0); ++ ++ addr += XEN_PAGE_SIZE; + } + + return 0; +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 781952c5a5c23..20ad619a8a973 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -4586,6 +4586,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) + + set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + ++ /* ++ * If we had UNFINISHED_DROPS we could still be processing them, so ++ * clear that bit and wake up relocation so it can stop. ++ * We must do this before stopping the block group reclaim task, because ++ * at btrfs_relocate_block_group() we wait for this bit, and after the ++ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we ++ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will ++ * return 1. ++ */ ++ btrfs_wake_unfinished_drop(fs_info); ++ + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park +@@ -4604,12 +4615,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) + */ + kthread_park(fs_info->cleaner_kthread); + +- /* +- * If we had UNFINISHED_DROPS we could still be processing them, so +- * clear that bit and wake up relocation so it can stop. 
+- */ +- btrfs_wake_unfinished_drop(fs_info); +- + /* wait for the qgroup rescan worker to stop */ + btrfs_qgroup_wait_for_completion(fs_info, false); + +@@ -4632,6 +4637,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) + /* clear out the rbtree of defraggable inodes */ + btrfs_cleanup_defrag_inodes(fs_info); + ++ /* ++ * After we parked the cleaner kthread, ordered extents may have ++ * completed and created new delayed iputs. If one of the async reclaim ++ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we ++ * can hang forever trying to stop it, because if a delayed iput is ++ * added after it ran btrfs_run_delayed_iputs() and before it called ++ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is ++ * no one else to run iputs. ++ * ++ * So wait for all ongoing ordered extents to complete and then run ++ * delayed iputs. This works because once we reach this point no one ++ * can either create new ordered extents nor create delayed iputs ++ * through some other means. ++ * ++ * Also note that btrfs_wait_ordered_roots() is not safe here, because ++ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, ++ * but the delayed iput for the respective inode is made only when doing ++ * the final btrfs_put_ordered_extent() (which must happen at ++ * btrfs_finish_ordered_io() when we are unmounting). ++ */ ++ btrfs_flush_workqueue(fs_info->endio_write_workers); ++ /* Ordered extents for free space inodes. */ ++ btrfs_flush_workqueue(fs_info->endio_freespace_worker); ++ btrfs_run_delayed_iputs(fs_info); ++ + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 1386362fad3b8..4448b7b6ea221 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1918,10 +1918,44 @@ out_unlock: + return ret; + } + ++static void wait_eb_writebacks(struct btrfs_block_group *block_group) ++{ ++ struct btrfs_fs_info *fs_info = block_group->fs_info; ++ const u64 end = block_group->start + block_group->length; ++ struct radix_tree_iter iter; ++ struct extent_buffer *eb; ++ void __rcu **slot; ++ ++ rcu_read_lock(); ++ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, ++ block_group->start >> fs_info->sectorsize_bits) { ++ eb = radix_tree_deref_slot(slot); ++ if (!eb) ++ continue; ++ if (radix_tree_deref_retry(eb)) { ++ slot = radix_tree_iter_retry(&iter); ++ continue; ++ } ++ ++ if (eb->start < block_group->start) ++ continue; ++ if (eb->start >= end) ++ break; ++ ++ slot = radix_tree_iter_resume(slot, &iter); ++ rcu_read_unlock(); ++ wait_on_extent_buffer_writeback(eb); ++ rcu_read_lock(); ++ } ++ rcu_read_unlock(); ++} ++ + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) + { + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; ++ const bool is_metadata = (block_group->flags & ++ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + int ret = 0; + int i; + +@@ -1932,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ + } + + /* Check if we have unwritten allocated space */ +- if ((block_group->flags & +- (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && ++ if (is_metadata && + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { + spin_unlock(&block_group->lock); + return -EAGAIN; +@@ -1958,6 +1991,9 @@ static int do_zone_finish(struct 
btrfs_block_group *block_group, bool fully_writ + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); ++ /* Wait for extent buffers to be written. */ ++ if (is_metadata) ++ wait_eb_writebacks(block_group); + + spin_lock(&block_group->lock); + +diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c +index 8f2e003e05907..97278c43f8dc0 100644 +--- a/fs/cifs/cifsfs.c ++++ b/fs/cifs/cifsfs.c +@@ -1232,6 +1232,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, + lock_two_nondirectories(target_inode, src_inode); + + cifs_dbg(FYI, "about to flush pages\n"); ++ ++ rc = filemap_write_and_wait_range(src_inode->i_mapping, off, ++ off + len - 1); ++ if (rc) ++ goto out; ++ + /* should we flush first and last page first */ + truncate_inode_pages(&target_inode->i_data, 0); + +diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c +index e8a8daa82ed76..cc180d37b8ce1 100644 +--- a/fs/cifs/smb2ops.c ++++ b/fs/cifs/smb2ops.c +@@ -1886,17 +1886,8 @@ smb2_copychunk_range(const unsigned int xid, + int chunks_copied = 0; + bool chunk_sizes_updated = false; + ssize_t bytes_written, total_bytes_written = 0; +- struct inode *inode; + + pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); +- +- /* +- * We need to flush all unwritten data before we can send the +- * copychunk ioctl to the server. +- */ +- inode = d_inode(trgtfile->dentry); +- filemap_write_and_wait(inode->i_mapping); +- + if (pcchunk == NULL) + return -ENOMEM; + +@@ -3961,39 +3952,50 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, + { + int rc; + unsigned int xid; +- struct inode *inode; ++ struct inode *inode = file_inode(file); + struct cifsFileInfo *cfile = file->private_data; +- struct cifsInodeInfo *cifsi; ++ struct cifsInodeInfo *cifsi = CIFS_I(inode); + __le64 eof; ++ loff_t old_eof; + + xid = get_xid(); + +- inode = d_inode(cfile->dentry); +- cifsi = CIFS_I(inode); ++ inode_lock(inode); + +- if (off >= i_size_read(inode) || +- off + len >= i_size_read(inode)) { ++ old_eof = i_size_read(inode); ++ if ((off >= old_eof) || ++ off + len >= old_eof) { + rc = -EINVAL; + goto out; + } + ++ filemap_invalidate_lock(inode->i_mapping); ++ rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof - 1); ++ if (rc < 0) ++ goto out_2; ++ ++ truncate_pagecache_range(inode, off, old_eof); ++ + rc = smb2_copychunk_range(xid, cfile, cfile, off + len, +- i_size_read(inode) - off - len, off); ++ old_eof - off - len, off); + if (rc < 0) +- goto out; ++ goto out_2; + +- eof = cpu_to_le64(i_size_read(inode) - len); ++ eof = cpu_to_le64(old_eof - len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) +- goto out; ++ goto out_2; + + rc = 0; + + cifsi->server_eof = i_size_read(inode) - len; + truncate_setsize(inode, cifsi->server_eof); + fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); ++out_2: ++ filemap_invalidate_unlock(inode->i_mapping); + out: ++ inode_unlock(inode); + free_xid(xid); + return rc; + } +@@ -4004,34 +4006,47 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; ++ struct inode *inode = file_inode(file); + __le64 eof; +- __u64 count; ++ __u64 count, old_eof; + + xid = get_xid(); + +- if (off >= i_size_read(file->f_inode)) { ++ inode_lock(inode); ++ ++ old_eof = i_size_read(inode); ++ if (off >= old_eof) { + rc = 
-EINVAL; + goto out; + } + +- count = i_size_read(file->f_inode) - off; +- eof = cpu_to_le64(i_size_read(file->f_inode) + len); ++ count = old_eof - off; ++ eof = cpu_to_le64(old_eof + len); ++ ++ filemap_invalidate_lock(inode->i_mapping); ++ rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1); ++ if (rc < 0) ++ goto out_2; ++ truncate_pagecache_range(inode, off, old_eof); + + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) +- goto out; ++ goto out_2; + + rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); + if (rc < 0) +- goto out; ++ goto out_2; + +- rc = smb3_zero_range(file, tcon, off, len, 1); ++ rc = smb3_zero_data(file, tcon, off, len, xid); + if (rc < 0) +- goto out; ++ goto out_2; + + rc = 0; ++out_2: ++ filemap_invalidate_unlock(inode->i_mapping); + out: ++ inode_unlock(inode); + free_xid(xid); + return rc; + } +diff --git a/fs/dax.c b/fs/dax.c +index 4155a6107fa10..7ab248ed21aa3 100644 +--- a/fs/dax.c ++++ b/fs/dax.c +@@ -1241,6 +1241,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, + loff_t done = 0; + int ret; + ++ if (!iomi.len) ++ return 0; ++ + if (iov_iter_rw(iter) == WRITE) { + lockdep_assert_held_write(&iomi.inode->i_rwsem); + iomi.flags |= IOMAP_WRITE; +diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c +index 9de6a6b844c9e..e541a004f8efa 100644 +--- a/fs/exfat/fatent.c ++++ b/fs/exfat/fatent.c +@@ -270,8 +270,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) + struct super_block *sb = dir->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; +- sector_t blknr, last_blknr; +- int i; ++ sector_t blknr, last_blknr, i; + + blknr = exfat_cluster_to_sector(sbi, clu); + last_blknr = blknr + sbi->sect_per_clus; +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index adfc30ee4b7be..0d86931269bfc 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -167,8 +167,6 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_CR0_OPTIMIZED 0x8000 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 +-/* Perform linear traversal for one group */ +-#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 + struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; +@@ -1589,8 +1587,8 @@ struct ext4_sb_info { + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; +- struct rb_root s_mb_avg_fragment_size_root; +- rwlock_t s_mb_rb_lock; ++ struct list_head *s_mb_avg_fragment_size; ++ rwlock_t *s_mb_avg_fragment_size_locks; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; + +@@ -3402,6 +3400,8 @@ struct ext4_group_info { + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ int bb_avg_fragment_size_order; /* order of average ++ fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +@@ -3409,7 +3409,7 @@ struct ext4_group_info { + void *bb_bitmap; + #endif + struct rw_semaphore alloc_sem; +- struct rb_node bb_avg_fragment_size_rb; ++ struct list_head bb_avg_fragment_size_node; + struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. 
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index c148bb97b5273..5235974126bd3 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -460,6 +460,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, + error_msg = "invalid eh_entries"; + goto corrupted; + } ++ if (unlikely((eh->eh_entries == 0) && (depth > 0))) { ++ error_msg = "eh_entries is 0 but eh_depth is > 0"; ++ goto corrupted; ++ } + if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index f73e5eb43eae1..208b87ce88588 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -510,7 +510,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, + goto fallback; + } + +- max_dirs = ndirs / ngroups + inodes_per_group / 16; ++ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 38e7dc2531b17..fd29e15d1c3b5 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -140,13 +140,15 @@ + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * +- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) ++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * +- * Locking: sbi->s_mb_rb_lock (rwlock) ++ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * +- * This is a red black tree consisting of group infos and the tree is sorted +- * by average fragment sizes (which is calculated as ext4_group_info->bb_free +- * / ext4_group_info->bb_fragments). ++ * This is an array of lists where in the i-th list there are groups with ++ * average fragment size >= 2^i and < 2^(i+1). The average fragment size ++ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. ++ * Note that we don't bother with a special list for completely empty groups ++ * so we only have MB_NUM_ORDERS(sb) lists. + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for +@@ -160,7 +162,8 @@ + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or +- * equal to request size using our rb tree (data structure 2) in O(log N) time. ++ * equal to request size using our average fragment size group lists (data ++ * structure 2) in O(1) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. 
+@@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, + } + } + +-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, +- int (*cmp)(struct rb_node *, struct rb_node *)) ++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) + { +- struct rb_node **iter = &root->rb_node, *parent = NULL; ++ int order; + +- while (*iter) { +- parent = *iter; +- if (cmp(new, *iter) > 0) +- iter = &((*iter)->rb_left); +- else +- iter = &((*iter)->rb_right); +- } +- +- rb_link_node(new, parent, iter); +- rb_insert_color(new, root); +-} +- +-static int +-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +-{ +- struct ext4_group_info *grp1 = rb_entry(rb1, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- struct ext4_group_info *grp2 = rb_entry(rb2, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- int num_frags_1, num_frags_2; +- +- num_frags_1 = grp1->bb_fragments ? +- grp1->bb_free / grp1->bb_fragments : 0; +- num_frags_2 = grp2->bb_fragments ? +- grp2->bb_free / grp2->bb_fragments : 0; +- +- return (num_frags_2 - num_frags_1); ++ /* ++ * We don't bother with a special lists groups with only 1 block free ++ * extents and for completely empty groups. ++ */ ++ order = fls(len) - 2; ++ if (order < 0) ++ return 0; ++ if (order == MB_NUM_ORDERS(sb)) ++ order--; ++ return order; + } + +-/* +- * Reinsert grpinfo into the avg_fragment_size tree with new average +- * fragment size. +- */ ++/* Move group to appropriate avg_fragment_size list */ + static void + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ int new_order; + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + +- write_lock(&sbi->s_mb_rb_lock); +- if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { +- rb_erase(&grp->bb_avg_fragment_size_rb, +- &sbi->s_mb_avg_fragment_size_root); +- RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); +- } ++ new_order = mb_avg_fragment_size_order(sb, ++ grp->bb_free / grp->bb_fragments); ++ if (new_order == grp->bb_avg_fragment_size_order) ++ return; + +- ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, +- &grp->bb_avg_fragment_size_rb, +- ext4_mb_avg_fragment_size_cmp); +- write_unlock(&sbi->s_mb_rb_lock); ++ if (grp->bb_avg_fragment_size_order != -1) { ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_del(&grp->bb_avg_fragment_size_node); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ } ++ grp->bb_avg_fragment_size_order = new_order; ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_add_tail(&grp->bb_avg_fragment_size_node, ++ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); + } + + /* +@@ -909,86 +898,55 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, + *new_cr = 1; + } else { + *group = grp->bb_group; +- ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } + } + + /* +- * Choose next group by traversing average fragment size tree. Updates *new_cr +- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that +- * the linear search should continue for one iteration since there's lock +- * contention on the rb tree lock. 
++ * Choose next group by traversing average fragment size list of suitable ++ * order. Updates *new_cr if cr level needs an update. + */ + static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int avg_fragment_size, best_so_far; +- struct rb_node *node, *found; +- struct ext4_group_info *grp; +- +- /* +- * If there is contention on the lock, instead of waiting for the lock +- * to become available, just continue searching lineraly. We'll resume +- * our rb tree search later starting at ac->ac_last_optimal_group. +- */ +- if (!read_trylock(&sbi->s_mb_rb_lock)) { +- ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; +- return; +- } ++ struct ext4_group_info *grp = NULL, *iter; ++ int i; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); +- /* We have found something at CR 1 in the past */ +- grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); +- for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; +- found = rb_next(found)) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ } ++ ++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ++ i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) ++ continue; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ continue; ++ } ++ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], ++ bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); +- if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { ++ grp = iter; + break; +- } +- goto done; +- } +- +- node = sbi->s_mb_avg_fragment_size_root.rb_node; +- best_so_far = 0; +- found = NULL; +- +- while (node) { +- grp = rb_entry(node, struct ext4_group_info, +- bb_avg_fragment_size_rb); +- avg_fragment_size = 0; +- if (ext4_mb_good_group(ac, grp->bb_group, 1)) { +- avg_fragment_size = grp->bb_fragments ? 
+- grp->bb_free / grp->bb_fragments : 0; +- if (!best_so_far || avg_fragment_size < best_so_far) { +- best_so_far = avg_fragment_size; +- found = node; + } + } +- if (avg_fragment_size > ac->ac_g_ex.fe_len) +- node = node->rb_right; +- else +- node = node->rb_left; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (grp) ++ break; + } + +-done: +- if (found) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } +- +- read_unlock(&sbi->s_mb_rb_lock); +- ac->ac_last_optimal_group = *group; + } + + static inline int should_optimize_scan(struct ext4_allocation_context *ac) +@@ -1017,11 +975,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) + goto inc_and_return; + } + +- if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { +- ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; +- goto inc_and_return; +- } +- + return group; + inc_and_return: + /* +@@ -1049,8 +1002,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, + { + *new_cr = ac->ac_criteria; + +- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) ++ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { ++ *group = next_linear_group(ac, *group, ngroups); + return; ++ } + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); +@@ -1075,23 +1030,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; + +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { ++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) ++ if (grp->bb_counters[i] > 0) ++ break; ++ /* No need to move between order lists? 
*/ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || ++ i == grp->bb_largest_free_order) { ++ grp->bb_largest_free_order = i; ++ return; ++ } ++ ++ if (grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } +- grp->bb_largest_free_order = -1; /* uninit */ +- +- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { +- if (grp->bb_counters[i] > 0) { +- grp->bb_largest_free_order = i; +- break; +- } +- } +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && +- grp->bb_largest_free_order >= 0 && grp->bb_free) { ++ grp->bb_largest_free_order = i; ++ if (grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, +@@ -1148,13 +1105,13 @@ void ext4_mb_generate_buddy(struct super_block *sb, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + } + mb_set_largest_free_order(sb, grp); ++ mb_update_avg_fragment_size(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +- mb_update_avg_fragment_size(sb, grp); + } + + /* The buddy information is attached the buddy cache inode +@@ -2630,7 +2587,7 @@ static noinline_for_stack int + ext4_mb_regular_allocator(struct ext4_allocation_context *ac) + { + ext4_group_t prefetch_grp = 0, ngroups, group, i; +- int cr = -1; ++ int cr = -1, new_cr; + int err = 0, first_err = 0; + unsigned int nr = 0, prefetch_ios = 0; + struct ext4_sb_info *sbi; +@@ -2701,17 +2658,14 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; +- ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +- for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), +- i++) { +- int ret = 0, new_cr; ++ for (i = 0, new_cr = cr; i < ngroups; i++, ++ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { ++ int ret = 0; + + cond_resched(); +- +- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; +@@ -2985,9 +2939,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) + struct super_block *sb = pde_data(file_inode(seq->file)); + unsigned long position; + +- read_lock(&EXT4_SB(sb)->s_mb_rb_lock); +- +- if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) ++ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +@@ -2999,7 +2951,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof + unsigned long position; + + ++*pos; +- if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) ++ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +@@ -3011,29 +2963,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; +- struct rb_node *n; +- unsigned int count, min, max; ++ unsigned int count; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { +- seq_puts(seq, "fragment_size_tree:\n"); +- n = rb_first(&sbi->s_mb_avg_fragment_size_root); +- if (!n) { +- seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 
0\n"); +- return 0; +- } +- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); +- min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; +- count = 1; +- while (rb_next(n)) { +- count++; +- n = rb_next(n); +- } +- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); +- max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; ++ position -= MB_NUM_ORDERS(sb); ++ if (position == 0) ++ seq_puts(seq, "avg_fragment_size_lists:\n"); + +- seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", +- min, max, count); ++ count = 0; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); ++ list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], ++ bb_avg_fragment_size_node) ++ count++; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); ++ seq_printf(seq, "\tlist_order_%u_groups: %u\n", ++ (unsigned int)position, count); + return 0; + } + +@@ -3043,9 +2988,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; ++ read_lock(&sbi->s_mb_largest_free_orders_locks[position]); + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + +@@ -3053,11 +3000,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) + } + + static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +-__releases(&EXT4_SB(sb)->s_mb_rb_lock) + { +- struct super_block *sb = pde_data(file_inode(seq->file)); +- +- read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); + } + + const struct seq_operations ext4_mb_seq_structs_summary_ops = { +@@ -3170,8 +3113,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); +- RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); ++ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ ++ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; + + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); +@@ -3420,7 +3364,24 @@ int ext4_mb_init(struct super_block *sb) + i++; + } while (i < MB_NUM_ORDERS(sb)); + +- sbi->s_mb_avg_fragment_size_root = RB_ROOT; ++ sbi->s_mb_avg_fragment_size = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_avg_fragment_size_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); ++ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); ++ } + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); +@@ -3439,7 +3400,6 @@ int ext4_mb_init(struct super_block *sb) + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } +- rwlock_init(&sbi->s_mb_rb_lock); + + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; +@@ -3510,6 +3470,8 @@ 
out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); +@@ -3576,6 +3538,8 @@ int ext4_mb_release(struct super_block *sb) + kvfree(group_info); + rcu_read_unlock(); + } ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); +@@ -5187,6 +5151,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; ++ bool inode_pa_eligible, group_pa_eligible; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; +@@ -5194,25 +5159,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ group_pa_eligible = sbi->s_mb_group_prealloc > 0; ++ inode_pa_eligible = true; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + ++ /* No point in using inode preallocation for closed files */ + if ((size == isize) && !ext4_fs_is_busy(sbi) && +- !inode_is_open_for_write(ac->ac_inode)) { +- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; +- return; +- } ++ !inode_is_open_for_write(ac->ac_inode)) ++ inode_pa_eligible = false; + +- if (sbi->s_mb_group_prealloc <= 0) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; +- return; +- } +- +- /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ /* Don't use group allocation for large files */ ++ if (size > sbi->s_mb_stream_request) ++ group_pa_eligible = false; ++ ++ if (!group_pa_eligible) { ++ if (inode_pa_eligible) ++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ else ++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + +@@ -5559,6 +5526,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; ++ int retries = 0; + u64 seq; + + might_sleep(); +@@ -5661,7 +5629,8 @@ repeat: + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) ++ if (++retries < 3 && ++ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + /* + * If block allocation fails then the pa allocated above +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index 39da92ceabf88..dcda2a943cee0 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -178,7 +178,6 @@ struct ext4_allocation_context { + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + +- ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 7515a465ec03a..7c90b1ab3e00d 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -543,10 +543,9 @@ + */ + #ifdef CONFIG_CFI_CLANG + #define TEXT_CFI_JT \ +- . = ALIGN(PMD_SIZE); \ ++ ALIGN_FUNCTION(); \ + __cfi_jt_start = .; \ + *(.text..L.cfi.jumptable .text..L.cfi.jumptable.*) \ +- . 
= ALIGN(PMD_SIZE); \ + __cfi_jt_end = .; + #else + #define TEXT_CFI_JT +diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h +index e2d9daf7e8dd0..0fd96e92c6c65 100644 +--- a/include/linux/blk-mq.h ++++ b/include/linux/blk-mq.h +@@ -686,10 +686,13 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + \ + __blk_mq_alloc_disk(set, queuedata, &__key); \ + }) ++struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, ++ struct lock_class_key *lkclass); + struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q); + void blk_mq_unregister_dev(struct device *, struct request_queue *); ++void blk_mq_destroy_queue(struct request_queue *); + + int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); + int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 62e3ff52ab033..83eb8869a8c94 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -148,6 +148,7 @@ struct gendisk { + #define GD_NATIVE_CAPACITY 3 + #define GD_ADDED 4 + #define GD_SUPPRESS_PART_SCAN 5 ++#define GD_OWNS_QUEUE 6 + + struct mutex open_mutex; /* open/close mutex */ + unsigned open_partitions; /* number of open partitions */ +@@ -559,7 +560,6 @@ struct request_queue { + #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ + #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ + #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ +-#define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ + #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ + #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ + #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ +@@ -587,7 +587,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); + #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) + #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) + #define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) +-#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) + #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) + #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) + #define blk_queue_noxmerges(q) \ +@@ -812,8 +811,6 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) + + int bdev_disk_changed(struct gendisk *disk, bool invalidate); + +-struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, +- struct lock_class_key *lkclass); + void put_disk(struct gendisk *disk); + struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); + +@@ -955,7 +952,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, + /* + * Access functions for manipulating queue properties + */ +-extern void blk_cleanup_queue(struct request_queue *); + void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit); + extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); + extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); +diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h +index 4592d08459417..57aa459c6618a 100644 +--- a/include/linux/cpumask.h ++++ b/include/linux/cpumask.h +@@ -1083,9 +1083,10 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, + * cover a worst-case of every 
other cpu being on one of two nodes for a + * very large NR_CPUS. + * +- * Use PAGE_SIZE as a minimum for smaller configurations. ++ * Use PAGE_SIZE as a minimum for smaller configurations while avoiding ++ * unsigned comparison to -1. + */ +-#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \ ++#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ + ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) + #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) + +diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h +index fde258b3decd5..037a8d81a66cf 100644 +--- a/include/linux/serial_core.h ++++ b/include/linux/serial_core.h +@@ -302,6 +302,23 @@ struct uart_state { + /* number of characters left in xmit buffer before we ask for more */ + #define WAKEUP_CHARS 256 + ++/** ++ * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars ++ * @up: uart_port structure describing the port ++ * @chars: number of characters sent ++ * ++ * This function advances the tail of circular xmit buffer by the number of ++ * @chars transmitted and handles accounting of transmitted bytes (into ++ * @up's icount.tx). ++ */ ++static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars) ++{ ++ struct circ_buf *xmit = &up->state->xmit; ++ ++ xmit->tail = (xmit->tail + chars) & (UART_XMIT_SIZE - 1); ++ up->icount.tx += chars; ++} ++ + struct module; + struct tty_driver; + +diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h +index 184105d682942..f2273bd5a4c58 100644 +--- a/include/net/bond_3ad.h ++++ b/include/net/bond_3ad.h +@@ -15,8 +15,6 @@ + #define PKT_TYPE_LACPDU cpu_to_be16(ETH_P_SLOW) + #define AD_TIMER_INTERVAL 100 /*msec*/ + +-#define MULTICAST_LACPDU_ADDR {0x01, 0x80, 0xC2, 0x00, 0x00, 0x02} +- + #define AD_LACP_SLOW 0 + #define AD_LACP_FAST 1 + +diff --git a/include/net/bonding.h b/include/net/bonding.h +index 3b816ae8b1f3b..7ac1773b99224 100644 +--- a/include/net/bonding.h ++++ b/include/net/bonding.h +@@ -785,6 +785,9 @@ extern struct rtnl_link_ops bond_link_ops; + /* exported from bond_sysfs_slave.c */ + extern const struct sysfs_ops slave_sysfs_ops; + ++/* exported from bond_3ad.c */ ++extern const u8 lacpdu_mcast_addr[]; ++ + static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) + { + dev_core_stats_tx_dropped_inc(dev); +diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h +index 667d889b92b52..3e1cea155049b 100644 +--- a/include/scsi/scsi_host.h ++++ b/include/scsi/scsi_host.h +@@ -557,6 +557,8 @@ struct Scsi_Host { + struct scsi_host_template *hostt; + struct scsi_transport_template *transportt; + ++ struct kref tagset_refcnt; ++ struct completion tagset_freed; + /* Area to keep a shared tag map */ + struct blk_mq_tag_set tag_set; + +diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h +index 65e13a099b1a0..a9f5d884560ac 100644 +--- a/include/uapi/linux/xfrm.h ++++ b/include/uapi/linux/xfrm.h +@@ -296,7 +296,7 @@ enum xfrm_attr_type_t { + XFRMA_ETIMER_THRESH, + XFRMA_SRCADDR, /* xfrm_address_t */ + XFRMA_COADDR, /* xfrm_address_t */ +- XFRMA_LASTUSED, /* unsigned long */ ++ XFRMA_LASTUSED, /* __u64 */ + XFRMA_POLICY_TYPE, /* struct xfrm_userpolicy_type */ + XFRMA_MIGRATE, + XFRMA_ALG_AEAD, /* struct xfrm_algo_aead */ +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 602da2cfd57c8..15a6f1e93e5af 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -10951,6 +10951,9 @@ static __cold void io_ring_ctx_wait_and_kill(struct 
io_ring_ctx *ctx) + io_poll_remove_all(ctx, NULL, true); + /* if we failed setting up the ctx, we might not have any rings */ + io_iopoll_try_reap_events(ctx); ++ /* drop cached put refs after potentially doing completions */ ++ if (current->io_uring) ++ io_uring_drop_tctx_refs(current); + } + + INIT_WORK(&ctx->exit_work, io_ring_exit_work); +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index e702ca368539a..80c23f48f3b4b 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -6026,6 +6026,9 @@ struct cgroup *cgroup_get_from_id(u64 id) + if (!kn) + goto out; + ++ if (kernfs_type(kn) != KERNFS_DIR) ++ goto put; ++ + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); +@@ -6033,7 +6036,7 @@ struct cgroup *cgroup_get_from_id(u64 id) + cgrp = NULL; + + rcu_read_unlock(); +- ++put: + kernfs_put(kn); + out: + return cgrp; +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index aa8a82bc67384..fc6e4f2523452 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3066,10 +3066,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) + if (WARN_ON(!work->func)) + return false; + +- if (!from_cancel) { +- lock_map_acquire(&work->lockdep_map); +- lock_map_release(&work->lockdep_map); +- } ++ lock_map_acquire(&work->lockdep_map); ++ lock_map_release(&work->lockdep_map); + + if (start_flush_work(work, &barr, from_cancel)) { + wait_for_completion(&barr.done); +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 2e24db4bff192..c399ab486557f 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -264,8 +264,10 @@ config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT + config DEBUG_INFO_DWARF4 + bool "Generate DWARF Version 4 debuginfo" + select DEBUG_INFO ++ depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) + help +- Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. ++ Generate DWARF v4 debug info. This requires gcc 4.5+, binutils 2.35.2 ++ if using clang without clang's integrated assembler, and gdb 7.0+. + + If you have consumers of DWARF debug info that are not ready for + newer revisions of DWARF, you may wish to choose this or have your +diff --git a/mm/slab_common.c b/mm/slab_common.c +index dbd4b6f9b0e79..29ae1358d5f07 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -503,6 +503,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) + void kmem_cache_destroy(struct kmem_cache *s) + { + int refcnt; ++ bool rcu_set; + + if (unlikely(!s) || !kasan_check_byte(s)) + return; +@@ -510,6 +511,8 @@ void kmem_cache_destroy(struct kmem_cache *s) + cpus_read_lock(); + mutex_lock(&slab_mutex); + ++ rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; ++ + refcnt = --s->refcount; + if (refcnt) + goto out_unlock; +@@ -520,7 +523,7 @@ void kmem_cache_destroy(struct kmem_cache *s) + out_unlock: + mutex_unlock(&slab_mutex); + cpus_read_unlock(); +- if (!refcnt && !(s->flags & SLAB_TYPESAFE_BY_RCU)) ++ if (!refcnt && !rcu_set) + kmem_cache_release(s); + } + EXPORT_SYMBOL(kmem_cache_destroy); +diff --git a/mm/slub.c b/mm/slub.c +index b1281b8654bd3..1eec942b8336c 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -310,6 +310,11 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) + */ + static nodemask_t slab_nodes; + ++/* ++ * Workqueue used for flush_cpu_slab(). 
++ */ ++static struct workqueue_struct *flushwq; ++ + /******************************************************************** + * Core slab cache functions + *******************************************************************/ +@@ -2730,7 +2735,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) + INIT_WORK(&sfw->work, flush_cpu_slab); + sfw->skip = false; + sfw->s = s; +- schedule_work_on(cpu, &sfw->work); ++ queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { +@@ -4880,6 +4885,8 @@ void __init kmem_cache_init(void) + + void __init kmem_cache_init_late(void) + { ++ flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); ++ WARN_ON(!flushwq); + } + + struct kmem_cache * +@@ -4950,6 +4957,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) + /* Honor the call site pointer we received. */ + trace_kmalloc(caller, ret, size, s->size, gfpflags); + ++ ret = kasan_kmalloc(s, ret, size, gfpflags); ++ + return ret; + } + EXPORT_SYMBOL(__kmalloc_track_caller); +@@ -4981,6 +4990,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + /* Honor the call site pointer we received. */ + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + ++ ret = kasan_kmalloc(s, ret, size, gfpflags); ++ + return ret; + } + EXPORT_SYMBOL(__kmalloc_node_track_caller); +@@ -5914,7 +5925,8 @@ static char *create_unique_id(struct kmem_cache *s) + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + +- BUG_ON(!name); ++ if (!name) ++ return ERR_PTR(-ENOMEM); + + *p++ = ':'; + /* +@@ -5972,6 +5984,8 @@ static int sysfs_slab_add(struct kmem_cache *s) + * for the symlinks. + */ + name = create_unique_id(s); ++ if (IS_ERR(name)) ++ return PTR_ERR(name); + } + + s->kobj.kset = kset; +diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c +index b8f8da7ee3dea..41c1ad33d009f 100644 +--- a/net/batman-adv/hard-interface.c ++++ b/net/batman-adv/hard-interface.c +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -700,6 +701,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, + int max_header_len = batadv_max_header_len(); + int ret; + ++ if (hard_iface->net_dev->mtu < ETH_MIN_MTU + max_header_len) ++ return -EINVAL; ++ + if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) + goto out; + +diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c +index 9a0ae59cdc500..4f385d52a1c49 100644 +--- a/net/bridge/netfilter/ebtables.c ++++ b/net/bridge/netfilter/ebtables.c +@@ -1040,8 +1040,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, + goto free_iterate; + } + +- if (repl->valid_hooks != t->valid_hooks) ++ if (repl->valid_hooks != t->valid_hooks) { ++ ret = -EINVAL; + goto free_unlock; ++ } + + if (repl->num_counters && repl->num_counters != t->private->nentries) { + ret = -EINVAL; +diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c +index 6aee04f75e3e4..bcba61ef5b378 100644 +--- a/net/core/flow_dissector.c ++++ b/net/core/flow_dissector.c +@@ -1572,9 +1572,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: +- addr_diff = (__force u32)keys->addrs.v4addrs.dst - +- (__force u32)keys->addrs.v4addrs.src; +- if (addr_diff < 0) ++ if ((__force u32)keys->addrs.v4addrs.dst < ++ (__force u32)keys->addrs.v4addrs.src) + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + + if 
((__force u16)keys->ports.dst < +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index 9f6f4a41245d4..1012012a061fe 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -1069,13 +1069,13 @@ static int __init inet6_init(void) + for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + ++ raw_hashinfo_init(&raw_v6_hashinfo); ++ + if (disable_ipv6_mod) { + pr_info("Loaded, but administratively disabled, reboot required to enable\n"); + goto out; + } + +- raw_hashinfo_init(&raw_v6_hashinfo); +- + err = proto_register(&tcpv6_prot, 1); + if (err) + goto out; +diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c +index 0d9332e9cf71a..617f744a2e3a3 100644 +--- a/net/netfilter/nf_conntrack_ftp.c ++++ b/net/netfilter/nf_conntrack_ftp.c +@@ -33,6 +33,7 @@ MODULE_AUTHOR("Rusty Russell "); + MODULE_DESCRIPTION("ftp connection tracking helper"); + MODULE_ALIAS("ip_conntrack_ftp"); + MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); ++static DEFINE_SPINLOCK(nf_ftp_lock); + + #define MAX_PORTS 8 + static u_int16_t ports[MAX_PORTS]; +@@ -409,7 +410,8 @@ static int help(struct sk_buff *skb, + } + datalen = skb->len - dataoff; + +- spin_lock_bh(&ct->lock); ++ /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */ ++ spin_lock_bh(&nf_ftp_lock); + fb_ptr = skb->data + dataoff; + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); +@@ -538,7 +540,7 @@ out_update_nl: + if (ends_in_nl) + update_nl_seq(ct, seq, ct_ftp_info, dir, skb); + out: +- spin_unlock_bh(&ct->lock); ++ spin_unlock_bh(&nf_ftp_lock); + return ret; + } + +diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c +index 992decbcaa5c1..5703846bea3b6 100644 +--- a/net/netfilter/nf_conntrack_irc.c ++++ b/net/netfilter/nf_conntrack_irc.c +@@ -157,15 +157,37 @@ static int help(struct sk_buff *skb, unsigned int protoff, + data = ib_ptr; + data_limit = ib_ptr + datalen; + +- /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 +- * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ +- while (data < data_limit - (19 + MINMATCHLEN)) { +- if (memcmp(data, "\1DCC ", 5)) { ++ /* Skip any whitespace */ ++ while (data < data_limit - 10) { ++ if (*data == ' ' || *data == '\r' || *data == '\n') ++ data++; ++ else ++ break; ++ } ++ ++ /* strlen("PRIVMSG x ")=10 */ ++ if (data < data_limit - 10) { ++ if (strncasecmp("PRIVMSG ", data, 8)) ++ goto out; ++ data += 8; ++ } ++ ++ /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26 ++ * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26 ++ */ ++ while (data < data_limit - (21 + MINMATCHLEN)) { ++ /* Find first " :", the start of message */ ++ if (memcmp(data, " :", 2)) { + data++; + continue; + } ++ data += 2; ++ ++ /* then check that place only for the DCC command */ ++ if (memcmp(data, "\1DCC ", 5)) ++ goto out; + data += 5; +- /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ ++ /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */ + + iph = ip_hdr(skb); + pr_debug("DCC found in master %pI4:%u %pI4:%u\n", +@@ -181,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, + pr_debug("DCC %s detected\n", dccprotos[i]); + + /* we have at least +- * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid ++ * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid + * data left (== 14/13 bytes) */ + if (parse_dcc(data, data_limit, &dcc_ip, + &dcc_port, &addr_beg_p, &addr_end_p)) { +diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c +index b83dc9bf0a5dd..78fd9122b70c7 100644 +--- 
a/net/netfilter/nf_conntrack_sip.c ++++ b/net/netfilter/nf_conntrack_sip.c +@@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, + return ret; + if (ret == 0) + break; +- dataoff += *matchoff; ++ dataoff = *matchoff; + } + *in_header = 0; + } +@@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, + break; + if (ret == 0) + return ret; +- dataoff += *matchoff; ++ dataoff = *matchoff; + } + + if (in_header) +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 848cc81d69926..2fde193c3d26a 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -2197,7 +2197,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, + struct netlink_ext_ack *extack) + { + const struct nlattr * const *nla = ctx->nla; +- struct nft_stats __percpu *stats = NULL; + struct nft_table *table = ctx->table; + struct nft_base_chain *basechain; + struct net *net = ctx->net; +@@ -2212,6 +2211,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, + return -EOVERFLOW; + + if (nla[NFTA_CHAIN_HOOK]) { ++ struct nft_stats __percpu *stats = NULL; + struct nft_chain_hook hook; + + if (flags & NFT_CHAIN_BINDING) +@@ -2243,8 +2243,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, + if (err < 0) { + nft_chain_release_hook(&hook); + kfree(basechain); ++ free_percpu(stats); + return err; + } ++ if (stats) ++ static_branch_inc(&nft_counters_enabled); + } else { + if (flags & NFT_CHAIN_BASE) + return -EINVAL; +@@ -2319,9 +2322,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, + goto err_unregister_hook; + } + +- if (stats) +- static_branch_inc(&nft_counters_enabled); +- + table->use++; + + return 0; +diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c +index 0fa2e20304272..ee6840bd59337 100644 +--- a/net/netfilter/nfnetlink_osf.c ++++ b/net/netfilter/nfnetlink_osf.c +@@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, + struct nf_osf_hdr_ctx ctx; + const struct tcphdr *tcp; + struct tcphdr _tcph; ++ bool found = false; + + memset(&ctx, 0, sizeof(ctx)); + +@@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb, + + data->genre = f->genre; + data->version = f->version; ++ found = true; + break; + } + +- return true; ++ return found; + } + EXPORT_SYMBOL_GPL(nf_osf_find); + +diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c +index ac366c99086fd..7d7f7bac0216a 100644 +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -2136,6 +2136,7 @@ replay: + } + + if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { ++ tfilter_put(tp, fh); + NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); + err = -EINVAL; + goto errout; +diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c +index 0b941dd63d268..86675a79da1e4 100644 +--- a/net/sched/sch_taprio.c ++++ b/net/sched/sch_taprio.c +@@ -67,6 +67,7 @@ struct taprio_sched { + u32 flags; + enum tk_offsets tk_offset; + int clockid; ++ bool offloaded; + atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ +@@ -1279,6 +1280,8 @@ static int taprio_enable_offload(struct net_device *dev, + goto done; + } + ++ q->offloaded = true; ++ + done: + taprio_offload_free(offload); + +@@ -1293,12 +1296,9 @@ static int taprio_disable_offload(struct net_device *dev, + struct tc_taprio_qopt_offload *offload; + int err; + +- if 
(!FULL_OFFLOAD_IS_ENABLED(q->flags)) ++ if (!q->offloaded) + return 0; + +- if (!ops->ndo_setup_tc) +- return -EOPNOTSUPP; +- + offload = taprio_offload_alloc(0); + if (!offload) { + NL_SET_ERR_MSG(extack, +@@ -1314,6 +1314,8 @@ static int taprio_disable_offload(struct net_device *dev, + goto out; + } + ++ q->offloaded = false; ++ + out: + taprio_offload_free(offload); + +@@ -1949,12 +1951,14 @@ start_error: + + static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) + { +- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); ++ struct taprio_sched *q = qdisc_priv(sch); ++ struct net_device *dev = qdisc_dev(sch); ++ unsigned int ntx = cl - 1; + +- if (!dev_queue) ++ if (ntx >= dev->num_tx_queues) + return NULL; + +- return dev_queue->qdisc_sleeping; ++ return q->qdiscs[ntx]; + } + + static unsigned long taprio_find(struct Qdisc *sch, u32 classid) +diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c +index 1f3bb1f6b1f7b..8095876b66eb6 100644 +--- a/net/smc/smc_core.c ++++ b/net/smc/smc_core.c +@@ -2148,7 +2148,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) + { +- int i, rc = 0; ++ int i, rc = 0, cnt = 0; + + /* protect against parallel link reconfiguration */ + mutex_lock(&lgr->llc_conf_mutex); +@@ -2161,9 +2161,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + rc = -ENOMEM; + goto out; + } ++ cnt++; + } + out: + mutex_unlock(&lgr->llc_conf_mutex); ++ if (!rc && !cnt) ++ rc = -EINVAL; + return rc; + } + +diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug +index 9f39b0130551f..8cf1cb22dd934 100644 +--- a/scripts/Makefile.debug ++++ b/scripts/Makefile.debug +@@ -1,20 +1,19 @@ + DEBUG_CFLAGS := ++debug-flags-y := -g + + ifdef CONFIG_DEBUG_INFO_SPLIT + DEBUG_CFLAGS += -gsplit-dwarf +-else +-DEBUG_CFLAGS += -g + endif + +-ifndef CONFIG_AS_IS_LLVM +-KBUILD_AFLAGS += -Wa,-gdwarf-2 +-endif +- +-ifndef CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT +-dwarf-version-$(CONFIG_DEBUG_INFO_DWARF4) := 4 +-dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5 +-DEBUG_CFLAGS += -gdwarf-$(dwarf-version-y) ++debug-flags-$(CONFIG_DEBUG_INFO_DWARF4) += -gdwarf-4 ++debug-flags-$(CONFIG_DEBUG_INFO_DWARF5) += -gdwarf-5 ++ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_AS_IS_GNU),yy) ++# Clang does not pass -g or -gdwarf-* option down to GAS. ++# Add -Wa, prefix to explicitly specify the flags. 
++KBUILD_AFLAGS += $(addprefix -Wa$(comma), $(debug-flags-y)) + endif ++DEBUG_CFLAGS += $(debug-flags-y) ++KBUILD_AFLAGS += $(debug-flags-y) + + ifdef CONFIG_DEBUG_INFO_REDUCED + DEBUG_CFLAGS += -fno-var-tracking +@@ -29,5 +28,5 @@ KBUILD_AFLAGS += -gz=zlib + KBUILD_LDFLAGS += --compress-debug-sections=zlib + endif + +-KBUILD_CFLAGS += $(DEBUG_CFLAGS) ++KBUILD_CFLAGS += $(DEBUG_CFLAGS) + export DEBUG_CFLAGS +diff --git a/sound/core/init.c b/sound/core/init.c +index 726a8353201f8..4eacfafa41730 100644 +--- a/sound/core/init.c ++++ b/sound/core/init.c +@@ -178,10 +178,8 @@ int snd_card_new(struct device *parent, int idx, const char *xid, + return -ENOMEM; + + err = snd_card_init(card, parent, idx, xid, module, extra_size); +- if (err < 0) { +- kfree(card); +- return err; +- } ++ if (err < 0) ++ return err; /* card is freed by error handler */ + + *card_ret = card; + return 0; +@@ -231,7 +229,7 @@ int snd_devm_card_new(struct device *parent, int idx, const char *xid, + card->managed = true; + err = snd_card_init(card, parent, idx, xid, module, extra_size); + if (err < 0) { +- devres_free(card); ++ devres_free(card); /* in managed mode, we need to free manually */ + return err; + } + +@@ -293,6 +291,8 @@ static int snd_card_init(struct snd_card *card, struct device *parent, + mutex_unlock(&snd_card_mutex); + dev_err(parent, "cannot find the slot for index %d (range 0-%i), error: %d\n", + idx, snd_ecards_limit - 1, err); ++ if (!card->managed) ++ kfree(card); /* manually free here, as no destructor called */ + return err; + } + set_bit(idx, snd_cards_lock); /* lock it */ +diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c +index c572fb5886d5d..7af2515735957 100644 +--- a/sound/pci/hda/hda_bind.c ++++ b/sound/pci/hda/hda_bind.c +@@ -157,10 +157,10 @@ static int hda_codec_driver_remove(struct device *dev) + return codec->bus->core.ext_ops->hdev_detach(&codec->core); + } + +- refcount_dec(&codec->pcm_ref); + snd_hda_codec_disconnect_pcms(codec); + snd_hda_jack_tbl_disconnect(codec); +- wait_event(codec->remove_sleep, !refcount_read(&codec->pcm_ref)); ++ if (!refcount_dec_and_test(&codec->pcm_ref)) ++ wait_event(codec->remove_sleep, !refcount_read(&codec->pcm_ref)); + snd_power_sync_ref(codec->bus->card); + + if (codec->patch_ops.free) +diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c +index b20694fd69dea..6f30c374f896e 100644 +--- a/sound/pci/hda/hda_intel.c ++++ b/sound/pci/hda/hda_intel.c +@@ -2550,6 +2550,8 @@ static const struct pci_device_id azx_ids[] = { + /* 5 Series/3400 */ + { PCI_DEVICE(0x8086, 0x3b56), + .driver_data = AZX_DRIVER_SCH | AZX_DCAPS_INTEL_PCH_NOPM }, ++ { PCI_DEVICE(0x8086, 0x3b57), ++ .driver_data = AZX_DRIVER_SCH | AZX_DCAPS_INTEL_PCH_NOPM }, + /* Poulsbo */ + { PCI_DEVICE(0x8086, 0x811b), + .driver_data = AZX_DRIVER_SCH | AZX_DCAPS_INTEL_PCH_BASE }, +diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c +index 6c209cd26c0ca..c9d9aa6351ecf 100644 +--- a/sound/pci/hda/patch_hdmi.c ++++ b/sound/pci/hda/patch_hdmi.c +@@ -170,6 +170,8 @@ struct hdmi_spec { + bool dyn_pcm_no_legacy; + /* hdmi interrupt trigger control flag for Nvidia codec */ + bool hdmi_intr_trig_ctrl; ++ bool nv_dp_workaround; /* workaround DP audio infoframe for Nvidia */ ++ + bool intel_hsw_fixup; /* apply Intel platform-specific fixups */ + /* + * Non-generic VIA/NVIDIA specific +@@ -679,15 +681,24 @@ static void hdmi_pin_setup_infoframe(struct hda_codec *codec, + int ca, int active_channels, + int conn_type) + { ++ struct hdmi_spec *spec = codec->spec; + 
union audio_infoframe ai; + + memset(&ai, 0, sizeof(ai)); +- if (conn_type == 0) { /* HDMI */ ++ if ((conn_type == 0) || /* HDMI */ ++ /* Nvidia DisplayPort: Nvidia HW expects same layout as HDMI */ ++ (conn_type == 1 && spec->nv_dp_workaround)) { + struct hdmi_audio_infoframe *hdmi_ai = &ai.hdmi; + +- hdmi_ai->type = 0x84; +- hdmi_ai->ver = 0x01; +- hdmi_ai->len = 0x0a; ++ if (conn_type == 0) { /* HDMI */ ++ hdmi_ai->type = 0x84; ++ hdmi_ai->ver = 0x01; ++ hdmi_ai->len = 0x0a; ++ } else {/* Nvidia DP */ ++ hdmi_ai->type = 0x84; ++ hdmi_ai->ver = 0x1b; ++ hdmi_ai->len = 0x11 << 2; ++ } + hdmi_ai->CC02_CT47 = active_channels - 1; + hdmi_ai->CA = ca; + hdmi_checksum_audio_infoframe(hdmi_ai); +@@ -3617,6 +3628,7 @@ static int patch_nvhdmi_2ch(struct hda_codec *codec) + spec->pcm_playback.rates = SUPPORTED_RATES; + spec->pcm_playback.maxbps = SUPPORTED_MAXBPS; + spec->pcm_playback.formats = SUPPORTED_FORMATS; ++ spec->nv_dp_workaround = true; + return 0; + } + +@@ -3756,6 +3768,7 @@ static int patch_nvhdmi(struct hda_codec *codec) + spec->chmap.ops.chmap_cea_alloc_validate_get_type = + nvhdmi_chmap_cea_alloc_validate_get_type; + spec->chmap.ops.chmap_validate = nvhdmi_chmap_validate; ++ spec->nv_dp_workaround = true; + + codec->link_down_at_suspend = 1; + +@@ -3779,6 +3792,7 @@ static int patch_nvhdmi_legacy(struct hda_codec *codec) + spec->chmap.ops.chmap_cea_alloc_validate_get_type = + nvhdmi_chmap_cea_alloc_validate_get_type; + spec->chmap.ops.chmap_validate = nvhdmi_chmap_validate; ++ spec->nv_dp_workaround = true; + + codec->link_down_at_suspend = 1; + +@@ -3984,6 +3998,7 @@ static int tegra_hdmi_init(struct hda_codec *codec) + + generic_hdmi_init_per_pins(codec); + ++ codec->depop_delay = 10; + codec->patch_ops.build_pcms = tegra_hdmi_build_pcms; + spec->chmap.ops.chmap_cea_alloc_validate_get_type = + nvhdmi_chmap_cea_alloc_validate_get_type; +@@ -3992,6 +4007,7 @@ static int tegra_hdmi_init(struct hda_codec *codec) + spec->chmap.ops.chmap_cea_alloc_validate_get_type = + nvhdmi_chmap_cea_alloc_validate_get_type; + spec->chmap.ops.chmap_validate = nvhdmi_chmap_validate; ++ spec->nv_dp_workaround = true; + + return 0; + } +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 799f6bf266dd0..9614b63415a8e 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -7037,6 +7037,8 @@ enum { + ALC294_FIXUP_ASUS_GU502_HP, + ALC294_FIXUP_ASUS_GU502_PINS, + ALC294_FIXUP_ASUS_GU502_VERBS, ++ ALC294_FIXUP_ASUS_G513_PINS, ++ ALC285_FIXUP_ASUS_G533Z_PINS, + ALC285_FIXUP_HP_GPIO_LED, + ALC285_FIXUP_HP_MUTE_LED, + ALC236_FIXUP_HP_GPIO_LED, +@@ -8374,6 +8376,24 @@ static const struct hda_fixup alc269_fixups[] = { + [ALC294_FIXUP_ASUS_GU502_HP] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc294_fixup_gu502_hp, ++ }, ++ [ALC294_FIXUP_ASUS_G513_PINS] = { ++ .type = HDA_FIXUP_PINS, ++ .v.pins = (const struct hda_pintbl[]) { ++ { 0x19, 0x03a11050 }, /* front HP mic */ ++ { 0x1a, 0x03a11c30 }, /* rear external mic */ ++ { 0x21, 0x03211420 }, /* front HP out */ ++ { } ++ }, ++ }, ++ [ALC285_FIXUP_ASUS_G533Z_PINS] = { ++ .type = HDA_FIXUP_PINS, ++ .v.pins = (const struct hda_pintbl[]) { ++ { 0x14, 0x90170120 }, ++ { } ++ }, ++ .chained = true, ++ .chain_id = ALC294_FIXUP_ASUS_G513_PINS, + }, + [ALC294_FIXUP_ASUS_COEF_1B] = { + .type = HDA_FIXUP_VERBS, +@@ -9114,6 +9134,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1028, 0x0871, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), + SND_PCI_QUIRK(0x1028, 0x0872, "Dell 
Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), + SND_PCI_QUIRK(0x1028, 0x0873, "Dell Precision 3930", ALC255_FIXUP_DUMMY_LINEOUT_VERB), ++ SND_PCI_QUIRK(0x1028, 0x087d, "Dell Precision 5530", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x08ae, "Dell WYSE NB", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), +@@ -9130,6 +9151,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1028, 0x0a9d, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x0a9e, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x0b19, "Dell XPS 15 9520", ALC289_FIXUP_DUAL_SPK), ++ SND_PCI_QUIRK(0x1028, 0x0b1a, "Dell Precision 5570", ALC289_FIXUP_DUAL_SPK), + SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), +@@ -9257,6 +9279,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x103c, 0x8896, "HP EliteBook 855 G8 Notebook PC", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED), ++ SND_PCI_QUIRK(0x103c, 0x8902, "HP OMEN 16", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8971, "HP EliteBook 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8972, "HP EliteBook 840 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), +@@ -9304,10 +9327,11 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1043, 0x13b0, "ASUS Z550SA", ALC256_FIXUP_ASUS_MIC), + SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), + SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A), ++ SND_PCI_QUIRK(0x1043, 0x1662, "ASUS GV301QH", ALC294_FIXUP_ASUS_DUAL_SPK), ++ SND_PCI_QUIRK(0x1043, 0x16b2, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x16e3, "ASUS UX50", ALC269_FIXUP_STEREO_DMIC), + SND_PCI_QUIRK(0x1043, 0x1740, "ASUS UX430UA", ALC295_FIXUP_ASUS_DACS), + SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK), +- SND_PCI_QUIRK(0x1043, 0x1662, "ASUS GV301QH", ALC294_FIXUP_ASUS_DUAL_SPK), + SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS), + SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x18f1, "Asus FX505DT", ALC256_FIXUP_ASUS_HEADSET_MIC), +@@ -9323,14 +9347,16 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC), + SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), ++ SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), + SND_PCI_QUIRK(0x1043, 0x1ccd, "ASUS X555UB", ALC256_FIXUP_ASUS_MIC), ++ SND_PCI_QUIRK(0x1043, 0x1d42, "ASUS Zephyrus G14 2022", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), + SND_PCI_QUIRK(0x1043, 0x1e11, "ASUS 
Zephyrus G15", ALC289_FIXUP_ASUS_GA502), + SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), ++ SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), + SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), ++ SND_PCI_QUIRK(0x1043, 0x1c52, "ASUS Zephyrus G15 2022", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1f11, "ASUS Zephyrus G14", ALC289_FIXUP_ASUS_GA401), +- SND_PCI_QUIRK(0x1043, 0x1d42, "ASUS Zephyrus G14 2022", ALC289_FIXUP_ASUS_GA401), +- SND_PCI_QUIRK(0x1043, 0x16b2, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), + SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), + SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), +@@ -9532,6 +9558,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), + SND_PCI_QUIRK(0x1849, 0x1233, "ASRock NUC Box 1100", ALC233_FIXUP_NO_AUDIO_JACK), + SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS), ++ SND_PCI_QUIRK(0x19e5, 0x320f, "Huawei WRT-WX9 ", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1b35, 0x1235, "CZC B20", ALC269_FIXUP_CZC_B20), + SND_PCI_QUIRK(0x1b35, 0x1236, "CZC TMI", ALC269_FIXUP_CZC_TMI), + SND_PCI_QUIRK(0x1b35, 0x1237, "CZC L101", ALC269_FIXUP_CZC_L101), +diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c +index ff2aa13b7b26f..5d105c44b46df 100644 +--- a/sound/usb/endpoint.c ++++ b/sound/usb/endpoint.c +@@ -758,8 +758,7 @@ bool snd_usb_endpoint_compatible(struct snd_usb_audio *chip, + * The endpoint needs to be closed via snd_usb_endpoint_close() later. + * + * Note that this function doesn't configure the endpoint. The substream +- * needs to set it up later via snd_usb_endpoint_set_params() and +- * snd_usb_endpoint_prepare(). ++ * needs to set it up later via snd_usb_endpoint_configure(). + */ + struct snd_usb_endpoint * + snd_usb_endpoint_open(struct snd_usb_audio *chip, +@@ -1293,13 +1292,12 @@ out_of_memory: + /* + * snd_usb_endpoint_set_params: configure an snd_usb_endpoint + * +- * It's called either from hw_params callback. + * Determine the number of URBs to be used on this endpoint. + * An endpoint must be configured before it can be started. + * An endpoint that is already running can not be reconfigured. + */ +-int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep) ++static int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep) + { + const struct audioformat *fmt = ep->cur_audiofmt; + int err; +@@ -1382,18 +1380,18 @@ static int init_sample_rate(struct snd_usb_audio *chip, + } + + /* +- * snd_usb_endpoint_prepare: Prepare the endpoint ++ * snd_usb_endpoint_configure: Configure the endpoint + * + * This function sets up the EP to be fully usable state. +- * It's called either from prepare callback. ++ * It's called either from hw_params or prepare callback. + * The function checks need_setup flag, and performs nothing unless needed, + * so it's safe to call this multiple times. + * + * This returns zero if unchanged, 1 if the configuration has changed, + * or a negative error code. 
+ */ +-int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep) ++int snd_usb_endpoint_configure(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep) + { + bool iface_first; + int err = 0; +@@ -1414,6 +1412,9 @@ int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, + if (err < 0) + goto unlock; + } ++ err = snd_usb_endpoint_set_params(chip, ep); ++ if (err < 0) ++ goto unlock; + goto done; + } + +@@ -1441,6 +1442,10 @@ int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, + if (err < 0) + goto unlock; + ++ err = snd_usb_endpoint_set_params(chip, ep); ++ if (err < 0) ++ goto unlock; ++ + err = snd_usb_select_mode_quirk(chip, ep->cur_audiofmt); + if (err < 0) + goto unlock; +diff --git a/sound/usb/endpoint.h b/sound/usb/endpoint.h +index e67ea28faa54f..6a9af04cf175a 100644 +--- a/sound/usb/endpoint.h ++++ b/sound/usb/endpoint.h +@@ -17,10 +17,8 @@ snd_usb_endpoint_open(struct snd_usb_audio *chip, + bool is_sync_ep); + void snd_usb_endpoint_close(struct snd_usb_audio *chip, + struct snd_usb_endpoint *ep); +-int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep); +-int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep); ++int snd_usb_endpoint_configure(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep); + int snd_usb_endpoint_get_clock_rate(struct snd_usb_audio *chip, int clock); + + bool snd_usb_endpoint_compatible(struct snd_usb_audio *chip, +diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c +index 02035b545f9dd..e692ae04436a5 100644 +--- a/sound/usb/pcm.c ++++ b/sound/usb/pcm.c +@@ -443,17 +443,17 @@ static int configure_endpoints(struct snd_usb_audio *chip, + if (stop_endpoints(subs, false)) + sync_pending_stops(subs); + if (subs->sync_endpoint) { +- err = snd_usb_endpoint_prepare(chip, subs->sync_endpoint); ++ err = snd_usb_endpoint_configure(chip, subs->sync_endpoint); + if (err < 0) + return err; + } +- err = snd_usb_endpoint_prepare(chip, subs->data_endpoint); ++ err = snd_usb_endpoint_configure(chip, subs->data_endpoint); + if (err < 0) + return err; + snd_usb_set_format_quirk(subs, subs->cur_audiofmt); + } else { + if (subs->sync_endpoint) { +- err = snd_usb_endpoint_prepare(chip, subs->sync_endpoint); ++ err = snd_usb_endpoint_configure(chip, subs->sync_endpoint); + if (err < 0) + return err; + } +@@ -551,13 +551,7 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, + subs->cur_audiofmt = fmt; + mutex_unlock(&chip->mutex); + +- if (subs->sync_endpoint) { +- ret = snd_usb_endpoint_set_params(chip, subs->sync_endpoint); +- if (ret < 0) +- goto unlock; +- } +- +- ret = snd_usb_endpoint_set_params(chip, subs->data_endpoint); ++ ret = configure_endpoints(chip, subs); + + unlock: + if (ret < 0) +diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c +index 6b1bafe267a42..8ec5b9f344e02 100644 +--- a/tools/lib/perf/evlist.c ++++ b/tools/lib/perf/evlist.c +@@ -441,6 +441,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + + perf_evlist__for_each_entry(evlist, evsel) { + bool overwrite = evsel->attr.write_backward; ++ enum fdarray_flags flgs; + struct perf_mmap *map; + int *output, fd, cpu; + +@@ -504,8 +505,8 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + + revent = !overwrite ? POLLIN : 0; + +- if (!evsel->system_wide && +- perf_evlist__add_pollfd(evlist, fd, map, revent, fdarray_flag__default) < 0) { ++ flgs = evsel->system_wide ? 
fdarray_flag__nonfilterable : fdarray_flag__default; ++ if (perf_evlist__add_pollfd(evlist, fd, map, revent, flgs) < 0) { + perf_mmap__put(map); + return -1; + } +diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c +index 63b9db6574425..97c69a249c6e4 100644 +--- a/tools/perf/util/bpf_counter_cgroup.c ++++ b/tools/perf/util/bpf_counter_cgroup.c +@@ -95,7 +95,7 @@ static int bperf_load_program(struct evlist *evlist) + + perf_cpu_map__for_each_cpu(cpu, i, evlist->core.all_cpus) { + link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch, +- FD(cgrp_switch, cpu.cpu)); ++ FD(cgrp_switch, i)); + if (IS_ERR(link)) { + pr_err("Failed to attach cgroup program\n"); + err = PTR_ERR(link); +@@ -123,7 +123,7 @@ static int bperf_load_program(struct evlist *evlist) + + map_fd = bpf_map__fd(skel->maps.events); + perf_cpu_map__for_each_cpu(cpu, j, evlist->core.all_cpus) { +- int fd = FD(evsel, cpu.cpu); ++ int fd = FD(evsel, j); + __u32 idx = evsel->core.idx * total_cpus + cpu.cpu; + + err = bpf_map_update_elem(map_fd, &idx, &fd, +diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +index 292c430768b52..c72f8ad96f751 100644 +--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c ++++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c +@@ -176,7 +176,7 @@ static int bperf_cgroup_count(void) + } + + // This will be attached to cgroup-switches event for each cpu +-SEC("perf_events") ++SEC("perf_event") + int BPF_PROG(on_cgrp_switch) + { + return bperf_cgroup_count(); +diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c +index 953338b9e887e..02cd9f75e3d2f 100644 +--- a/tools/perf/util/genelf.c ++++ b/tools/perf/util/genelf.c +@@ -251,6 +251,7 @@ jit_write_elf(int fd, uint64_t load_addr, const char *sym, + Elf_Data *d; + Elf_Scn *scn; + Elf_Ehdr *ehdr; ++ Elf_Phdr *phdr; + Elf_Shdr *shdr; + uint64_t eh_frame_base_offset; + char *strsym = NULL; +@@ -285,6 +286,19 @@ jit_write_elf(int fd, uint64_t load_addr, const char *sym, + ehdr->e_version = EV_CURRENT; + ehdr->e_shstrndx= unwinding ? 
4 : 2; /* shdr index for section name */ + ++ /* ++ * setup program header ++ */ ++ phdr = elf_newphdr(e, 1); ++ phdr[0].p_type = PT_LOAD; ++ phdr[0].p_offset = 0; ++ phdr[0].p_vaddr = 0; ++ phdr[0].p_paddr = 0; ++ phdr[0].p_filesz = csize; ++ phdr[0].p_memsz = csize; ++ phdr[0].p_flags = PF_X | PF_R; ++ phdr[0].p_align = 8; ++ + /* + * setup text section + */ +diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h +index ae138afe6c563..b5c909546e3f2 100644 +--- a/tools/perf/util/genelf.h ++++ b/tools/perf/util/genelf.h +@@ -53,8 +53,10 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent + + #if GEN_ELF_CLASS == ELFCLASS64 + #define elf_newehdr elf64_newehdr ++#define elf_newphdr elf64_newphdr + #define elf_getshdr elf64_getshdr + #define Elf_Ehdr Elf64_Ehdr ++#define Elf_Phdr Elf64_Phdr + #define Elf_Shdr Elf64_Shdr + #define Elf_Sym Elf64_Sym + #define ELF_ST_TYPE(a) ELF64_ST_TYPE(a) +@@ -62,8 +64,10 @@ int jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_ent + #define ELF_ST_VIS(a) ELF64_ST_VISIBILITY(a) + #else + #define elf_newehdr elf32_newehdr ++#define elf_newphdr elf32_newphdr + #define elf_getshdr elf32_getshdr + #define Elf_Ehdr Elf32_Ehdr ++#define Elf_Phdr Elf32_Phdr + #define Elf_Shdr Elf32_Shdr + #define Elf_Sym Elf32_Sym + #define ELF_ST_TYPE(a) ELF32_ST_TYPE(a) +diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c +index 75bec32d4f571..647b7dff8ef36 100644 +--- a/tools/perf/util/symbol-elf.c ++++ b/tools/perf/util/symbol-elf.c +@@ -2102,8 +2102,8 @@ static int kcore_copy__compare_file(const char *from_dir, const char *to_dir, + * unusual. One significant peculiarity is that the mapping (start -> pgoff) + * is not the same for the kernel map and the modules map. That happens because + * the data is copied adjacently whereas the original kcore has gaps. Finally, +- * kallsyms and modules files are compared with their copies to check that +- * modules have not been loaded or unloaded while the copies were taking place. ++ * kallsyms file is compared with its copy to check that modules have not been ++ * loaded or unloaded while the copies were taking place. + * + * Return: %0 on success, %-1 on failure. + */ +@@ -2166,9 +2166,6 @@ int kcore_copy(const char *from_dir, const char *to_dir) + goto out_extract_close; + } + +- if (kcore_copy__compare_file(from_dir, to_dir, "modules")) +- goto out_extract_close; +- + if (kcore_copy__compare_file(from_dir, to_dir, "kallsyms")) + goto out_extract_close; + +diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c +index 84d17bd4efaed..64e273b2b1b21 100644 +--- a/tools/perf/util/synthetic-events.c ++++ b/tools/perf/util/synthetic-events.c +@@ -367,13 +367,24 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event, + bool is_kernel) + { + struct build_id bid; ++ struct nsinfo *nsi; ++ struct nscookie nc; + int rc; + +- if (is_kernel) ++ if (is_kernel) { + rc = sysfs__read_build_id("/sys/kernel/notes", &bid); +- else +- rc = filename__read_build_id(event->filename, &bid) > 0 ? 0 : -1; ++ goto out; ++ } ++ ++ nsi = nsinfo__new(event->pid); ++ nsinfo__mountns_enter(nsi, &nc); + ++ rc = filename__read_build_id(event->filename, &bid) > 0 ? 
0 : -1; ++ ++ nsinfo__mountns_exit(&nc); ++ nsinfo__put(nsi); ++ ++out: + if (rc == 0) { + memcpy(event->build_id, bid.data, sizeof(bid.data)); + event->build_id_size = (u8) bid.size; +diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh +index e714bae473fb4..81f31179ac887 100755 +--- a/tools/testing/selftests/net/forwarding/sch_red.sh ++++ b/tools/testing/selftests/net/forwarding/sch_red.sh +@@ -1,3 +1,4 @@ ++#!/bin/bash + # SPDX-License-Identifier: GPL-2.0 + + # This test sends one stream of traffic from H1 through a TBF shaper, to a RED diff --git a/sys-kernel/pinephone-sources/files/5.19.8-9.patch b/sys-kernel/pinephone-sources/files/5.19.8-9.patch new file mode 100644 index 0000000..f12fb56 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5.19.8-9.patch @@ -0,0 +1,8234 @@ +diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst +index 33b04db8408f9..fda97b3fcf018 100644 +--- a/Documentation/arm64/silicon-errata.rst ++++ b/Documentation/arm64/silicon-errata.rst +@@ -52,6 +52,8 @@ stable kernels. + | Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | + +----------------+-----------------+-----------------+-----------------------------+ + +----------------+-----------------+-----------------+-----------------------------+ ++| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 | +++----------------+-----------------+-----------------+-----------------------------+ + | ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 | + +----------------+-----------------+-----------------+-----------------------------+ + | ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 | +diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst +index 78ca69eda8778..02f4ad314a1eb 100644 +--- a/Documentation/hwmon/asus_ec_sensors.rst ++++ b/Documentation/hwmon/asus_ec_sensors.rst +@@ -13,12 +13,16 @@ Supported boards: + * ROG CROSSHAIR VIII FORMULA + * ROG CROSSHAIR VIII HERO + * ROG CROSSHAIR VIII IMPACT ++ * ROG MAXIMUS XI HERO ++ * ROG MAXIMUS XI HERO (WI-FI) + * ROG STRIX B550-E GAMING + * ROG STRIX B550-I GAMING + * ROG STRIX X570-E GAMING + * ROG STRIX X570-E GAMING WIFI II + * ROG STRIX X570-F GAMING + * ROG STRIX X570-I GAMING ++ * ROG STRIX Z690-A GAMING WIFI D4 ++ * ROG ZENITH II EXTREME + + Authors: + - Eugene Shalygin +diff --git a/Makefile b/Makefile +index e361c6230e9e5..1f27c4bd09e67 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 5 + PATCHLEVEL = 19 +-SUBLEVEL = 8 ++SUBLEVEL = 9 + EXTRAVERSION = + NAME = Superb Owl + +@@ -1286,8 +1286,7 @@ hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj + + PHONY += headers + headers: $(version_h) scripts_unifdef uapi-asm-generic archheaders archscripts +- $(if $(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/Kbuild),, \ +- $(error Headers not exportable for the $(SRCARCH) architecture)) ++ $(if $(filter um, $(SRCARCH)), $(error Headers not exportable for UML)) + $(Q)$(MAKE) $(hdr-inst)=include/uapi + $(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi + +diff --git a/arch/arm/boot/dts/at91-sama5d27_wlsom1.dtsi b/arch/arm/boot/dts/at91-sama5d27_wlsom1.dtsi +index ba621783acdbc..d6f364c6be94b 100644 +--- a/arch/arm/boot/dts/at91-sama5d27_wlsom1.dtsi ++++ b/arch/arm/boot/dts/at91-sama5d27_wlsom1.dtsi +@@ -76,8 +76,8 @@ + regulators { + vdd_3v3: VDD_IO { + regulator-name = "VDD_IO"; +- regulator-min-microvolt = <1200000>; +- 
regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <3300000>; ++ regulator-max-microvolt = <3300000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -95,8 +95,8 @@ + + vddio_ddr: VDD_DDR { + regulator-name = "VDD_DDR"; +- regulator-min-microvolt = <600000>; +- regulator-max-microvolt = <1850000>; ++ regulator-min-microvolt = <1200000>; ++ regulator-max-microvolt = <1200000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -118,8 +118,8 @@ + + vdd_core: VDD_CORE { + regulator-name = "VDD_CORE"; +- regulator-min-microvolt = <600000>; +- regulator-max-microvolt = <1850000>; ++ regulator-min-microvolt = <1250000>; ++ regulator-max-microvolt = <1250000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -160,8 +160,8 @@ + + LDO1 { + regulator-name = "LDO1"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <3300000>; ++ regulator-max-microvolt = <3300000>; + regulator-always-on; + + regulator-state-standby { +@@ -175,9 +175,8 @@ + + LDO2 { + regulator-name = "LDO2"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; +- regulator-always-on; ++ regulator-min-microvolt = <1800000>; ++ regulator-max-microvolt = <3300000>; + + regulator-state-standby { + regulator-on-in-suspend; +diff --git a/arch/arm/boot/dts/at91-sama5d2_icp.dts b/arch/arm/boot/dts/at91-sama5d2_icp.dts +index 164201a8fbf2d..492456e195a37 100644 +--- a/arch/arm/boot/dts/at91-sama5d2_icp.dts ++++ b/arch/arm/boot/dts/at91-sama5d2_icp.dts +@@ -197,8 +197,8 @@ + regulators { + vdd_io_reg: VDD_IO { + regulator-name = "VDD_IO"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <3300000>; ++ regulator-max-microvolt = <3300000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -216,8 +216,8 @@ + + VDD_DDR { + regulator-name = "VDD_DDR"; +- regulator-min-microvolt = <600000>; +- regulator-max-microvolt = <1850000>; ++ regulator-min-microvolt = <1350000>; ++ regulator-max-microvolt = <1350000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -235,8 +235,8 @@ + + VDD_CORE { + regulator-name = "VDD_CORE"; +- regulator-min-microvolt = <600000>; +- regulator-max-microvolt = <1850000>; ++ regulator-min-microvolt = <1250000>; ++ regulator-max-microvolt = <1250000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -258,7 +258,6 @@ + regulator-max-microvolt = <1850000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; +- regulator-always-on; + + regulator-state-standby { + regulator-on-in-suspend; +@@ -273,8 +272,8 @@ + + LDO1 { + regulator-name = "LDO1"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <2500000>; ++ regulator-max-microvolt = <2500000>; + regulator-always-on; + + regulator-state-standby { +@@ -288,8 +287,8 @@ + + LDO2 { + regulator-name = "LDO2"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <3300000>; ++ regulator-max-microvolt = <3300000>; + regulator-always-on; + + regulator-state-standby { +diff --git a/arch/arm/boot/dts/at91-sama7g5ek.dts b/arch/arm/boot/dts/at91-sama7g5ek.dts +index 103544620fd7c..b261b4da08502 100644 +--- 
a/arch/arm/boot/dts/at91-sama7g5ek.dts ++++ b/arch/arm/boot/dts/at91-sama7g5ek.dts +@@ -244,8 +244,8 @@ + regulators { + vdd_3v3: VDD_IO { + regulator-name = "VDD_IO"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <3300000>; ++ regulator-max-microvolt = <3300000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -264,8 +264,8 @@ + + vddioddr: VDD_DDR { + regulator-name = "VDD_DDR"; +- regulator-min-microvolt = <1300000>; +- regulator-max-microvolt = <1450000>; ++ regulator-min-microvolt = <1350000>; ++ regulator-max-microvolt = <1350000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -285,8 +285,8 @@ + + vddcore: VDD_CORE { + regulator-name = "VDD_CORE"; +- regulator-min-microvolt = <1100000>; +- regulator-max-microvolt = <1850000>; ++ regulator-min-microvolt = <1150000>; ++ regulator-max-microvolt = <1150000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-always-on; +@@ -306,7 +306,7 @@ + vddcpu: VDD_OTHER { + regulator-name = "VDD_OTHER"; + regulator-min-microvolt = <1050000>; +- regulator-max-microvolt = <1850000>; ++ regulator-max-microvolt = <1250000>; + regulator-initial-mode = <2>; + regulator-allowed-modes = <2>, <4>; + regulator-ramp-delay = <3125>; +@@ -326,8 +326,8 @@ + + vldo1: LDO1 { + regulator-name = "LDO1"; +- regulator-min-microvolt = <1200000>; +- regulator-max-microvolt = <3700000>; ++ regulator-min-microvolt = <1800000>; ++ regulator-max-microvolt = <1800000>; + regulator-always-on; + + regulator-state-standby { +diff --git a/arch/arm/boot/dts/imx6qdl-kontron-samx6i.dtsi b/arch/arm/boot/dts/imx6qdl-kontron-samx6i.dtsi +index 095c9143d99a3..6b791d515e294 100644 +--- a/arch/arm/boot/dts/imx6qdl-kontron-samx6i.dtsi ++++ b/arch/arm/boot/dts/imx6qdl-kontron-samx6i.dtsi +@@ -51,16 +51,6 @@ + vin-supply = <®_3p3v_s5>; + }; + +- reg_3p3v_s0: regulator-3p3v-s0 { +- compatible = "regulator-fixed"; +- regulator-name = "V_3V3_S0"; +- regulator-min-microvolt = <3300000>; +- regulator-max-microvolt = <3300000>; +- regulator-always-on; +- regulator-boot-on; +- vin-supply = <®_3p3v_s5>; +- }; +- + reg_3p3v_s5: regulator-3p3v-s5 { + compatible = "regulator-fixed"; + regulator-name = "V_3V3_S5"; +@@ -259,7 +249,7 @@ + + /* default boot source: workaround #1 for errata ERR006282 */ + smarc_flash: flash@0 { +- compatible = "winbond,w25q16dw", "jedec,spi-nor"; ++ compatible = "jedec,spi-nor"; + reg = <0>; + spi-max-frequency = <20000000>; + }; +diff --git a/arch/arm/boot/dts/imx6qdl-vicut1.dtsi b/arch/arm/boot/dts/imx6qdl-vicut1.dtsi +index a1676b5d2980f..c5a98b0110dd3 100644 +--- a/arch/arm/boot/dts/imx6qdl-vicut1.dtsi ++++ b/arch/arm/boot/dts/imx6qdl-vicut1.dtsi +@@ -28,7 +28,7 @@ + enable-gpios = <&gpio4 28 GPIO_ACTIVE_HIGH>; + }; + +- backlight_led: backlight_led { ++ backlight_led: backlight-led { + compatible = "pwm-backlight"; + pwms = <&pwm3 0 5000000 0>; + brightness-levels = <0 16 64 255>; +diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c +index df6d673e83d56..f4501dea98b04 100644 +--- a/arch/arm/mach-at91/pm.c ++++ b/arch/arm/mach-at91/pm.c +@@ -541,9 +541,41 @@ extern u32 at91_pm_suspend_in_sram_sz; + + static int at91_suspend_finish(unsigned long val) + { ++ unsigned char modified_gray_code[] = { ++ 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05, 0x0c, 0x0d, ++ 0x0e, 0x0f, 0x0a, 0x0b, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, ++ 0x1e, 0x1f, 0x1c, 0x1d, 0x14, 0x15, 0x16, 0x17, 
0x12, 0x13, ++ 0x10, 0x11, ++ }; ++ unsigned int tmp, index; + int i; + + if (soc_pm.data.mode == AT91_PM_BACKUP && soc_pm.data.ramc_phy) { ++ /* ++ * Bootloader will perform DDR recalibration and will try to ++ * restore the ZQ0SR0 with the value saved here. But the ++ * calibration is buggy and restoring some values from ZQ0SR0 ++ * is forbidden and risky thus we need to provide processed ++ * values for these (modified gray code values). ++ */ ++ tmp = readl(soc_pm.data.ramc_phy + DDR3PHY_ZQ0SR0); ++ ++ /* Store pull-down output impedance select. */ ++ index = (tmp >> DDR3PHY_ZQ0SR0_PDO_OFF) & 0x1f; ++ soc_pm.bu->ddr_phy_calibration[0] = modified_gray_code[index]; ++ ++ /* Store pull-up output impedance select. */ ++ index = (tmp >> DDR3PHY_ZQ0SR0_PUO_OFF) & 0x1f; ++ soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index]; ++ ++ /* Store pull-down on-die termination impedance select. */ ++ index = (tmp >> DDR3PHY_ZQ0SR0_PDODT_OFF) & 0x1f; ++ soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index]; ++ ++ /* Store pull-up on-die termination impedance select. */ ++ index = (tmp >> DDR3PHY_ZQ0SRO_PUODT_OFF) & 0x1f; ++ soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index]; ++ + /* + * The 1st 8 words of memory might get corrupted in the process + * of DDR PHY recalibration; it is saved here in securam and it +@@ -1066,10 +1098,6 @@ static int __init at91_pm_backup_init(void) + of_scan_flat_dt(at91_pm_backup_scan_memcs, &located); + if (!located) + goto securam_fail; +- +- /* DDR3PHY_ZQ0SR0 */ +- soc_pm.bu->ddr_phy_calibration[0] = readl(soc_pm.data.ramc_phy + +- 0x188); + } + + return 0; +diff --git a/arch/arm/mach-at91/pm_suspend.S b/arch/arm/mach-at91/pm_suspend.S +index abe4ced33edaf..ffed4d9490428 100644 +--- a/arch/arm/mach-at91/pm_suspend.S ++++ b/arch/arm/mach-at91/pm_suspend.S +@@ -172,9 +172,15 @@ sr_ena_2: + /* Put DDR PHY's DLL in bypass mode for non-backup modes. */ + cmp r7, #AT91_PM_BACKUP + beq sr_ena_3 +- ldr tmp1, [r3, #DDR3PHY_PIR] +- orr tmp1, tmp1, #DDR3PHY_PIR_DLLBYP +- str tmp1, [r3, #DDR3PHY_PIR] ++ ++ /* Disable DX DLLs. */ ++ ldr tmp1, [r3, #DDR3PHY_DX0DLLCR] ++ orr tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS ++ str tmp1, [r3, #DDR3PHY_DX0DLLCR] ++ ++ ldr tmp1, [r3, #DDR3PHY_DX1DLLCR] ++ orr tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS ++ str tmp1, [r3, #DDR3PHY_DX1DLLCR] + + sr_ena_3: + /* Power down DDR PHY data receivers. */ +@@ -221,10 +227,14 @@ sr_ena_3: + bic tmp1, tmp1, #DDR3PHY_DSGCR_ODTPDD_ODT0 + str tmp1, [r3, #DDR3PHY_DSGCR] + +- /* Take DDR PHY's DLL out of bypass mode. */ +- ldr tmp1, [r3, #DDR3PHY_PIR] +- bic tmp1, tmp1, #DDR3PHY_PIR_DLLBYP +- str tmp1, [r3, #DDR3PHY_PIR] ++ /* Enable DX DLLs. */ ++ ldr tmp1, [r3, #DDR3PHY_DX0DLLCR] ++ bic tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS ++ str tmp1, [r3, #DDR3PHY_DX0DLLCR] ++ ++ ldr tmp1, [r3, #DDR3PHY_DX1DLLCR] ++ bic tmp1, tmp1, #DDR3PHY_DXDLLCR_DLLDIS ++ str tmp1, [r3, #DDR3PHY_DX1DLLCR] + + /* Enable quasi-dynamic programming. */ + mov tmp1, #0 +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 001eaba5a6b4b..cc1e7bb49d38b 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -914,6 +914,23 @@ config ARM64_ERRATUM_1902691 + + If unsure, say Y. + ++config ARM64_ERRATUM_2457168 ++ bool "Cortex-A510: 2457168: workaround for AMEVCNTR01 incrementing incorrectly" ++ depends on ARM64_AMU_EXTN ++ default y ++ help ++ This option adds the workaround for ARM Cortex-A510 erratum 2457168. 
++ ++ The AMU counter AMEVCNTR01 (constant counter) should increment at the same rate ++ as the system counter. On affected Cortex-A510 cores AMEVCNTR01 increments ++ incorrectly giving a significantly higher output value. ++ ++ Work around this problem by returning 0 when reading the affected counter in ++ key locations that results in disabling all users of this counter. This effect ++ is the same to firmware disabling affected counters. ++ ++ If unsure, say Y. ++ + config CAVIUM_ERRATUM_22375 + bool "Cavium erratum 22375, 24313" + default y +@@ -1867,6 +1884,8 @@ config ARM64_BTI_KERNEL + depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697 + depends on !CC_IS_GCC || GCC_VERSION >= 100100 ++ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106671 ++ depends on !CC_IS_GCC + # https://github.com/llvm/llvm-project/commit/a88c722e687e6780dcd6a58718350dc76fcc4cc9 + depends on !CC_IS_CLANG || CLANG_VERSION >= 120000 + depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS) +diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds-65bb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds-65bb.dts +index 40d34c8384a5e..b949cac037427 100644 +--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds-65bb.dts ++++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds-65bb.dts +@@ -25,7 +25,6 @@ + &enetc_port0 { + phy-handle = <&slot1_sgmii>; + phy-mode = "2500base-x"; +- managed = "in-band-status"; + status = "okay"; + }; + +diff --git a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts +index 24737e89038a4..96cac0f969a77 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts ++++ b/arch/arm64/boot/dts/freescale/imx8mm-venice-gw7901.dts +@@ -626,24 +626,28 @@ + lan1: port@0 { + reg = <0>; + label = "lan1"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan2: port@1 { + reg = <1>; + label = "lan2"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan3: port@2 { + reg = <2>; + label = "lan3"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + + lan4: port@3 { + reg = <3>; + label = "lan4"; ++ phy-mode = "internal"; + local-mac-address = [00 00 00 00 00 00]; + }; + +diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +index eafa88d980b32..c2d4da25482ff 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +@@ -32,10 +32,10 @@ + }; + + /* Fixed clock dedicated to SPI CAN controller */ +- clk20m: oscillator { ++ clk40m: oscillator { + compatible = "fixed-clock"; + #clock-cells = <0>; +- clock-frequency = <20000000>; ++ clock-frequency = <40000000>; + }; + + gpio-keys { +@@ -194,8 +194,8 @@ + + can1: can@0 { + compatible = "microchip,mcp251xfd"; +- clocks = <&clk20m>; +- interrupts-extended = <&gpio1 6 IRQ_TYPE_EDGE_FALLING>; ++ clocks = <&clk40m>; ++ interrupts-extended = <&gpio1 6 IRQ_TYPE_LEVEL_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_can1_int>; + reg = <0>; +@@ -595,7 +595,7 @@ + pinctrl-0 = <&pinctrl_gpio_9_dsi>, <&pinctrl_i2s_2_bclk_touch_reset>; + reg = <0x4a>; + /* Verdin I2S_2_BCLK (TOUCH_RESET#, SODIMM 42) */ +- reset-gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>; ++ reset-gpios = <&gpio3 23 GPIO_ACTIVE_LOW>; + status = "disabled"; + }; + +@@ -737,6 +737,7 @@ + }; + + &usbphynop2 { ++ power-domains = <&pgc_otg2>; + vcc-supply = <®_vdd_3v3>; + }; + +diff --git 
a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +index 521215520a0f4..6630ec561dc25 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts ++++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +@@ -770,10 +770,10 @@ + + pinctrl_sai2: sai2grp { + fsl,pins = < +- MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_SAI2_TX_SYNC +- MX8MP_IOMUXC_SAI2_TXD0__AUDIOMIX_SAI2_TX_DATA00 +- MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_SAI2_TX_BCLK +- MX8MP_IOMUXC_SAI2_MCLK__AUDIOMIX_SAI2_MCLK ++ MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_SAI2_TX_SYNC 0xd6 ++ MX8MP_IOMUXC_SAI2_TXD0__AUDIOMIX_SAI2_TX_DATA00 0xd6 ++ MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_SAI2_TX_BCLK 0xd6 ++ MX8MP_IOMUXC_SAI2_MCLK__AUDIOMIX_SAI2_MCLK 0xd6 + >; + }; + +diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi +index fb17e329cd370..f5323291a9b24 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin.dtsi +@@ -620,7 +620,7 @@ + interrupts = <5 IRQ_TYPE_EDGE_FALLING>; + reg = <0x4a>; + /* Verdin GPIO_2 (SODIMM 208) */ +- reset-gpios = <&gpio1 1 GPIO_ACTIVE_HIGH>; ++ reset-gpios = <&gpio1 1 GPIO_ACTIVE_LOW>; + status = "disabled"; + }; + }; +@@ -697,7 +697,7 @@ + pinctrl-0 = <&pinctrl_gpio_9_dsi>, <&pinctrl_i2s_2_bclk_touch_reset>; + reg = <0x4a>; + /* Verdin I2S_2_BCLK (TOUCH_RESET#, SODIMM 42) */ +- reset-gpios = <&gpio5 0 GPIO_ACTIVE_HIGH>; ++ reset-gpios = <&gpio5 0 GPIO_ACTIVE_LOW>; + status = "disabled"; + }; + +diff --git a/arch/arm64/boot/dts/freescale/imx8mq-tqma8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq-tqma8mq.dtsi +index 899e8e7dbc24f..802ad6e5cef61 100644 +--- a/arch/arm64/boot/dts/freescale/imx8mq-tqma8mq.dtsi ++++ b/arch/arm64/boot/dts/freescale/imx8mq-tqma8mq.dtsi +@@ -204,7 +204,6 @@ + reg = <0x51>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rtc>; +- interrupt-names = "irq"; + interrupt-parent = <&gpio1>; + interrupts = <1 IRQ_TYPE_EDGE_FALLING>; + quartz-load-femtofarads = <7000>; +diff --git a/arch/arm64/boot/dts/renesas/r8a779g0.dtsi b/arch/arm64/boot/dts/renesas/r8a779g0.dtsi +index 7cbb0de060ddc..1c15726cff8bf 100644 +--- a/arch/arm64/boot/dts/renesas/r8a779g0.dtsi ++++ b/arch/arm64/boot/dts/renesas/r8a779g0.dtsi +@@ -85,7 +85,7 @@ + "renesas,rcar-gen4-hscif", + "renesas,hscif"; + reg = <0 0xe6540000 0 96>; +- interrupts = ; ++ interrupts = ; + clocks = <&cpg CPG_MOD 514>, + <&cpg CPG_CORE R8A779G0_CLK_S0D3_PER>, + <&scif_clk>; +diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c +index 5f4117dae8888..af137f91607da 100644 +--- a/arch/arm64/kernel/cpu_errata.c ++++ b/arch/arm64/kernel/cpu_errata.c +@@ -656,6 +656,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = { + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2) + }, + #endif ++#ifdef CONFIG_ARM64_ERRATUM_2457168 ++ { ++ .desc = "ARM erratum 2457168", ++ .capability = ARM64_WORKAROUND_2457168, ++ .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, ++ ++ /* Cortex-A510 r0p0-r1p1 */ ++ CAP_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1) ++ }, ++#endif + #ifdef CONFIG_ARM64_ERRATUM_2038923 + { + .desc = "ARM erratum 2038923", +diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c +index ebdfbd1cf207b..f34c9f8b9ee0a 100644 +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -1798,7 +1798,10 @@ static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) + pr_info("detected CPU%d: Activity Monitors Unit (AMU)\n", + 
smp_processor_id()); + cpumask_set_cpu(smp_processor_id(), &amu_cpus); +- update_freq_counters_refs(); ++ ++ /* 0 reference values signal broken/disabled counters */ ++ if (!this_cpu_has_cap(ARM64_WORKAROUND_2457168)) ++ update_freq_counters_refs(); + } + } + +diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c +index af5df48ba915b..2e248342476ea 100644 +--- a/arch/arm64/kernel/hibernate.c ++++ b/arch/arm64/kernel/hibernate.c +@@ -300,6 +300,11 @@ static void swsusp_mte_restore_tags(void) + unsigned long pfn = xa_state.xa_index; + struct page *page = pfn_to_online_page(pfn); + ++ /* ++ * It is not required to invoke page_kasan_tag_reset(page) ++ * at this point since the tags stored in page->flags are ++ * already restored. ++ */ + mte_restore_page_tags(page_address(page), tags); + + mte_free_tag_storage(tags); +diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c +index b2b730233274b..f6b00743c3994 100644 +--- a/arch/arm64/kernel/mte.c ++++ b/arch/arm64/kernel/mte.c +@@ -48,6 +48,15 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte, + if (!pte_is_tagged) + return; + ++ page_kasan_tag_reset(page); ++ /* ++ * We need smp_wmb() in between setting the flags and clearing the ++ * tags because if another thread reads page->flags and builds a ++ * tagged address out of it, there is an actual dependency to the ++ * memory access, but on the current thread we do not guarantee that ++ * the new page->flags are visible before the tags were updated. ++ */ ++ smp_wmb(); + mte_clear_page_tags(page_address(page)); + } + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index 9ab78ad826e2a..707b5451929d4 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -310,12 +310,25 @@ core_initcall(init_amu_fie); + + static void cpu_read_corecnt(void *val) + { ++ /* ++ * A value of 0 can be returned if the current CPU does not support AMUs ++ * or if the counter is disabled for this CPU. A return value of 0 at ++ * counter read is properly handled as an error case by the users of the ++ * counter. ++ */ + *(u64 *)val = read_corecnt(); + } + + static void cpu_read_constcnt(void *val) + { +- *(u64 *)val = read_constcnt(); ++ /* ++ * Return 0 if the current CPU is affected by erratum 2457168. A value ++ * of 0 is also returned if the current CPU does not support AMUs or if ++ * the counter is disabled. A return value of 0 at counter read is ++ * properly handled as an error case by the users of the counter. ++ */ ++ *(u64 *)val = this_cpu_has_cap(ARM64_WORKAROUND_2457168) ? ++ 0UL : read_constcnt(); + } + + static inline +@@ -342,7 +355,22 @@ int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) + */ + bool cpc_ffh_supported(void) + { +- return freq_counters_valid(get_cpu_with_amu_feat()); ++ int cpu = get_cpu_with_amu_feat(); ++ ++ /* ++ * FFH is considered supported if there is at least one present CPU that ++ * supports AMUs. Using FFH to read core and reference counters for CPUs ++ * that do not support AMUs, have counters disabled or that are affected ++ * by errata, will result in a return value of 0. ++ * ++ * This is done to allow any enabled and valid counters to be read ++ * through FFH, knowing that potentially returning 0 as counter value is ++ * properly handled by the users of these counters. 
++ */ ++ if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) ++ return false; ++ ++ return true; + } + + int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c +index 24913271e898c..0dea80bf6de46 100644 +--- a/arch/arm64/mm/copypage.c ++++ b/arch/arm64/mm/copypage.c +@@ -23,6 +23,15 @@ void copy_highpage(struct page *to, struct page *from) + + if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { + set_bit(PG_mte_tagged, &to->flags); ++ page_kasan_tag_reset(to); ++ /* ++ * We need smp_wmb() in between setting the flags and clearing the ++ * tags because if another thread reads page->flags and builds a ++ * tagged address out of it, there is an actual dependency to the ++ * memory access, but on the current thread we do not guarantee that ++ * the new page->flags are visible before the tags were updated. ++ */ ++ smp_wmb(); + mte_copy_page_tags(kto, kfrom); + } + } +diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c +index 4334dec93bd44..a9e50e930484a 100644 +--- a/arch/arm64/mm/mteswap.c ++++ b/arch/arm64/mm/mteswap.c +@@ -53,6 +53,15 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page) + if (!tags) + return false; + ++ page_kasan_tag_reset(page); ++ /* ++ * We need smp_wmb() in between setting the flags and clearing the ++ * tags because if another thread reads page->flags and builds a ++ * tagged address out of it, there is an actual dependency to the ++ * memory access, but on the current thread we do not guarantee that ++ * the new page->flags are visible before the tags were updated. ++ */ ++ smp_wmb(); + mte_restore_page_tags(page_address(page), tags); + + return true; +diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps +index 8809e14cf86a2..18999f46df19f 100644 +--- a/arch/arm64/tools/cpucaps ++++ b/arch/arm64/tools/cpucaps +@@ -66,6 +66,7 @@ WORKAROUND_1902691 + WORKAROUND_2038923 + WORKAROUND_2064142 + WORKAROUND_2077057 ++WORKAROUND_2457168 + WORKAROUND_TRBE_OVERWRITE_FILL_MODE + WORKAROUND_TSB_FLUSH_FAILURE + WORKAROUND_TRBE_WRITE_OUT_OF_RANGE +diff --git a/arch/mips/loongson32/ls1c/board.c b/arch/mips/loongson32/ls1c/board.c +index e9de6da0ce51f..9dcfe9de55b0a 100644 +--- a/arch/mips/loongson32/ls1c/board.c ++++ b/arch/mips/loongson32/ls1c/board.c +@@ -15,7 +15,6 @@ static struct platform_device *ls1c_platform_devices[] __initdata = { + static int __init ls1c_platform_init(void) + { + ls1x_serial_set_uartclk(&ls1x_uart_pdev); +- ls1x_rtc_set_extclk(&ls1x_rtc_pdev); + + return platform_add_devices(ls1c_platform_devices, + ARRAY_SIZE(ls1c_platform_devices)); +diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h +index 56ffd260c669b..0ec9cfc5131fc 100644 +--- a/arch/parisc/include/asm/bitops.h ++++ b/arch/parisc/include/asm/bitops.h +@@ -12,14 +12,6 @@ + #include + #include + +-/* compiler build environment sanity checks: */ +-#if !defined(CONFIG_64BIT) && defined(__LP64__) +-#error "Please use 'ARCH=parisc' to build the 32-bit kernel." +-#endif +-#if defined(CONFIG_64BIT) && !defined(__LP64__) +-#error "Please use 'ARCH=parisc64' to build the 64-bit kernel." +-#endif +- + /* See http://marc.theaimsgroup.com/?t=108826637900003 for discussion + * on use of volatile and __*_bit() (set/clear/change): + * *_bit() want use of volatile. 
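A note on the convention used by the arm64 AMU/erratum-2457168 changes above: a raw counter value of 0 is reserved to mean "counter missing, disabled, or erratum-affected", so every consumer must treat 0 as an error rather than a sample. A minimal userspace-style sketch of that pattern follows; read_counter() and sample_counter() are illustrative stand-ins, not the kernel's actual helpers.

	#include <errno.h>
	#include <stdint.h>

	/* Illustrative stand-in for an AMU counter read; returns 0 when the
	 * counter is missing, disabled, or affected by erratum 2457168. */
	static uint64_t read_counter(void)
	{
		return 0; /* pretend the counter is unavailable */
	}

	/* Mirrors the convention from the series above: 0 is never treated
	 * as a valid sample, it is turned into an error instead. */
	static int sample_counter(uint64_t *out)
	{
		uint64_t val = read_counter();

		if (!val)
			return -EOPNOTSUPP;

		*out = val;
		return 0;
	}

This matches the comments added in topology.c above, where a 0 read is documented as an error case to be handled by the counter's users.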
+diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S +index e0a9e96576221..fd15fd4bbb61b 100644 +--- a/arch/parisc/kernel/head.S ++++ b/arch/parisc/kernel/head.S +@@ -22,7 +22,7 @@ + #include + #include + +- .level PA_ASM_LEVEL ++ .level 1.1 + + __INITDATA + ENTRY(boot_args) +@@ -70,6 +70,47 @@ $bss_loop: + stw,ma %arg2,4(%r1) + stw,ma %arg3,4(%r1) + ++#if !defined(CONFIG_64BIT) && defined(CONFIG_PA20) ++ /* This 32-bit kernel was compiled for PA2.0 CPUs. Check current CPU ++ * and halt kernel if we detect a PA1.x CPU. */ ++ ldi 32,%r10 ++ mtctl %r10,%cr11 ++ .level 2.0 ++ mfctl,w %cr11,%r10 ++ .level 1.1 ++ comib,<>,n 0,%r10,$cpu_ok ++ ++ load32 PA(msg1),%arg0 ++ ldi msg1_end-msg1,%arg1 ++$iodc_panic: ++ copy %arg0, %r10 ++ copy %arg1, %r11 ++ load32 PA(init_stack),%sp ++#define MEM_CONS 0x3A0 ++ ldw MEM_CONS+32(%r0),%arg0 // HPA ++ ldi ENTRY_IO_COUT,%arg1 ++ ldw MEM_CONS+36(%r0),%arg2 // SPA ++ ldw MEM_CONS+8(%r0),%arg3 // layers ++ load32 PA(__bss_start),%r1 ++ stw %r1,-52(%sp) // arg4 ++ stw %r0,-56(%sp) // arg5 ++ stw %r10,-60(%sp) // arg6 = ptr to text ++ stw %r11,-64(%sp) // arg7 = len ++ stw %r0,-68(%sp) // arg8 ++ load32 PA(.iodc_panic_ret), %rp ++ ldw MEM_CONS+40(%r0),%r1 // ENTRY_IODC ++ bv,n (%r1) ++.iodc_panic_ret: ++ b . /* wait endless with ... */ ++ or %r10,%r10,%r10 /* qemu idle sleep */ ++msg1: .ascii "Can't boot kernel which was built for PA8x00 CPUs on this machine.\r\n" ++msg1_end: ++ ++$cpu_ok: ++#endif ++ ++ .level PA_ASM_LEVEL ++ + /* Initialize startup VM. Just map first 16/32 MB of memory */ + load32 PA(swapper_pg_dir),%r4 + mtctl %r4,%cr24 /* Initialize kernel root pointer */ +diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi +index 9f5bce1488d93..9bf37ef379509 100644 +--- a/arch/riscv/boot/dts/microchip/mpfs.dtsi ++++ b/arch/riscv/boot/dts/microchip/mpfs.dtsi +@@ -161,7 +161,7 @@ + ranges; + + cctrllr: cache-controller@2010000 { +- compatible = "sifive,fu540-c000-ccache", "cache"; ++ compatible = "microchip,mpfs-ccache", "sifive,fu540-c000-ccache", "cache"; + reg = <0x0 0x2010000 0x0 0x1000>; + cache-block-size = <64>; + cache-level = <2>; +diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c +index 53ed3884fe644..5d66e3947070c 100644 +--- a/arch/s390/kernel/nmi.c ++++ b/arch/s390/kernel/nmi.c +@@ -63,7 +63,7 @@ static inline unsigned long nmi_get_mcesa_size(void) + * structure. The structure is required for machine check happening + * early in the boot process. 
+ */ +-static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); ++static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); + + void __init nmi_alloc_mcesa_early(u64 *mcesad) + { +diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c +index 0a37f5de28631..3e0361db963ef 100644 +--- a/arch/s390/kernel/setup.c ++++ b/arch/s390/kernel/setup.c +@@ -486,6 +486,7 @@ static void __init setup_lowcore_dat_off(void) + put_abs_lowcore(restart_data, lc->restart_data); + put_abs_lowcore(restart_source, lc->restart_source); + put_abs_lowcore(restart_psw, lc->restart_psw); ++ put_abs_lowcore(mcesad, lc->mcesad); + + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; +diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h +index 4a23e52fe0ee1..ebc271bb6d8ed 100644 +--- a/arch/x86/include/asm/sev.h ++++ b/arch/x86/include/asm/sev.h +@@ -195,7 +195,7 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); + void snp_set_memory_private(unsigned long vaddr, unsigned int npages); + void snp_set_wakeup_secondary_cpu(void); + bool snp_init(struct boot_params *bp); +-void snp_abort(void); ++void __init __noreturn snp_abort(void); + int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); + #else + static inline void sev_es_ist_enter(struct pt_regs *regs) { } +diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c +index 4f84c3f11af5b..a428c62330d37 100644 +--- a/arch/x86/kernel/sev.c ++++ b/arch/x86/kernel/sev.c +@@ -2112,7 +2112,7 @@ bool __init snp_init(struct boot_params *bp) + return true; + } + +-void __init snp_abort(void) ++void __init __noreturn snp_abort(void) + { + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } +diff --git a/block/partitions/core.c b/block/partitions/core.c +index 8a0ec929023bc..76617b1d2d47f 100644 +--- a/block/partitions/core.c ++++ b/block/partitions/core.c +@@ -597,6 +597,9 @@ static int blk_add_partitions(struct gendisk *disk) + if (disk->flags & GENHD_FL_NO_PART) + return 0; + ++ if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) ++ return 0; ++ + state = check_partition(disk); + if (!state) + return 0; +diff --git a/drivers/base/driver.c b/drivers/base/driver.c +index 15a75afe6b845..676b6275d5b53 100644 +--- a/drivers/base/driver.c ++++ b/drivers/base/driver.c +@@ -63,6 +63,12 @@ int driver_set_override(struct device *dev, const char **override, + if (len >= (PAGE_SIZE - 1)) + return -EINVAL; + ++ /* ++ * Compute the real length of the string in case userspace sends us a ++ * bunch of \0 characters like python likes to do. 
++ */ ++ len = strlen(s); ++ + if (!len) { + /* Empty string passed - clear override */ + device_lock(dev); +diff --git a/drivers/base/regmap/regmap-spi.c b/drivers/base/regmap/regmap-spi.c +index 719323bc6c7f1..37ab23a9d0345 100644 +--- a/drivers/base/regmap/regmap-spi.c ++++ b/drivers/base/regmap/regmap-spi.c +@@ -113,6 +113,7 @@ static const struct regmap_bus *regmap_get_spi_bus(struct spi_device *spi, + const struct regmap_config *config) + { + size_t max_size = spi_max_transfer_size(spi); ++ size_t max_msg_size, reg_reserve_size; + struct regmap_bus *bus; + + if (max_size != SIZE_MAX) { +@@ -120,9 +121,16 @@ static const struct regmap_bus *regmap_get_spi_bus(struct spi_device *spi, + if (!bus) + return ERR_PTR(-ENOMEM); + ++ max_msg_size = spi_max_message_size(spi); ++ reg_reserve_size = config->reg_bits / BITS_PER_BYTE ++ + config->pad_bits / BITS_PER_BYTE; ++ if (max_size + reg_reserve_size > max_msg_size) ++ max_size -= reg_reserve_size; ++ + bus->free_on_exit = true; + bus->max_raw_read = max_size; + bus->max_raw_write = max_size; ++ + return bus; + } + +diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c +index 2cad427741647..f9fd1b6c15d42 100644 +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -532,7 +532,7 @@ static unsigned int __resolve_freq(struct cpufreq_policy *policy, + + target_freq = clamp_val(target_freq, policy->min, policy->max); + +- if (!cpufreq_driver->target_index) ++ if (!policy->freq_table) + return target_freq; + + idx = cpufreq_frequency_table_target(policy, target_freq, relation); +diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c +index 4dde8edd53b62..3e8d4b51a8140 100644 +--- a/drivers/firmware/efi/capsule-loader.c ++++ b/drivers/firmware/efi/capsule-loader.c +@@ -242,29 +242,6 @@ failed: + return ret; + } + +-/** +- * efi_capsule_flush - called by file close or file flush +- * @file: file pointer +- * @id: not used +- * +- * If a capsule is being partially uploaded then calling this function +- * will be treated as upload termination and will free those completed +- * buffer pages and -ECANCELED will be returned. 
+- **/ +-static int efi_capsule_flush(struct file *file, fl_owner_t id) +-{ +- int ret = 0; +- struct capsule_info *cap_info = file->private_data; +- +- if (cap_info->index > 0) { +- pr_err("capsule upload not complete\n"); +- efi_free_all_buff_pages(cap_info); +- ret = -ECANCELED; +- } +- +- return ret; +-} +- + /** + * efi_capsule_release - called by file close + * @inode: not used +@@ -277,6 +254,13 @@ static int efi_capsule_release(struct inode *inode, struct file *file) + { + struct capsule_info *cap_info = file->private_data; + ++ if (cap_info->index > 0 && ++ (cap_info->header.headersize == 0 || ++ cap_info->count < cap_info->total_size)) { ++ pr_err("capsule upload not complete\n"); ++ efi_free_all_buff_pages(cap_info); ++ } ++ + kfree(cap_info->pages); + kfree(cap_info->phys); + kfree(file->private_data); +@@ -324,7 +308,6 @@ static const struct file_operations efi_capsule_fops = { + .owner = THIS_MODULE, + .open = efi_capsule_open, + .write = efi_capsule_write, +- .flush = efi_capsule_flush, + .release = efi_capsule_release, + .llseek = no_llseek, + }; +diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile +index d0537573501e9..2c67f71f23753 100644 +--- a/drivers/firmware/efi/libstub/Makefile ++++ b/drivers/firmware/efi/libstub/Makefile +@@ -37,6 +37,13 @@ KBUILD_CFLAGS := $(cflags-y) -Os -DDISABLE_BRANCH_PROFILING \ + $(call cc-option,-fno-addrsig) \ + -D__DISABLE_EXPORTS + ++# ++# struct randomization only makes sense for Linux internal types, which the EFI ++# stub code never touches, so let's turn off struct randomization for the stub ++# altogether ++# ++KBUILD_CFLAGS := $(filter-out $(RANDSTRUCT_CFLAGS), $(KBUILD_CFLAGS)) ++ + # remove SCS flags from all objects in this directory + KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) + # disable LTO +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 3adebb63680e0..67d4a3c13ed19 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -2482,12 +2482,14 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) + if (!hive->reset_domain || + !amdgpu_reset_get_reset_domain(hive->reset_domain)) { + r = -ENOENT; ++ amdgpu_put_xgmi_hive(hive); + goto init_failed; + } + + /* Drop the early temporary reset domain we created for device */ + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = hive->reset_domain; ++ amdgpu_put_xgmi_hive(hive); + } + } + +@@ -4473,8 +4475,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, + retry: + amdgpu_amdkfd_pre_reset(adev); + +- amdgpu_amdkfd_pre_reset(adev); +- + if (from_hypervisor) + r = amdgpu_virt_request_full_gpu(adev, true); + else +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +index e9411c28d88ba..2b00f8fe15a89 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +@@ -2612,6 +2612,9 @@ static int psp_hw_fini(void *handle) + psp_rap_terminate(psp); + psp_dtm_terminate(psp); + psp_hdcp_terminate(psp); ++ ++ if (adev->gmc.xgmi.num_physical_nodes > 1) ++ psp_xgmi_terminate(psp); + } + + psp_asd_terminate(psp); +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +index 1b108d03e7859..f2aebbf3fbe38 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +@@ -742,7 +742,7 @@ int 
amdgpu_xgmi_remove_device(struct amdgpu_device *adev) + amdgpu_put_xgmi_hive(hive); + } + +- return psp_xgmi_terminate(&adev->psp); ++ return 0; + } + + static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +index a4a6751b1e449..30998ac47707c 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +@@ -5090,9 +5090,12 @@ static void gfx_v11_0_update_coarse_grain_clock_gating(struct amdgpu_device *ade + data = REG_SET_FIELD(data, SDMA0_RLC_CGCG_CTRL, CGCG_INT_ENABLE, 1); + WREG32_SOC15(GC, 0, regSDMA0_RLC_CGCG_CTRL, data); + +- data = RREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL); +- data = REG_SET_FIELD(data, SDMA1_RLC_CGCG_CTRL, CGCG_INT_ENABLE, 1); +- WREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL, data); ++ /* Some ASICs only have one SDMA instance, not need to configure SDMA1 */ ++ if (adev->sdma.num_instances > 1) { ++ data = RREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL); ++ data = REG_SET_FIELD(data, SDMA1_RLC_CGCG_CTRL, CGCG_INT_ENABLE, 1); ++ WREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL, data); ++ } + } else { + /* Program RLC_CGCG_CGLS_CTRL */ + def = data = RREG32_SOC15(GC, 0, regRLC_CGCG_CGLS_CTRL); +@@ -5121,9 +5124,12 @@ static void gfx_v11_0_update_coarse_grain_clock_gating(struct amdgpu_device *ade + data &= ~SDMA0_RLC_CGCG_CTRL__CGCG_INT_ENABLE_MASK; + WREG32_SOC15(GC, 0, regSDMA0_RLC_CGCG_CTRL, data); + +- data = RREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL); +- data &= ~SDMA1_RLC_CGCG_CTRL__CGCG_INT_ENABLE_MASK; +- WREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL, data); ++ /* Some ASICs only have one SDMA instance, not need to configure SDMA1 */ ++ if (adev->sdma.num_instances > 1) { ++ data = RREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL); ++ data &= ~SDMA1_RLC_CGCG_CTRL__CGCG_INT_ENABLE_MASK; ++ WREG32_SOC15(GC, 0, regSDMA1_RLC_CGCG_CTRL, data); ++ } + } + } + +diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +index 5349ca4d19e38..6d8ff3b099422 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +@@ -2587,7 +2587,8 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev) + + gfx_v9_0_tiling_mode_table_init(adev); + +- gfx_v9_0_setup_rb(adev); ++ if (adev->gfx.num_gfx_rings) ++ gfx_v9_0_setup_rb(adev); + gfx_v9_0_get_cu_info(adev, &adev->gfx.cu_info); + adev->gfx.config.db_debug2 = RREG32_SOC15(GC, 0, mmDB_DEBUG2); + +diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c +index 3f44a099c52a4..3e51e773f92be 100644 +--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c +@@ -176,6 +176,7 @@ static void mmhub_v1_0_init_cache_regs(struct amdgpu_device *adev) + tmp = REG_SET_FIELD(tmp, VM_L2_CNTL2, INVALIDATE_L2_CACHE, 1); + WREG32_SOC15(MMHUB, 0, mmVM_L2_CNTL2, tmp); + ++ tmp = mmVM_L2_CNTL3_DEFAULT; + if (adev->gmc.translate_further) { + tmp = REG_SET_FIELD(tmp, VM_L2_CNTL3, BANK_SELECT, 12); + tmp = REG_SET_FIELD(tmp, VM_L2_CNTL3, +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +index c7a592d68febf..275bfb8ca6f89 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +@@ -3188,7 +3188,7 @@ void crtc_debugfs_init(struct drm_crtc *crtc) + &crc_win_y_end_fops); + 
debugfs_create_file_unsafe("crc_win_update", 0644, dir, crtc, + &crc_win_update_fops); +- ++ dput(dir); + } + #endif + /* +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c +index 30c6f9cd717f3..27fbe906682f9 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c +@@ -41,6 +41,12 @@ + #define FN(reg_name, field) \ + FD(reg_name##__##field) + ++#include "logger_types.h" ++#undef DC_LOGGER ++#define DC_LOGGER \ ++ CTX->logger ++#define smu_print(str, ...) {DC_LOG_SMU(str, ##__VA_ARGS__); } ++ + #define VBIOSSMC_MSG_TestMessage 0x1 + #define VBIOSSMC_MSG_GetSmuVersion 0x2 + #define VBIOSSMC_MSG_PowerUpGfx 0x3 +@@ -95,7 +101,13 @@ static int rn_vbios_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr, + uint32_t result; + + result = rn_smu_wait_for_response(clk_mgr, 10, 200000); +- ASSERT(result == VBIOSSMC_Result_OK); ++ ++ if (result != VBIOSSMC_Result_OK) ++ smu_print("SMU Response was not OK. SMU response after wait received is: %d\n", result); ++ ++ if (result == VBIOSSMC_Status_BUSY) { ++ return -1; ++ } + + /* First clear response register */ + REG_WRITE(MP1_SMN_C2PMSG_91, VBIOSSMC_Status_BUSY); +@@ -176,6 +188,10 @@ int rn_vbios_smu_set_hard_min_dcfclk(struct clk_mgr_internal *clk_mgr, int reque + VBIOSSMC_MSG_SetHardMinDcfclkByFreq, + khz_to_mhz_ceil(requested_dcfclk_khz)); + ++#ifdef DBG ++ smu_print("actual_dcfclk_set_mhz %d is set to : %d\n", actual_dcfclk_set_mhz, actual_dcfclk_set_mhz * 1000); ++#endif ++ + return actual_dcfclk_set_mhz * 1000; + } + +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/dcn301_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/dcn301_smu.c +index 1cae01a91a69d..e4f96b6fd79d0 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/dcn301_smu.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/dcn301_smu.c +@@ -41,6 +41,12 @@ + #define FN(reg_name, field) \ + FD(reg_name##__##field) + ++#include "logger_types.h" ++#undef DC_LOGGER ++#define DC_LOGGER \ ++ CTX->logger ++#define smu_print(str, ...) {DC_LOG_SMU(str, ##__VA_ARGS__); } ++ + #define VBIOSSMC_MSG_GetSmuVersion 0x2 + #define VBIOSSMC_MSG_SetDispclkFreq 0x4 + #define VBIOSSMC_MSG_SetDprefclkFreq 0x5 +@@ -96,6 +102,13 @@ static int dcn301_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr, + + result = dcn301_smu_wait_for_response(clk_mgr, 10, 200000); + ++ if (result != VBIOSSMC_Result_OK) ++ smu_print("SMU Response was not OK. 
SMU response after wait received is: %d\n", result); ++ ++ if (result == VBIOSSMC_Status_BUSY) { ++ return -1; ++ } ++ + /* First clear response register */ + REG_WRITE(MP1_SMN_C2PMSG_91, VBIOSSMC_Status_BUSY); + +@@ -167,6 +180,10 @@ int dcn301_smu_set_hard_min_dcfclk(struct clk_mgr_internal *clk_mgr, int request + VBIOSSMC_MSG_SetHardMinDcfclkByFreq, + khz_to_mhz_ceil(requested_dcfclk_khz)); + ++#ifdef DBG ++ smu_print("actual_dcfclk_set_mhz %d is set to : %d\n", actual_dcfclk_set_mhz, actual_dcfclk_set_mhz * 1000); ++#endif ++ + return actual_dcfclk_set_mhz * 1000; + } + +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c +index c5d7d075026f3..090b2c02aee17 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn31/dcn31_smu.c +@@ -40,6 +40,12 @@ + #define FN(reg_name, field) \ + FD(reg_name##__##field) + ++#include "logger_types.h" ++#undef DC_LOGGER ++#define DC_LOGGER \ ++ CTX->logger ++#define smu_print(str, ...) {DC_LOG_SMU(str, ##__VA_ARGS__); } ++ + #define VBIOSSMC_MSG_TestMessage 0x1 + #define VBIOSSMC_MSG_GetSmuVersion 0x2 + #define VBIOSSMC_MSG_PowerUpGfx 0x3 +@@ -102,7 +108,9 @@ static int dcn31_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr, + uint32_t result; + + result = dcn31_smu_wait_for_response(clk_mgr, 10, 200000); +- ASSERT(result == VBIOSSMC_Result_OK); ++ ++ if (result != VBIOSSMC_Result_OK) ++ smu_print("SMU Response was not OK. SMU response after wait received is: %d\n", result); + + if (result == VBIOSSMC_Status_BUSY) { + return -1; +@@ -194,6 +202,10 @@ int dcn31_smu_set_hard_min_dcfclk(struct clk_mgr_internal *clk_mgr, int requeste + VBIOSSMC_MSG_SetHardMinDcfclkByFreq, + khz_to_mhz_ceil(requested_dcfclk_khz)); + ++#ifdef DBG ++ smu_print("actual_dcfclk_set_mhz %d is set to : %d\n", actual_dcfclk_set_mhz, actual_dcfclk_set_mhz * 1000); ++#endif ++ + return actual_dcfclk_set_mhz * 1000; + } + +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_smu.c +index 2600313fea579..925d6e13620ec 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_smu.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_smu.c +@@ -70,6 +70,12 @@ static const struct IP_BASE NBIO_BASE = { { { { 0x00000000, 0x00000014, 0x00000D + #define REG_NBIO(reg_name) \ + (NBIO_BASE.instance[0].segment[regBIF_BX_PF2_ ## reg_name ## _BASE_IDX] + regBIF_BX_PF2_ ## reg_name) + ++#include "logger_types.h" ++#undef DC_LOGGER ++#define DC_LOGGER \ ++ CTX->logger ++#define smu_print(str, ...) {DC_LOG_SMU(str, ##__VA_ARGS__); } ++ + #define mmMP1_C2PMSG_3 0x3B1050C + + #define VBIOSSMC_MSG_TestMessage 0x01 ///< To check if PMFW is alive and responding. Requirement specified by PMFW team +@@ -130,7 +136,9 @@ static int dcn315_smu_send_msg_with_param( + uint32_t result; + + result = dcn315_smu_wait_for_response(clk_mgr, 10, 200000); +- ASSERT(result == VBIOSSMC_Result_OK); ++ ++ if (result != VBIOSSMC_Result_OK) ++ smu_print("SMU Response was not OK. 
SMU response after wait received is: %d\n", result); + + if (result == VBIOSSMC_Status_BUSY) { + return -1; +@@ -197,6 +205,10 @@ int dcn315_smu_set_hard_min_dcfclk(struct clk_mgr_internal *clk_mgr, int request + VBIOSSMC_MSG_SetHardMinDcfclkByFreq, + khz_to_mhz_ceil(requested_dcfclk_khz)); + ++#ifdef DBG ++ smu_print("actual_dcfclk_set_mhz %d is set to : %d\n", actual_dcfclk_set_mhz, actual_dcfclk_set_mhz * 1000); ++#endif ++ + return actual_dcfclk_set_mhz * 1000; + } + +diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_smu.c +index dceec4b960527..457a9254ae1c8 100644 +--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_smu.c ++++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_smu.c +@@ -58,6 +58,12 @@ static const struct IP_BASE MP0_BASE = { { { { 0x00016000, 0x00DC0000, 0x00E0000 + #define FN(reg_name, field) \ + FD(reg_name##__##field) + ++#include "logger_types.h" ++#undef DC_LOGGER ++#define DC_LOGGER \ ++ CTX->logger ++#define smu_print(str, ...) {DC_LOG_SMU(str, ##__VA_ARGS__); } ++ + #define VBIOSSMC_MSG_TestMessage 0x01 ///< To check if PMFW is alive and responding. Requirement specified by PMFW team + #define VBIOSSMC_MSG_GetPmfwVersion 0x02 ///< Get PMFW version + #define VBIOSSMC_MSG_Spare0 0x03 ///< Spare0 +@@ -118,7 +124,9 @@ static int dcn316_smu_send_msg_with_param( + uint32_t result; + + result = dcn316_smu_wait_for_response(clk_mgr, 10, 200000); +- ASSERT(result == VBIOSSMC_Result_OK); ++ ++ if (result != VBIOSSMC_Result_OK) ++ smu_print("SMU Response was not OK. SMU response after wait received is: %d\n", result); + + if (result == VBIOSSMC_Status_BUSY) { + return -1; +@@ -183,6 +191,10 @@ int dcn316_smu_set_hard_min_dcfclk(struct clk_mgr_internal *clk_mgr, int request + VBIOSSMC_MSG_SetHardMinDcfclkByFreq, + khz_to_mhz_ceil(requested_dcfclk_khz)); + ++#ifdef DBG ++ smu_print("actual_dcfclk_set_mhz %d is set to : %d\n", actual_dcfclk_set_mhz, actual_dcfclk_set_mhz * 1000); ++#endif ++ + return actual_dcfclk_set_mhz * 1000; + } + +diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c +index 86d670c712867..ad068865ba206 100644 +--- a/drivers/gpu/drm/drm_gem.c ++++ b/drivers/gpu/drm/drm_gem.c +@@ -168,21 +168,6 @@ void drm_gem_private_object_init(struct drm_device *dev, + } + EXPORT_SYMBOL(drm_gem_private_object_init); + +-static void +-drm_gem_remove_prime_handles(struct drm_gem_object *obj, struct drm_file *filp) +-{ +- /* +- * Note: obj->dma_buf can't disappear as long as we still hold a +- * handle reference in obj->handle_count. +- */ +- mutex_lock(&filp->prime.lock); +- if (obj->dma_buf) { +- drm_prime_remove_buf_handle_locked(&filp->prime, +- obj->dma_buf); +- } +- mutex_unlock(&filp->prime.lock); +-} +- + /** + * drm_gem_object_handle_free - release resources bound to userspace handles + * @obj: GEM object to clean up. 
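The drm_gem/drm_prime change in this hunk replaces the dma_buf-keyed removal above with a handle-keyed walk of the prime bookkeeping rbtree. A simplified sketch of that lookup-and-erase pattern is below; struct member and remove_by_handle() are reduced illustrations, not the real struct drm_prime_member or DRM API.

	#include <linux/rbtree.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	/* Reduced stand-in for the bookkeeping entry; the real structure is
	 * also linked into a second, dma_buf-keyed tree that gets erased in
	 * the same pass. */
	struct member {
		struct rb_node handle_rb;
		u32 handle;
	};

	/* Walk the handle-keyed rbtree, unlink the matching node and free it,
	 * following the same ordering as the replacement code in this patch. */
	static void remove_by_handle(struct rb_root *handles, u32 handle)
	{
		struct rb_node *rb = handles->rb_node;

		while (rb) {
			struct member *m = rb_entry(rb, struct member, handle_rb);

			if (m->handle == handle) {
				rb_erase(&m->handle_rb, handles);
				kfree(m);
				return;
			}
			rb = m->handle < handle ? rb->rb_right : rb->rb_left;
		}
	}

Keying the erase on the GEM handle avoids dereferencing obj->dma_buf in the release path, which is the point of the drm_gem.c and drm_prime.c hunks that follow.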
+@@ -253,7 +238,7 @@ drm_gem_object_release_handle(int id, void *ptr, void *data) + if (obj->funcs->close) + obj->funcs->close(obj, file_priv); + +- drm_gem_remove_prime_handles(obj, file_priv); ++ drm_prime_remove_buf_handle(&file_priv->prime, id); + drm_vma_node_revoke(&obj->vma_node, file_priv); + + drm_gem_object_handle_put_unlocked(obj); +diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h +index 1fbbc19f1ac09..7bb98e6a446d0 100644 +--- a/drivers/gpu/drm/drm_internal.h ++++ b/drivers/gpu/drm/drm_internal.h +@@ -74,8 +74,8 @@ int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data, + + void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv); + void drm_prime_destroy_file_private(struct drm_prime_file_private *prime_fpriv); +-void drm_prime_remove_buf_handle_locked(struct drm_prime_file_private *prime_fpriv, +- struct dma_buf *dma_buf); ++void drm_prime_remove_buf_handle(struct drm_prime_file_private *prime_fpriv, ++ uint32_t handle); + + /* drm_drv.c */ + struct drm_minor *drm_minor_acquire(unsigned int minor_id); +diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c +index e3f09f18110c7..bd5366b16381b 100644 +--- a/drivers/gpu/drm/drm_prime.c ++++ b/drivers/gpu/drm/drm_prime.c +@@ -190,29 +190,33 @@ static int drm_prime_lookup_buf_handle(struct drm_prime_file_private *prime_fpri + return -ENOENT; + } + +-void drm_prime_remove_buf_handle_locked(struct drm_prime_file_private *prime_fpriv, +- struct dma_buf *dma_buf) ++void drm_prime_remove_buf_handle(struct drm_prime_file_private *prime_fpriv, ++ uint32_t handle) + { + struct rb_node *rb; + +- rb = prime_fpriv->dmabufs.rb_node; ++ mutex_lock(&prime_fpriv->lock); ++ ++ rb = prime_fpriv->handles.rb_node; + while (rb) { + struct drm_prime_member *member; + +- member = rb_entry(rb, struct drm_prime_member, dmabuf_rb); +- if (member->dma_buf == dma_buf) { ++ member = rb_entry(rb, struct drm_prime_member, handle_rb); ++ if (member->handle == handle) { + rb_erase(&member->handle_rb, &prime_fpriv->handles); + rb_erase(&member->dmabuf_rb, &prime_fpriv->dmabufs); + +- dma_buf_put(dma_buf); ++ dma_buf_put(member->dma_buf); + kfree(member); +- return; +- } else if (member->dma_buf < dma_buf) { ++ break; ++ } else if (member->handle < handle) { + rb = rb->rb_right; + } else { + rb = rb->rb_left; + } + } ++ ++ mutex_unlock(&prime_fpriv->lock); + } + + void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv) +diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c +index 0c5638f5b72bc..91caf4523b34d 100644 +--- a/drivers/gpu/drm/i915/display/intel_bios.c ++++ b/drivers/gpu/drm/i915/display/intel_bios.c +@@ -478,6 +478,13 @@ init_bdb_block(struct drm_i915_private *i915, + + block_size = get_blocksize(block); + ++ /* ++ * Version number and new block size are considered ++ * part of the header for MIPI sequenece block v3+. 
++ */ ++ if (section_id == BDB_MIPI_SEQUENCE && *(const u8 *)block >= 3) ++ block_size += 5; ++ + entry = kzalloc(struct_size(entry, data, max(min_size, block_size) + 3), + GFP_KERNEL); + if (!entry) { +diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c +index 9feaf1a589f38..d213d8ad1ea53 100644 +--- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c ++++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c +@@ -671,6 +671,28 @@ intel_dp_prepare_link_train(struct intel_dp *intel_dp, + intel_dp_compute_rate(intel_dp, crtc_state->port_clock, + &link_bw, &rate_select); + ++ /* ++ * WaEdpLinkRateDataReload ++ * ++ * Parade PS8461E MUX (used on varius TGL+ laptops) needs ++ * to snoop the link rates reported by the sink when we ++ * use LINK_RATE_SET in order to operate in jitter cleaning ++ * mode (as opposed to redriver mode). Unfortunately it ++ * loses track of the snooped link rates when powered down, ++ * so we need to make it re-snoop often. Without this high ++ * link rates are not stable. ++ */ ++ if (!link_bw) { ++ struct intel_connector *connector = intel_dp->attached_connector; ++ __le16 sink_rates[DP_MAX_SUPPORTED_RATES]; ++ ++ drm_dbg_kms(&i915->drm, "[CONNECTOR:%d:%s] Reloading eDP link rates\n", ++ connector->base.base.id, connector->base.name); ++ ++ drm_dp_dpcd_read(&intel_dp->aux, DP_SUPPORTED_LINK_RATES, ++ sink_rates, sizeof(sink_rates)); ++ } ++ + if (link_bw) + drm_dbg_kms(&i915->drm, + "[ENCODER:%d:%s] Using LINK_BW_SET value %02x\n", +diff --git a/drivers/gpu/drm/i915/gt/intel_llc.c b/drivers/gpu/drm/i915/gt/intel_llc.c +index 40e2e28ee6c75..bf01780e7ea56 100644 +--- a/drivers/gpu/drm/i915/gt/intel_llc.c ++++ b/drivers/gpu/drm/i915/gt/intel_llc.c +@@ -12,6 +12,7 @@ + #include "intel_llc.h" + #include "intel_mchbar_regs.h" + #include "intel_pcode.h" ++#include "intel_rps.h" + + struct ia_constants { + unsigned int min_gpu_freq; +@@ -55,9 +56,6 @@ static bool get_ia_constants(struct intel_llc *llc, + if (!HAS_LLC(i915) || IS_DGFX(i915)) + return false; + +- if (rps->max_freq <= rps->min_freq) +- return false; +- + consts->max_ia_freq = cpu_max_MHz(); + + consts->min_ring_freq = +@@ -65,13 +63,8 @@ static bool get_ia_constants(struct intel_llc *llc, + /* convert DDR frequency from units of 266.6MHz to bandwidth */ + consts->min_ring_freq = mult_frac(consts->min_ring_freq, 8, 3); + +- consts->min_gpu_freq = rps->min_freq; +- consts->max_gpu_freq = rps->max_freq; +- if (GRAPHICS_VER(i915) >= 9) { +- /* Convert GT frequency to 50 HZ units */ +- consts->min_gpu_freq /= GEN9_FREQ_SCALER; +- consts->max_gpu_freq /= GEN9_FREQ_SCALER; +- } ++ consts->min_gpu_freq = intel_rps_get_min_raw_freq(rps); ++ consts->max_gpu_freq = intel_rps_get_max_raw_freq(rps); + + return true; + } +@@ -131,6 +124,12 @@ static void gen6_update_ring_freq(struct intel_llc *llc) + if (!get_ia_constants(llc, &consts)) + return; + ++ /* ++ * Although this is unlikely on any platform during initialization, ++ * let's ensure we don't get accidentally into infinite loop ++ */ ++ if (consts.max_gpu_freq <= consts.min_gpu_freq) ++ return; + /* + * For each potential GPU frequency, load a ring frequency we'd like + * to use for memory access. 
We do this by specifying the IA frequency +diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c +index 3476a11f294ce..7c068cc64c2fa 100644 +--- a/drivers/gpu/drm/i915/gt/intel_rps.c ++++ b/drivers/gpu/drm/i915/gt/intel_rps.c +@@ -2123,6 +2123,31 @@ u32 intel_rps_get_max_frequency(struct intel_rps *rps) + return intel_gpu_freq(rps, rps->max_freq_softlimit); + } + ++/** ++ * intel_rps_get_max_raw_freq - returns the max frequency in some raw format. ++ * @rps: the intel_rps structure ++ * ++ * Returns the max frequency in a raw format. In newer platforms raw is in ++ * units of 50 MHz. ++ */ ++u32 intel_rps_get_max_raw_freq(struct intel_rps *rps) ++{ ++ struct intel_guc_slpc *slpc = rps_to_slpc(rps); ++ u32 freq; ++ ++ if (rps_uses_slpc(rps)) { ++ return DIV_ROUND_CLOSEST(slpc->rp0_freq, ++ GT_FREQUENCY_MULTIPLIER); ++ } else { ++ freq = rps->max_freq; ++ if (GRAPHICS_VER(rps_to_i915(rps)) >= 9) { ++ /* Convert GT frequency to 50 MHz units */ ++ freq /= GEN9_FREQ_SCALER; ++ } ++ return freq; ++ } ++} ++ + u32 intel_rps_get_rp0_frequency(struct intel_rps *rps) + { + struct intel_guc_slpc *slpc = rps_to_slpc(rps); +@@ -2211,6 +2236,31 @@ u32 intel_rps_get_min_frequency(struct intel_rps *rps) + return intel_gpu_freq(rps, rps->min_freq_softlimit); + } + ++/** ++ * intel_rps_get_min_raw_freq - returns the min frequency in some raw format. ++ * @rps: the intel_rps structure ++ * ++ * Returns the min frequency in a raw format. In newer platforms raw is in ++ * units of 50 MHz. ++ */ ++u32 intel_rps_get_min_raw_freq(struct intel_rps *rps) ++{ ++ struct intel_guc_slpc *slpc = rps_to_slpc(rps); ++ u32 freq; ++ ++ if (rps_uses_slpc(rps)) { ++ return DIV_ROUND_CLOSEST(slpc->min_freq, ++ GT_FREQUENCY_MULTIPLIER); ++ } else { ++ freq = rps->min_freq; ++ if (GRAPHICS_VER(rps_to_i915(rps)) >= 9) { ++ /* Convert GT frequency to 50 MHz units */ ++ freq /= GEN9_FREQ_SCALER; ++ } ++ return freq; ++ } ++} ++ + static int set_min_freq(struct intel_rps *rps, u32 val) + { + int ret = 0; +diff --git a/drivers/gpu/drm/i915/gt/intel_rps.h b/drivers/gpu/drm/i915/gt/intel_rps.h +index 1e8d564913083..4509dfdc52e09 100644 +--- a/drivers/gpu/drm/i915/gt/intel_rps.h ++++ b/drivers/gpu/drm/i915/gt/intel_rps.h +@@ -37,8 +37,10 @@ u32 intel_rps_get_cagf(struct intel_rps *rps, u32 rpstat1); + u32 intel_rps_read_actual_frequency(struct intel_rps *rps); + u32 intel_rps_get_requested_frequency(struct intel_rps *rps); + u32 intel_rps_get_min_frequency(struct intel_rps *rps); ++u32 intel_rps_get_min_raw_freq(struct intel_rps *rps); + int intel_rps_set_min_frequency(struct intel_rps *rps, u32 val); + u32 intel_rps_get_max_frequency(struct intel_rps *rps); ++u32 intel_rps_get_max_raw_freq(struct intel_rps *rps); + int intel_rps_set_max_frequency(struct intel_rps *rps, u32 val); + u32 intel_rps_get_rp0_frequency(struct intel_rps *rps); + u32 intel_rps_get_rp1_frequency(struct intel_rps *rps); +diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c +index 429644d5ddc69..9fba16cb3f1e7 100644 +--- a/drivers/gpu/drm/radeon/radeon_device.c ++++ b/drivers/gpu/drm/radeon/radeon_device.c +@@ -1604,6 +1604,9 @@ int radeon_suspend_kms(struct drm_device *dev, bool suspend, + if (r) { + /* delay GPU reset to resume */ + radeon_fence_driver_force_completion(rdev, i); ++ } else { ++ /* finish executing delayed work */ ++ flush_delayed_work(&rdev->fence_drv[i].lockup_work); + } + } + +diff --git a/drivers/hwmon/asus-ec-sensors.c b/drivers/hwmon/asus-ec-sensors.c +index 
3633ab691662b..81e688975c6a7 100644 +--- a/drivers/hwmon/asus-ec-sensors.c ++++ b/drivers/hwmon/asus-ec-sensors.c +@@ -54,6 +54,10 @@ static char *mutex_path_override; + /* ACPI mutex for locking access to the EC for the firmware */ + #define ASUS_HW_ACCESS_MUTEX_ASMX "\\AMW0.ASMX" + ++#define ASUS_HW_ACCESS_MUTEX_RMTW_ASMX "\\RMTW.ASMX" ++ ++#define ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0 "\\_SB_.PCI0.SBRG.SIO1.MUT0" ++ + #define MAX_IDENTICAL_BOARD_VARIATIONS 3 + + /* Moniker for the ACPI global lock (':' is not allowed in ASL identifiers) */ +@@ -119,6 +123,18 @@ enum ec_sensors { + ec_sensor_temp_water_in, + /* "Water_Out" temperature sensor reading [℃] */ + ec_sensor_temp_water_out, ++ /* "Water_Block_In" temperature sensor reading [℃] */ ++ ec_sensor_temp_water_block_in, ++ /* "Water_Block_Out" temperature sensor reading [℃] */ ++ ec_sensor_temp_water_block_out, ++ /* "T_sensor_2" temperature sensor reading [℃] */ ++ ec_sensor_temp_t_sensor_2, ++ /* "Extra_1" temperature sensor reading [℃] */ ++ ec_sensor_temp_sensor_extra_1, ++ /* "Extra_2" temperature sensor reading [℃] */ ++ ec_sensor_temp_sensor_extra_2, ++ /* "Extra_3" temperature sensor reading [℃] */ ++ ec_sensor_temp_sensor_extra_3, + }; + + #define SENSOR_TEMP_CHIPSET BIT(ec_sensor_temp_chipset) +@@ -134,11 +150,19 @@ enum ec_sensors { + #define SENSOR_CURR_CPU BIT(ec_sensor_curr_cpu) + #define SENSOR_TEMP_WATER_IN BIT(ec_sensor_temp_water_in) + #define SENSOR_TEMP_WATER_OUT BIT(ec_sensor_temp_water_out) ++#define SENSOR_TEMP_WATER_BLOCK_IN BIT(ec_sensor_temp_water_block_in) ++#define SENSOR_TEMP_WATER_BLOCK_OUT BIT(ec_sensor_temp_water_block_out) ++#define SENSOR_TEMP_T_SENSOR_2 BIT(ec_sensor_temp_t_sensor_2) ++#define SENSOR_TEMP_SENSOR_EXTRA_1 BIT(ec_sensor_temp_sensor_extra_1) ++#define SENSOR_TEMP_SENSOR_EXTRA_2 BIT(ec_sensor_temp_sensor_extra_2) ++#define SENSOR_TEMP_SENSOR_EXTRA_3 BIT(ec_sensor_temp_sensor_extra_3) + + enum board_family { + family_unknown, + family_amd_400_series, + family_amd_500_series, ++ family_intel_300_series, ++ family_intel_600_series + }; + + /* All the known sensors for ASUS EC controllers */ +@@ -195,15 +219,54 @@ static const struct ec_sensor_info sensors_family_amd_500[] = { + EC_SENSOR("Water_In", hwmon_temp, 1, 0x01, 0x00), + [ec_sensor_temp_water_out] = + EC_SENSOR("Water_Out", hwmon_temp, 1, 0x01, 0x01), ++ [ec_sensor_temp_water_block_in] = ++ EC_SENSOR("Water_Block_In", hwmon_temp, 1, 0x01, 0x02), ++ [ec_sensor_temp_water_block_out] = ++ EC_SENSOR("Water_Block_Out", hwmon_temp, 1, 0x01, 0x03), ++ [ec_sensor_temp_sensor_extra_1] = ++ EC_SENSOR("Extra_1", hwmon_temp, 1, 0x01, 0x09), ++ [ec_sensor_temp_t_sensor_2] = ++ EC_SENSOR("T_sensor_2", hwmon_temp, 1, 0x01, 0x0a), ++ [ec_sensor_temp_sensor_extra_2] = ++ EC_SENSOR("Extra_2", hwmon_temp, 1, 0x01, 0x0b), ++ [ec_sensor_temp_sensor_extra_3] = ++ EC_SENSOR("Extra_3", hwmon_temp, 1, 0x01, 0x0c), ++}; ++ ++static const struct ec_sensor_info sensors_family_intel_300[] = { ++ [ec_sensor_temp_chipset] = ++ EC_SENSOR("Chipset", hwmon_temp, 1, 0x00, 0x3a), ++ [ec_sensor_temp_cpu] = EC_SENSOR("CPU", hwmon_temp, 1, 0x00, 0x3b), ++ [ec_sensor_temp_mb] = ++ EC_SENSOR("Motherboard", hwmon_temp, 1, 0x00, 0x3c), ++ [ec_sensor_temp_t_sensor] = ++ EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x00, 0x3d), ++ [ec_sensor_temp_vrm] = EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x3e), ++ [ec_sensor_fan_cpu_opt] = ++ EC_SENSOR("CPU_Opt", hwmon_fan, 2, 0x00, 0xb0), ++ [ec_sensor_fan_vrm_hs] = EC_SENSOR("VRM HS", hwmon_fan, 2, 0x00, 0xb2), ++ [ec_sensor_fan_water_flow] 
= ++ EC_SENSOR("Water_Flow", hwmon_fan, 2, 0x00, 0xbc), ++ [ec_sensor_temp_water_in] = ++ EC_SENSOR("Water_In", hwmon_temp, 1, 0x01, 0x00), ++ [ec_sensor_temp_water_out] = ++ EC_SENSOR("Water_Out", hwmon_temp, 1, 0x01, 0x01), ++}; ++ ++static const struct ec_sensor_info sensors_family_intel_600[] = { ++ [ec_sensor_temp_t_sensor] = ++ EC_SENSOR("T_Sensor", hwmon_temp, 1, 0x00, 0x3d), ++ [ec_sensor_temp_vrm] = EC_SENSOR("VRM", hwmon_temp, 1, 0x00, 0x3e), + }; + + /* Shortcuts for common combinations */ + #define SENSOR_SET_TEMP_CHIPSET_CPU_MB \ + (SENSOR_TEMP_CHIPSET | SENSOR_TEMP_CPU | SENSOR_TEMP_MB) + #define SENSOR_SET_TEMP_WATER (SENSOR_TEMP_WATER_IN | SENSOR_TEMP_WATER_OUT) ++#define SENSOR_SET_WATER_BLOCK \ ++ (SENSOR_TEMP_WATER_BLOCK_IN | SENSOR_TEMP_WATER_BLOCK_OUT) + + struct ec_board_info { +- const char *board_names[MAX_IDENTICAL_BOARD_VARIATIONS]; + unsigned long sensors; + /* + * Defines which mutex to use for guarding access to the state and the +@@ -216,121 +279,194 @@ struct ec_board_info { + enum board_family family; + }; + +-static const struct ec_board_info board_info[] = { +- { +- .board_names = {"PRIME X470-PRO"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | +- SENSOR_FAN_CPU_OPT | +- SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, +- .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, +- .family = family_amd_400_series, +- }, +- { +- .board_names = {"PRIME X570-PRO"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | +- SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ProArt X570-CREATOR WIFI"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | +- SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT | +- SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, +- }, +- { +- .board_names = {"Pro WS X570-ACE"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | +- SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET | +- SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG CROSSHAIR VIII DARK HERO"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | +- SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | +- SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW | +- SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = { +- "ROG CROSSHAIR VIII FORMULA", +- "ROG CROSSHAIR VIII HERO", +- "ROG CROSSHAIR VIII HERO (WI-FI)", +- }, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | +- SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | +- SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET | +- SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | +- SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG CROSSHAIR VIII IMPACT"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | +- SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | +- SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG STRIX B550-E GAMING"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | +- SENSOR_FAN_CPU_OPT, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG STRIX B550-I GAMING"}, +- .sensors = 
SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | +- SENSOR_FAN_VRM_HS | SENSOR_CURR_CPU | +- SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG STRIX X570-E GAMING"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | +- SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | +- SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG STRIX X570-E GAMING WIFI II"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_CURR_CPU | +- SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG STRIX X570-F GAMING"}, +- .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | +- SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- { +- .board_names = {"ROG STRIX X570-I GAMING"}, +- .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_FAN_VRM_HS | +- SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | +- SENSOR_IN_CPU_CORE, +- .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, +- .family = family_amd_500_series, +- }, +- {} ++static const struct ec_board_info board_info_prime_x470_pro = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | ++ SENSOR_FAN_CPU_OPT | ++ SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, ++ .mutex_path = ACPI_GLOBAL_LOCK_PSEUDO_PATH, ++ .family = family_amd_400_series, ++}; ++ ++static const struct ec_board_info board_info_prime_x570_pro = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | ++ SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_pro_art_x570_creator_wifi = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | ++ SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CPU_OPT | ++ SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_pro_ws_x570_ace = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_VRM | ++ SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET | ++ SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_crosshair_viii_dark_hero = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | ++ SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | ++ SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW | ++ SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_crosshair_viii_hero = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | ++ SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | ++ SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET | ++ SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | ++ SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_maximus_xi_hero = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | ++ SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | ++ SENSOR_FAN_CPU_OPT | SENSOR_FAN_WATER_FLOW, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_intel_300_series, ++}; ++ ++static const struct 
ec_board_info board_info_crosshair_viii_impact = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | ++ SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | ++ SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_b550_e_gaming = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | ++ SENSOR_FAN_CPU_OPT, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_b550_i_gaming = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | ++ SENSOR_FAN_VRM_HS | SENSOR_CURR_CPU | ++ SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_x570_e_gaming = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM | ++ SENSOR_FAN_CHIPSET | SENSOR_CURR_CPU | ++ SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_x570_e_gaming_wifi_ii = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_CURR_CPU | ++ SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_x570_f_gaming = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | ++ SENSOR_TEMP_T_SENSOR | SENSOR_FAN_CHIPSET, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_x570_i_gaming = { ++ .sensors = SENSOR_TEMP_CHIPSET | SENSOR_TEMP_VRM | ++ SENSOR_TEMP_T_SENSOR | ++ SENSOR_FAN_VRM_HS | SENSOR_FAN_CHIPSET | ++ SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_ASMX, ++ .family = family_amd_500_series, ++}; ++ ++static const struct ec_board_info board_info_strix_z690_a_gaming_wifi_d4 = { ++ .sensors = SENSOR_TEMP_T_SENSOR | SENSOR_TEMP_VRM, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_RMTW_ASMX, ++ .family = family_intel_600_series, ++}; ++ ++static const struct ec_board_info board_info_zenith_ii_extreme = { ++ .sensors = SENSOR_SET_TEMP_CHIPSET_CPU_MB | SENSOR_TEMP_T_SENSOR | ++ SENSOR_TEMP_VRM | SENSOR_SET_TEMP_WATER | ++ SENSOR_FAN_CPU_OPT | SENSOR_FAN_CHIPSET | SENSOR_FAN_VRM_HS | ++ SENSOR_FAN_WATER_FLOW | SENSOR_CURR_CPU | SENSOR_IN_CPU_CORE | ++ SENSOR_SET_WATER_BLOCK | ++ SENSOR_TEMP_T_SENSOR_2 | SENSOR_TEMP_SENSOR_EXTRA_1 | ++ SENSOR_TEMP_SENSOR_EXTRA_2 | SENSOR_TEMP_SENSOR_EXTRA_3, ++ .mutex_path = ASUS_HW_ACCESS_MUTEX_SB_PCI0_SBRG_SIO1_MUT0, ++ .family = family_amd_500_series, ++}; ++ ++#define DMI_EXACT_MATCH_ASUS_BOARD_NAME(name, board_info) \ ++ { \ ++ .matches = { \ ++ DMI_EXACT_MATCH(DMI_BOARD_VENDOR, \ ++ "ASUSTeK COMPUTER INC."), \ ++ DMI_EXACT_MATCH(DMI_BOARD_NAME, name), \ ++ }, \ ++ .driver_data = (void *)board_info, \ ++ } ++ ++static const struct dmi_system_id dmi_table[] = { ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME X470-PRO", ++ &board_info_prime_x470_pro), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("PRIME X570-PRO", ++ &board_info_prime_x570_pro), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ProArt X570-CREATOR WIFI", ++ &board_info_pro_art_x570_creator_wifi), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("Pro WS X570-ACE", ++ &board_info_pro_ws_x570_ace), ++ 
DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII DARK HERO", ++ &board_info_crosshair_viii_dark_hero), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII FORMULA", ++ &board_info_crosshair_viii_hero), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII HERO", ++ &board_info_crosshair_viii_hero), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII HERO (WI-FI)", ++ &board_info_crosshair_viii_hero), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS XI HERO", ++ &board_info_maximus_xi_hero), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG MAXIMUS XI HERO (WI-FI)", ++ &board_info_maximus_xi_hero), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG CROSSHAIR VIII IMPACT", ++ &board_info_crosshair_viii_impact), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-E GAMING", ++ &board_info_strix_b550_e_gaming), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX B550-I GAMING", ++ &board_info_strix_b550_i_gaming), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING", ++ &board_info_strix_x570_e_gaming), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-E GAMING WIFI II", ++ &board_info_strix_x570_e_gaming_wifi_ii), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-F GAMING", ++ &board_info_strix_x570_f_gaming), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX X570-I GAMING", ++ &board_info_strix_x570_i_gaming), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG STRIX Z690-A GAMING WIFI D4", ++ &board_info_strix_z690_a_gaming_wifi_d4), ++ DMI_EXACT_MATCH_ASUS_BOARD_NAME("ROG ZENITH II EXTREME", ++ &board_info_zenith_ii_extreme), ++ {}, + }; + + struct ec_sensor { +@@ -441,12 +577,12 @@ static int find_ec_sensor_index(const struct ec_sensors_data *ec, + return -ENOENT; + } + +-static int __init bank_compare(const void *a, const void *b) ++static int bank_compare(const void *a, const void *b) + { + return *((const s8 *)a) - *((const s8 *)b); + } + +-static void __init setup_sensor_data(struct ec_sensors_data *ec) ++static void setup_sensor_data(struct ec_sensors_data *ec) + { + struct ec_sensor *s = ec->sensors; + bool bank_found; +@@ -478,7 +614,7 @@ static void __init setup_sensor_data(struct ec_sensors_data *ec) + sort(ec->banks, ec->nr_banks, 1, bank_compare, NULL); + } + +-static void __init fill_ec_registers(struct ec_sensors_data *ec) ++static void fill_ec_registers(struct ec_sensors_data *ec) + { + const struct ec_sensor_info *si; + unsigned int i, j, register_idx = 0; +@@ -493,7 +629,7 @@ static void __init fill_ec_registers(struct ec_sensors_data *ec) + } + } + +-static int __init setup_lock_data(struct device *dev) ++static int setup_lock_data(struct device *dev) + { + const char *mutex_path; + int status; +@@ -716,7 +852,7 @@ static umode_t asus_ec_hwmon_is_visible(const void *drvdata, + return find_ec_sensor_index(state, type, channel) >= 0 ? 
S_IRUGO : 0; + } + +-static int __init ++static int + asus_ec_hwmon_add_chan_info(struct hwmon_channel_info *asus_ec_hwmon_chan, + struct device *dev, int num, + enum hwmon_sensor_types type, u32 config) +@@ -745,27 +881,15 @@ static struct hwmon_chip_info asus_ec_chip_info = { + .ops = &asus_ec_hwmon_ops, + }; + +-static const struct ec_board_info * __init get_board_info(void) ++static const struct ec_board_info *get_board_info(void) + { +- const char *dmi_board_vendor = dmi_get_system_info(DMI_BOARD_VENDOR); +- const char *dmi_board_name = dmi_get_system_info(DMI_BOARD_NAME); +- const struct ec_board_info *board; +- +- if (!dmi_board_vendor || !dmi_board_name || +- strcasecmp(dmi_board_vendor, "ASUSTeK COMPUTER INC.")) +- return NULL; +- +- for (board = board_info; board->sensors; board++) { +- if (match_string(board->board_names, +- MAX_IDENTICAL_BOARD_VARIATIONS, +- dmi_board_name) >= 0) +- return board; +- } ++ const struct dmi_system_id *dmi_entry; + +- return NULL; ++ dmi_entry = dmi_first_match(dmi_table); ++ return dmi_entry ? dmi_entry->driver_data : NULL; + } + +-static int __init asus_ec_probe(struct platform_device *pdev) ++static int asus_ec_probe(struct platform_device *pdev) + { + const struct hwmon_channel_info **ptr_asus_ec_ci; + int nr_count[hwmon_max] = { 0 }, nr_types = 0; +@@ -799,6 +923,12 @@ static int __init asus_ec_probe(struct platform_device *pdev) + case family_amd_500_series: + ec_data->sensors_info = sensors_family_amd_500; + break; ++ case family_intel_300_series: ++ ec_data->sensors_info = sensors_family_intel_300; ++ break; ++ case family_intel_600_series: ++ ec_data->sensors_info = sensors_family_intel_600; ++ break; + default: + dev_err(dev, "Unknown board family: %d", + ec_data->board_info->family); +@@ -868,29 +998,37 @@ static int __init asus_ec_probe(struct platform_device *pdev) + return PTR_ERR_OR_ZERO(hwdev); + } + +- +-static const struct acpi_device_id acpi_ec_ids[] = { +- /* Embedded Controller Device */ +- { "PNP0C09", 0 }, +- {} +-}; ++MODULE_DEVICE_TABLE(dmi, dmi_table); + + static struct platform_driver asus_ec_sensors_platform_driver = { + .driver = { + .name = "asus-ec-sensors", +- .acpi_match_table = acpi_ec_ids, + }, ++ .probe = asus_ec_probe, + }; + +-MODULE_DEVICE_TABLE(acpi, acpi_ec_ids); +-/* +- * we use module_platform_driver_probe() rather than module_platform_driver() +- * because the probe function (and its dependants) are marked with __init, which +- * means we can't put it into the .probe member of the platform_driver struct +- * above, and we can't mark the asus_ec_sensors_platform_driver object as __init +- * because the object is referenced from the module exit code. 
+- */ +-module_platform_driver_probe(asus_ec_sensors_platform_driver, asus_ec_probe); ++static struct platform_device *asus_ec_sensors_platform_device; ++ ++static int __init asus_ec_init(void) ++{ ++ asus_ec_sensors_platform_device = ++ platform_create_bundle(&asus_ec_sensors_platform_driver, ++ asus_ec_probe, NULL, 0, NULL, 0); ++ ++ if (IS_ERR(asus_ec_sensors_platform_device)) ++ return PTR_ERR(asus_ec_sensors_platform_device); ++ ++ return 0; ++} ++ ++static void __exit asus_ec_exit(void) ++{ ++ platform_device_unregister(asus_ec_sensors_platform_device); ++ platform_driver_unregister(&asus_ec_sensors_platform_driver); ++} ++ ++module_init(asus_ec_init); ++module_exit(asus_ec_exit); + + module_param_named(mutex_path, mutex_path_override, charp, 0); + MODULE_PARM_DESC(mutex_path, +diff --git a/drivers/hwmon/mr75203.c b/drivers/hwmon/mr75203.c +index 26278b0f17a98..9259779cc2dff 100644 +--- a/drivers/hwmon/mr75203.c ++++ b/drivers/hwmon/mr75203.c +@@ -68,8 +68,9 @@ + + /* VM Individual Macro Register */ + #define VM_COM_REG_SIZE 0x200 +-#define VM_SDIF_DONE(n) (VM_COM_REG_SIZE + 0x34 + 0x200 * (n)) +-#define VM_SDIF_DATA(n) (VM_COM_REG_SIZE + 0x40 + 0x200 * (n)) ++#define VM_SDIF_DONE(vm) (VM_COM_REG_SIZE + 0x34 + 0x200 * (vm)) ++#define VM_SDIF_DATA(vm, ch) \ ++ (VM_COM_REG_SIZE + 0x40 + 0x200 * (vm) + 0x4 * (ch)) + + /* SDA Slave Register */ + #define IP_CTRL 0x00 +@@ -115,6 +116,7 @@ struct pvt_device { + u32 t_num; + u32 p_num; + u32 v_num; ++ u32 c_num; + u32 ip_freq; + u8 *vm_idx; + }; +@@ -178,14 +180,15 @@ static int pvt_read_in(struct device *dev, u32 attr, int channel, long *val) + { + struct pvt_device *pvt = dev_get_drvdata(dev); + struct regmap *v_map = pvt->v_map; ++ u8 vm_idx, ch_idx; + u32 n, stat; +- u8 vm_idx; + int ret; + +- if (channel >= pvt->v_num) ++ if (channel >= pvt->v_num * pvt->c_num) + return -EINVAL; + +- vm_idx = pvt->vm_idx[channel]; ++ vm_idx = pvt->vm_idx[channel / pvt->c_num]; ++ ch_idx = channel % pvt->c_num; + + switch (attr) { + case hwmon_in_input: +@@ -196,13 +199,23 @@ static int pvt_read_in(struct device *dev, u32 attr, int channel, long *val) + if (ret) + return ret; + +- ret = regmap_read(v_map, VM_SDIF_DATA(vm_idx), &n); ++ ret = regmap_read(v_map, VM_SDIF_DATA(vm_idx, ch_idx), &n); + if(ret < 0) + return ret; + + n &= SAMPLE_DATA_MSK; +- /* Convert the N bitstream count into voltage */ +- *val = (PVT_N_CONST * n - PVT_R_CONST) >> PVT_CONV_BITS; ++ /* ++ * Convert the N bitstream count into voltage. ++ * To support negative voltage calculation for 64bit machines ++ * n must be cast to long, since n and *val differ both in ++ * signedness and in size. ++ * Division is used instead of right shift, because for signed ++ * numbers, the sign bit is used to fill the vacated bit ++ * positions, and if the number is negative, 1 is used. ++ * BIT(x) may not be used instead of (1 << x) because it's ++ * unsigned. 
++ */ ++ *val = (PVT_N_CONST * (long)n - PVT_R_CONST) / (1 << PVT_CONV_BITS); + + return 0; + default: +@@ -375,6 +388,19 @@ static int pvt_init(struct pvt_device *pvt) + if (ret) + return ret; + ++ val = (BIT(pvt->c_num) - 1) | VM_CH_INIT | ++ IP_POLL << SDIF_ADDR_SFT | SDIF_WRN_W | SDIF_PROG; ++ ret = regmap_write(v_map, SDIF_W, val); ++ if (ret < 0) ++ return ret; ++ ++ ret = regmap_read_poll_timeout(v_map, SDIF_STAT, ++ val, !(val & SDIF_BUSY), ++ PVT_POLL_DELAY_US, ++ PVT_POLL_TIMEOUT_US); ++ if (ret) ++ return ret; ++ + val = CFG1_VOL_MEAS_MODE | CFG1_PARALLEL_OUT | + CFG1_14_BIT | IP_CFG << SDIF_ADDR_SFT | + SDIF_WRN_W | SDIF_PROG; +@@ -489,8 +515,8 @@ static int pvt_reset_control_deassert(struct device *dev, struct pvt_device *pvt + + static int mr75203_probe(struct platform_device *pdev) + { ++ u32 ts_num, vm_num, pd_num, ch_num, val, index, i; + const struct hwmon_channel_info **pvt_info; +- u32 ts_num, vm_num, pd_num, val, index, i; + struct device *dev = &pdev->dev; + u32 *temp_config, *in_config; + struct device *hwmon_dev; +@@ -531,9 +557,11 @@ static int mr75203_probe(struct platform_device *pdev) + ts_num = (val & TS_NUM_MSK) >> TS_NUM_SFT; + pd_num = (val & PD_NUM_MSK) >> PD_NUM_SFT; + vm_num = (val & VM_NUM_MSK) >> VM_NUM_SFT; ++ ch_num = (val & CH_NUM_MSK) >> CH_NUM_SFT; + pvt->t_num = ts_num; + pvt->p_num = pd_num; + pvt->v_num = vm_num; ++ pvt->c_num = ch_num; + val = 0; + if (ts_num) + val++; +@@ -570,7 +598,7 @@ static int mr75203_probe(struct platform_device *pdev) + } + + if (vm_num) { +- u32 num = vm_num; ++ u32 total_ch; + + ret = pvt_get_regmap(pdev, "vm", pvt); + if (ret) +@@ -584,30 +612,30 @@ static int mr75203_probe(struct platform_device *pdev) + ret = device_property_read_u8_array(dev, "intel,vm-map", + pvt->vm_idx, vm_num); + if (ret) { +- num = 0; ++ /* ++ * Incase intel,vm-map property is not defined, we ++ * assume incremental channel numbers. ++ */ ++ for (i = 0; i < vm_num; i++) ++ pvt->vm_idx[i] = i; + } else { + for (i = 0; i < vm_num; i++) + if (pvt->vm_idx[i] >= vm_num || + pvt->vm_idx[i] == 0xff) { +- num = i; ++ pvt->v_num = i; ++ vm_num = i; + break; + } + } + +- /* +- * Incase intel,vm-map property is not defined, we assume +- * incremental channel numbers. 
+- */ +- for (i = num; i < vm_num; i++) +- pvt->vm_idx[i] = i; +- +- in_config = devm_kcalloc(dev, num + 1, ++ total_ch = ch_num * vm_num; ++ in_config = devm_kcalloc(dev, total_ch + 1, + sizeof(*in_config), GFP_KERNEL); + if (!in_config) + return -ENOMEM; + +- memset32(in_config, HWMON_I_INPUT, num); +- in_config[num] = 0; ++ memset32(in_config, HWMON_I_INPUT, total_ch); ++ in_config[total_ch] = 0; + pvt_in.config = in_config; + + pvt_info[index++] = &pvt_in; +diff --git a/drivers/hwmon/tps23861.c b/drivers/hwmon/tps23861.c +index 8bd6435c13e82..2148fd543bb4b 100644 +--- a/drivers/hwmon/tps23861.c ++++ b/drivers/hwmon/tps23861.c +@@ -489,18 +489,20 @@ static char *tps23861_port_poe_plus_status(struct tps23861_data *data, int port) + + static int tps23861_port_resistance(struct tps23861_data *data, int port) + { +- u16 regval; ++ unsigned int raw_val; ++ __le16 regval; + + regmap_bulk_read(data->regmap, + PORT_1_RESISTANCE_LSB + PORT_N_RESISTANCE_LSB_OFFSET * (port - 1), + ®val, + 2); + +- switch (FIELD_GET(PORT_RESISTANCE_RSN_MASK, regval)) { ++ raw_val = le16_to_cpu(regval); ++ switch (FIELD_GET(PORT_RESISTANCE_RSN_MASK, raw_val)) { + case PORT_RESISTANCE_RSN_OTHER: +- return (FIELD_GET(PORT_RESISTANCE_MASK, regval) * RESISTANCE_LSB) / 10000; ++ return (FIELD_GET(PORT_RESISTANCE_MASK, raw_val) * RESISTANCE_LSB) / 10000; + case PORT_RESISTANCE_RSN_LOW: +- return (FIELD_GET(PORT_RESISTANCE_MASK, regval) * RESISTANCE_LSB_LOW) / 10000; ++ return (FIELD_GET(PORT_RESISTANCE_MASK, raw_val) * RESISTANCE_LSB_LOW) / 10000; + case PORT_RESISTANCE_RSN_SHORT: + case PORT_RESISTANCE_RSN_OPEN: + default: +diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c +index fabca5e51e3d4..4dd133eccfdfb 100644 +--- a/drivers/infiniband/core/cma.c ++++ b/drivers/infiniband/core/cma.c +@@ -1719,8 +1719,8 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, + } + + if (!validate_net_dev(*net_dev, +- (struct sockaddr *)&req->listen_addr_storage, +- (struct sockaddr *)&req->src_addr_storage)) { ++ (struct sockaddr *)&req->src_addr_storage, ++ (struct sockaddr *)&req->listen_addr_storage)) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } +diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c +index 186ed8859920c..d39e16c211e8a 100644 +--- a/drivers/infiniband/core/umem_odp.c ++++ b/drivers/infiniband/core/umem_odp.c +@@ -462,7 +462,7 @@ retry: + mutex_unlock(&umem_odp->umem_mutex); + + out_put_mm: +- mmput(owning_mm); ++ mmput_async(owning_mm); + out_put_task: + if (owning_process) + put_task_struct(owning_process); +diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h +index 2855e9ad4b328..1df076e70e293 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_device.h ++++ b/drivers/infiniband/hw/hns/hns_roce_device.h +@@ -730,7 +730,6 @@ struct hns_roce_caps { + u32 num_qps; + u32 num_pi_qps; + u32 reserved_qps; +- int num_qpc_timer; + u32 num_srqs; + u32 max_wqes; + u32 max_srq_wrs; +diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +index b354caeaa9b29..49edff989f1f1 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c ++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +@@ -1941,7 +1941,7 @@ static void set_default_caps(struct hns_roce_dev *hr_dev) + + caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM; + caps->num_pds = HNS_ROCE_V2_MAX_PD_NUM; +- caps->num_qpc_timer = HNS_ROCE_V2_MAX_QPC_TIMER_NUM; ++ caps->qpc_timer_bt_num = HNS_ROCE_V2_MAX_QPC_TIMER_BT_NUM; + 
caps->cqc_timer_bt_num = HNS_ROCE_V2_MAX_CQC_TIMER_BT_NUM; + + caps->max_qp_init_rdma = HNS_ROCE_V2_MAX_QP_INIT_RDMA; +@@ -2237,7 +2237,6 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) + caps->max_rq_sg = le16_to_cpu(resp_a->max_rq_sg); + caps->max_rq_sg = roundup_pow_of_two(caps->max_rq_sg); + caps->max_extend_sg = le32_to_cpu(resp_a->max_extend_sg); +- caps->num_qpc_timer = le16_to_cpu(resp_a->num_qpc_timer); + caps->max_srq_sges = le16_to_cpu(resp_a->max_srq_sges); + caps->max_srq_sges = roundup_pow_of_two(caps->max_srq_sges); + caps->num_aeq_vectors = resp_a->num_aeq_vectors; +diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +index 7ffb7824d2689..e4b640caee1b7 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h ++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +@@ -36,11 +36,11 @@ + #include + + #define HNS_ROCE_V2_MAX_QP_NUM 0x1000 +-#define HNS_ROCE_V2_MAX_QPC_TIMER_NUM 0x200 + #define HNS_ROCE_V2_MAX_WQE_NUM 0x8000 + #define HNS_ROCE_V2_MAX_SRQ_WR 0x8000 + #define HNS_ROCE_V2_MAX_SRQ_SGE 64 + #define HNS_ROCE_V2_MAX_CQ_NUM 0x100000 ++#define HNS_ROCE_V2_MAX_QPC_TIMER_BT_NUM 0x100 + #define HNS_ROCE_V2_MAX_CQC_TIMER_BT_NUM 0x100 + #define HNS_ROCE_V2_MAX_SRQ_NUM 0x100000 + #define HNS_ROCE_V2_MAX_CQE_NUM 0x400000 +@@ -83,7 +83,7 @@ + + #define HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ PAGE_SIZE + #define HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ PAGE_SIZE +-#define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFFF000 ++#define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFF000 + #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 + #define HNS_ROCE_INVALID_LKEY 0x0 + #define HNS_ROCE_INVALID_SGE_LENGTH 0x80000000 +diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c +index c8af4ebd7cbd3..4ccb217b2841d 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_main.c ++++ b/drivers/infiniband/hw/hns/hns_roce_main.c +@@ -725,7 +725,7 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) + ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qpc_timer_table, + HEM_TYPE_QPC_TIMER, + hr_dev->caps.qpc_timer_entry_sz, +- hr_dev->caps.num_qpc_timer, 1); ++ hr_dev->caps.qpc_timer_bt_num, 1); + if (ret) { + dev_err(dev, + "Failed to init QPC timer memory, aborting.\n"); +diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c +index 48d3616a6d71d..7bee7f6c5e702 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_qp.c ++++ b/drivers/infiniband/hw/hns/hns_roce_qp.c +@@ -462,11 +462,8 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, + hr_qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge) + + hr_qp->rq.rsv_sge); + +- if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE) +- hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz); +- else +- hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz * +- hr_qp->rq.max_gs); ++ hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz * ++ hr_qp->rq.max_gs); + + hr_qp->rq.wqe_cnt = cnt; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE && +diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c +index daeab5daed5bc..d003ad864ee44 100644 +--- a/drivers/infiniband/hw/irdma/uk.c ++++ b/drivers/infiniband/hw/irdma/uk.c +@@ -1005,6 +1005,7 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, + int ret_code; + bool move_cq_head = true; + u8 polarity; ++ u8 op_type; + bool ext_valid; + __le64 *ext_cqe; + +@@ -1187,7 +1188,6 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, + do { + __le64 *sw_wqe; 
+ u64 wqe_qword; +- u8 op_type; + u32 tail; + + tail = qp->sq_ring.tail; +@@ -1204,6 +1204,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, + break; + } + } while (1); ++ if (op_type == IRDMA_OP_TYPE_BIND_MW && info->minor_err == FLUSH_PROT_ERR) ++ info->minor_err = FLUSH_MW_BIND_ERR; + qp->sq_flush_seen = true; + if (!IRDMA_RING_MORE_WORK(qp->sq_ring)) + qp->sq_flush_complete = true; +diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c +index ab3c5208a1231..f4d774451160d 100644 +--- a/drivers/infiniband/hw/irdma/utils.c ++++ b/drivers/infiniband/hw/irdma/utils.c +@@ -590,11 +590,14 @@ static int irdma_wait_event(struct irdma_pci_f *rf, + cqp_error = cqp_request->compl_info.error; + if (cqp_error) { + err_code = -EIO; +- if (cqp_request->compl_info.maj_err_code == 0xFFFF && +- cqp_request->compl_info.min_err_code == 0x8029) { +- if (!rf->reset) { +- rf->reset = true; +- rf->gen_ops.request_reset(rf); ++ if (cqp_request->compl_info.maj_err_code == 0xFFFF) { ++ if (cqp_request->compl_info.min_err_code == 0x8002) ++ err_code = -EBUSY; ++ else if (cqp_request->compl_info.min_err_code == 0x8029) { ++ if (!rf->reset) { ++ rf->reset = true; ++ rf->gen_ops.request_reset(rf); ++ } + } + } + } +@@ -2597,7 +2600,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) + spin_unlock_irqrestore(&iwqp->lock, flags2); + spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + if (compl_generated) +- irdma_comp_handler(iwqp->iwrcq); ++ irdma_comp_handler(iwqp->iwscq); + } else { + spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, +diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c +index 227a799385d1d..ab73d1715f991 100644 +--- a/drivers/infiniband/hw/irdma/verbs.c ++++ b/drivers/infiniband/hw/irdma/verbs.c +@@ -39,15 +39,18 @@ static int irdma_query_device(struct ib_device *ibdev, + props->max_send_sge = hw_attrs->uk_attrs.max_hw_wq_frags; + props->max_recv_sge = hw_attrs->uk_attrs.max_hw_wq_frags; + props->max_cq = rf->max_cq - rf->used_cqs; +- props->max_cqe = rf->max_cqe; ++ props->max_cqe = rf->max_cqe - 1; + props->max_mr = rf->max_mr - rf->used_mrs; + props->max_mw = props->max_mr; + props->max_pd = rf->max_pd - rf->used_pds; + props->max_sge_rd = hw_attrs->uk_attrs.max_hw_read_sges; + props->max_qp_rd_atom = hw_attrs->max_hw_ird; + props->max_qp_init_rd_atom = hw_attrs->max_hw_ord; +- if (rdma_protocol_roce(ibdev, 1)) ++ if (rdma_protocol_roce(ibdev, 1)) { ++ props->device_cap_flags |= IB_DEVICE_RC_RNR_NAK_GEN; + props->max_pkeys = IRDMA_PKEY_TBL_SZ; ++ } ++ + props->max_ah = rf->max_ah; + props->max_mcast_grp = rf->max_mcg; + props->max_mcast_qp_attach = IRDMA_MAX_MGS_PER_CTX; +@@ -3001,6 +3004,7 @@ static int irdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) + struct irdma_pble_alloc *palloc = &iwpbl->pble_alloc; + struct irdma_cqp_request *cqp_request; + struct cqp_cmds_info *cqp_info; ++ int status; + + if (iwmr->type != IRDMA_MEMREG_TYPE_MEM) { + if (iwmr->region) { +@@ -3031,8 +3035,11 @@ static int irdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) + cqp_info->post_sq = 1; + cqp_info->in.u.dealloc_stag.dev = &iwdev->rf->sc_dev; + cqp_info->in.u.dealloc_stag.scratch = (uintptr_t)cqp_request; +- irdma_handle_cqp_op(iwdev->rf, cqp_request); ++ status = irdma_handle_cqp_op(iwdev->rf, cqp_request); + irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); ++ if (status) ++ return status; ++ + irdma_free_stag(iwdev, iwmr->stag); + 
done: + if (iwpbl->pbl_allocated) +diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c +index 293ed709e5ed5..b4dc52392275b 100644 +--- a/drivers/infiniband/hw/mlx5/mad.c ++++ b/drivers/infiniband/hw/mlx5/mad.c +@@ -166,6 +166,12 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, + mdev = dev->mdev; + mdev_port_num = 1; + } ++ if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1) { ++ /* set local port to one for Function-Per-Port HCA. */ ++ mdev = dev->mdev; ++ mdev_port_num = 1; ++ } ++ + /* Declaring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; +diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c +index 1f4e60257700e..7d47b521070b1 100644 +--- a/drivers/infiniband/sw/siw/siw_qp_tx.c ++++ b/drivers/infiniband/sw/siw/siw_qp_tx.c +@@ -29,7 +29,7 @@ static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) + dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); + + if (paddr) +- return virt_to_page(paddr); ++ return virt_to_page((void *)paddr); + + return NULL; + } +@@ -533,13 +533,23 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) + kunmap_local(kaddr); + } + } else { +- u64 va = sge->laddr + sge_off; ++ /* ++ * Cast to an uintptr_t to preserve all 64 bits ++ * in sge->laddr. ++ */ ++ uintptr_t va = (uintptr_t)(sge->laddr + sge_off); + +- page_array[seg] = virt_to_page(va & PAGE_MASK); ++ /* ++ * virt_to_page() takes a (void *) pointer ++ * so cast to a (void *) meaning it will be 64 ++ * bits on a 64 bit platform and 32 bits on a ++ * 32 bit platform. ++ */ ++ page_array[seg] = virt_to_page((void *)(va & PAGE_MASK)); + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, +- (void *)(uintptr_t)va, ++ (void *)va, + plen); + } + +diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c +index 525f083fcaeb4..bf464400a4409 100644 +--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c ++++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c +@@ -1004,7 +1004,8 @@ rtrs_clt_get_copy_req(struct rtrs_clt_path *alive_path, + static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con, + struct rtrs_clt_io_req *req, + struct rtrs_rbuf *rbuf, bool fr_en, +- u32 size, u32 imm, struct ib_send_wr *wr, ++ u32 count, u32 size, u32 imm, ++ struct ib_send_wr *wr, + struct ib_send_wr *tail) + { + struct rtrs_clt_path *clt_path = to_clt_path(con->c.path); +@@ -1024,12 +1025,12 @@ static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con, + num_sge = 2; + ptail = tail; + } else { +- for_each_sg(req->sglist, sg, req->sg_cnt, i) { ++ for_each_sg(req->sglist, sg, count, i) { + sge[i].addr = sg_dma_address(sg); + sge[i].length = sg_dma_len(sg); + sge[i].lkey = clt_path->s.dev->ib_pd->local_dma_lkey; + } +- num_sge = 1 + req->sg_cnt; ++ num_sge = 1 + count; + } + sge[i].addr = req->iu->dma_addr; + sge[i].length = size; +@@ -1142,7 +1143,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) + */ + rtrs_clt_update_all_stats(req, WRITE); + +- ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, ++ ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count, + req->usr_len + sizeof(*msg), + imm, wr, &inv_wr); + if (ret) { +diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c +index 24024bce25664..ee4876bdce4ac 100644 +--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c ++++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c +@@ -600,7 +600,7 @@ static 
int map_cont_bufs(struct rtrs_srv_path *srv_path) + struct sg_table *sgt = &srv_mr->sgt; + struct scatterlist *s; + struct ib_mr *mr; +- int nr, chunks; ++ int nr, nr_sgt, chunks; + + chunks = chunks_per_mr * mri; + if (!always_invalidate) +@@ -615,19 +615,19 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) + sg_set_page(s, srv->chunks[chunks + i], + max_chunk_size, 0); + +- nr = ib_dma_map_sg(srv_path->s.dev->ib_dev, sgt->sgl, ++ nr_sgt = ib_dma_map_sg(srv_path->s.dev->ib_dev, sgt->sgl, + sgt->nents, DMA_BIDIRECTIONAL); +- if (nr < sgt->nents) { +- err = nr < 0 ? nr : -EINVAL; ++ if (!nr_sgt) { ++ err = -EINVAL; + goto free_sg; + } + mr = ib_alloc_mr(srv_path->s.dev->ib_pd, IB_MR_TYPE_MEM_REG, +- sgt->nents); ++ nr_sgt); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto unmap_sg; + } +- nr = ib_map_mr_sg(mr, sgt->sgl, sgt->nents, ++ nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt, + NULL, max_chunk_size); + if (nr < 0 || nr < sgt->nents) { + err = nr < 0 ? nr : -EINVAL; +@@ -646,7 +646,7 @@ static int map_cont_bufs(struct rtrs_srv_path *srv_path) + } + } + /* Eventually dma addr for each chunk can be cached */ +- for_each_sg(sgt->sgl, s, sgt->orig_nents, i) ++ for_each_sg(sgt->sgl, s, nr_sgt, i) + srv_path->dma_addr[chunks + i] = sg_dma_address(s); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); +diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c +index 6058abf42ba74..3d9c108d73ad8 100644 +--- a/drivers/infiniband/ulp/srp/ib_srp.c ++++ b/drivers/infiniband/ulp/srp/ib_srp.c +@@ -1962,7 +1962,8 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) + if (scmnd) { + req = scsi_cmd_priv(scmnd); + scmnd = srp_claim_req(ch, req, NULL, scmnd); +- } else { ++ } ++ if (!scmnd) { + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %#016llx received on ch %td / QP %#x\n", + rsp->tag, ch - target->ch, ch->qp->qp_num); +diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c +index 840831d5d2ad9..a0924144bac80 100644 +--- a/drivers/iommu/amd/iommu.c ++++ b/drivers/iommu/amd/iommu.c +@@ -874,7 +874,8 @@ static void build_completion_wait(struct iommu_cmd *cmd, + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; + cmd->data[1] = upper_32_bits(paddr); +- cmd->data[2] = data; ++ cmd->data[2] = lower_32_bits(data); ++ cmd->data[3] = upper_32_bits(data); + CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); + } + +diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c +index afb3efd565b78..f3e2689787ae5 100644 +--- a/drivers/iommu/amd/iommu_v2.c ++++ b/drivers/iommu/amd/iommu_v2.c +@@ -786,6 +786,8 @@ int amd_iommu_init_device(struct pci_dev *pdev, int pasids) + if (dev_state->domain == NULL) + goto out_free_states; + ++ /* See iommu_is_default_domain() */ ++ dev_state->domain->type = IOMMU_DOMAIN_IDENTITY; + amd_iommu_domain_direct_map(dev_state->domain); + + ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids); +diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c +index 64b14ac4c7b02..fc8c1420c0b69 100644 +--- a/drivers/iommu/intel/dmar.c ++++ b/drivers/iommu/intel/dmar.c +@@ -2368,6 +2368,13 @@ static int dmar_device_hotplug(acpi_handle handle, bool insert) + if (!dmar_in_use()) + return 0; + ++ /* ++ * It's unlikely that any I/O board is hot added before the IOMMU ++ * subsystem is initialized. 
++ */ ++ if (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled) ++ return -EOPNOTSUPP; ++ + if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) { + tmp = handle; + } else { +diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c +index 5c0dce78586aa..40ac3a78d90ef 100644 +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -422,14 +422,36 @@ static inline int domain_pfn_supported(struct dmar_domain *domain, + return !(addr_width < BITS_PER_LONG && pfn >> addr_width); + } + ++/* ++ * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. ++ * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of ++ * the returned SAGAW. ++ */ ++static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu) ++{ ++ unsigned long fl_sagaw, sl_sagaw; ++ ++ fl_sagaw = BIT(2) | (cap_fl1gp_support(iommu->cap) ? BIT(3) : 0); ++ sl_sagaw = cap_sagaw(iommu->cap); ++ ++ /* Second level only. */ ++ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) ++ return sl_sagaw; ++ ++ /* First level only. */ ++ if (!ecap_slts(iommu->ecap)) ++ return fl_sagaw; ++ ++ return fl_sagaw & sl_sagaw; ++} ++ + static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) + { + unsigned long sagaw; + int agaw; + +- sagaw = cap_sagaw(iommu->cap); +- for (agaw = width_to_agaw(max_gaw); +- agaw >= 0; agaw--) { ++ sagaw = __iommu_calculate_sagaw(iommu); ++ for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { + if (test_bit(agaw, &sagaw)) + break; + } +@@ -3123,13 +3145,7 @@ static int __init init_dmars(void) + + #ifdef CONFIG_INTEL_IOMMU_SVM + if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { +- /* +- * Call dmar_alloc_hwirq() with dmar_global_lock held, +- * could cause possible lock race condition. +- */ +- up_write(&dmar_global_lock); + ret = intel_svm_enable_prq(iommu); +- down_write(&dmar_global_lock); + if (ret) + goto free_iommu; + } +@@ -4035,7 +4051,6 @@ int __init intel_iommu_init(void) + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); + +- down_write(&dmar_global_lock); + if (dmar_table_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR table\n"); +@@ -4048,16 +4063,6 @@ int __init intel_iommu_init(void) + goto out_free_dmar; + } + +- up_write(&dmar_global_lock); +- +- /* +- * The bus notifier takes the dmar_global_lock, so lockdep will +- * complain later when we register it under the lock. 
+- */ +- dmar_register_bus_notifier(); +- +- down_write(&dmar_global_lock); +- + if (!no_iommu) + intel_iommu_debugfs_init(); + +@@ -4105,11 +4110,9 @@ int __init intel_iommu_init(void) + pr_err("Initialization failed\n"); + goto out_free_dmar; + } +- up_write(&dmar_global_lock); + + init_iommu_pm_ops(); + +- down_read(&dmar_global_lock); + for_each_active_iommu(iommu, drhd) { + /* + * The flush queue implementation does not perform +@@ -4127,13 +4130,11 @@ int __init intel_iommu_init(void) + "%s", iommu->name); + iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + } +- up_read(&dmar_global_lock); + + bus_set_iommu(&pci_bus_type, &intel_iommu_ops); + if (si_domain && !hw_pass_through) + register_memory_notifier(&intel_iommu_memory_nb); + +- down_read(&dmar_global_lock); + if (probe_acpi_namespace_devices()) + pr_warn("ACPI name space devices didn't probe correctly\n"); + +@@ -4144,17 +4145,15 @@ int __init intel_iommu_init(void) + + iommu_disable_protect_mem_regions(iommu); + } +- up_read(&dmar_global_lock); +- +- pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + + intel_iommu_enabled = 1; ++ dmar_register_bus_notifier(); ++ pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); + + return 0; + + out_free_dmar: + intel_iommu_free_dmars(); +- up_write(&dmar_global_lock); + return ret; + } + +diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c +index 847ad47a2dfd3..f113833c3075c 100644 +--- a/drivers/iommu/iommu.c ++++ b/drivers/iommu/iommu.c +@@ -3089,6 +3089,24 @@ out: + return ret; + } + ++static bool iommu_is_default_domain(struct iommu_group *group) ++{ ++ if (group->domain == group->default_domain) ++ return true; ++ ++ /* ++ * If the default domain was set to identity and it is still an identity ++ * domain then we consider this a pass. This happens because of ++ * amd_iommu_init_device() replacing the default idenytity domain with an ++ * identity domain that has a different configuration for AMDGPU. ++ */ ++ if (group->default_domain && ++ group->default_domain->type == IOMMU_DOMAIN_IDENTITY && ++ group->domain && group->domain->type == IOMMU_DOMAIN_IDENTITY) ++ return true; ++ return false; ++} ++ + /** + * iommu_device_use_default_domain() - Device driver wants to handle device + * DMA through the kernel DMA API. +@@ -3107,8 +3125,7 @@ int iommu_device_use_default_domain(struct device *dev) + + mutex_lock(&group->mutex); + if (group->owner_cnt) { +- if (group->domain != group->default_domain || +- group->owner) { ++ if (group->owner || !iommu_is_default_domain(group)) { + ret = -EBUSY; + goto unlock_out; + } +diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c +index 25be4b822aa07..bf340d779c10b 100644 +--- a/drivers/iommu/virtio-iommu.c ++++ b/drivers/iommu/virtio-iommu.c +@@ -1006,7 +1006,18 @@ static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) + return iommu_fwspec_add_ids(dev, args->args, 1); + } + ++static bool viommu_capable(enum iommu_cap cap) ++{ ++ switch (cap) { ++ case IOMMU_CAP_CACHE_COHERENCY: ++ return true; ++ default: ++ return false; ++ } ++} ++ + static struct iommu_ops viommu_ops = { ++ .capable = viommu_capable, + .domain_alloc = viommu_domain_alloc, + .probe_device = viommu_probe_device, + .probe_finalize = viommu_probe_finalize, +diff --git a/drivers/md/md.c b/drivers/md/md.c +index 91e7e80fce489..25d18b67a1620 100644 +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -5647,6 +5647,7 @@ static int md_alloc(dev_t dev, char *name) + * removed (mddev_delayed_delete). 
+ */ + flush_workqueue(md_misc_wq); ++ flush_workqueue(md_rdev_misc_wq); + + mutex_lock(&disks_mutex); + mddev = mddev_alloc(dev); +diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c +index 6ba4c83fe5fc0..bff0bfd10e235 100644 +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -1974,6 +1974,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) + new_slave->target_last_arp_rx[i] = new_slave->last_rx; + ++ new_slave->last_tx = new_slave->last_rx; ++ + if (bond->params.miimon && !bond->params.use_carrier) { + link_reporting = bond_check_dev_link(bond, slave_dev, 1); + +@@ -2857,8 +2859,11 @@ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip, + return; + } + +- if (bond_handle_vlan(slave, tags, skb)) ++ if (bond_handle_vlan(slave, tags, skb)) { ++ slave_update_last_tx(slave); + arp_xmit(skb); ++ } ++ + return; + } + +@@ -3047,8 +3052,7 @@ static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, + curr_active_slave->last_link_up)) + bond_validate_arp(bond, slave, tip, sip); + else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) && +- bond_time_in_interval(bond, +- dev_trans_start(curr_arp_slave->dev), 1)) ++ bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) + bond_validate_arp(bond, slave, sip, tip); + + out_unlock: +@@ -3076,8 +3080,10 @@ static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr, + } + + addrconf_addr_solict_mult(daddr, &mcaddr); +- if (bond_handle_vlan(slave, tags, skb)) ++ if (bond_handle_vlan(slave, tags, skb)) { ++ slave_update_last_tx(slave); + ndisc_send_skb(skb, &mcaddr, saddr); ++ } + } + + static void bond_ns_send_all(struct bonding *bond, struct slave *slave) +@@ -3134,6 +3140,9 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave) + found: + if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr)) + bond_ns_send(slave, &targets[i], &saddr, tags); ++ else ++ bond_ns_send(slave, &targets[i], &in6addr_any, tags); ++ + dst_release(dst); + kfree(tags); + } +@@ -3165,12 +3174,19 @@ static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr) + return ret; + } + +-static void bond_validate_ns(struct bonding *bond, struct slave *slave, ++static void bond_validate_na(struct bonding *bond, struct slave *slave, + struct in6_addr *saddr, struct in6_addr *daddr) + { + int i; + +- if (ipv6_addr_any(saddr) || !bond_has_this_ip6(bond, daddr)) { ++ /* Ignore NAs that: ++ * 1. Source address is unspecified address. ++ * 2. Dest address is neither all-nodes multicast address nor ++ * exist on bond interface. ++ */ ++ if (ipv6_addr_any(saddr) || ++ (!ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) && ++ !bond_has_this_ip6(bond, daddr))) { + slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n", + __func__, saddr, daddr); + return; +@@ -3213,15 +3229,14 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, + * see bond_arp_rcv(). 
+ */ + if (bond_is_active_slave(slave)) +- bond_validate_ns(bond, slave, saddr, daddr); ++ bond_validate_na(bond, slave, saddr, daddr); + else if (curr_active_slave && + time_after(slave_last_rx(bond, curr_active_slave), + curr_active_slave->last_link_up)) +- bond_validate_ns(bond, slave, saddr, daddr); ++ bond_validate_na(bond, slave, saddr, daddr); + else if (curr_arp_slave && +- bond_time_in_interval(bond, +- dev_trans_start(curr_arp_slave->dev), 1)) +- bond_validate_ns(bond, slave, saddr, daddr); ++ bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) ++ bond_validate_na(bond, slave, saddr, daddr); + + out: + return RX_HANDLER_ANOTHER; +@@ -3308,12 +3323,12 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) + * so it can wait + */ + bond_for_each_slave_rcu(bond, slave, iter) { +- unsigned long trans_start = dev_trans_start(slave->dev); ++ unsigned long last_tx = slave_last_tx(slave); + + bond_propose_link_state(slave, BOND_LINK_NOCHANGE); + + if (slave->link != BOND_LINK_UP) { +- if (bond_time_in_interval(bond, trans_start, 1) && ++ if (bond_time_in_interval(bond, last_tx, 1) && + bond_time_in_interval(bond, slave->last_rx, 1)) { + + bond_propose_link_state(slave, BOND_LINK_UP); +@@ -3338,7 +3353,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) + * when the source ip is 0, so don't take the link down + * if we don't know our ip yet + */ +- if (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) || ++ if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || + !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) { + + bond_propose_link_state(slave, BOND_LINK_DOWN); +@@ -3404,7 +3419,7 @@ re_arm: + */ + static int bond_ab_arp_inspect(struct bonding *bond) + { +- unsigned long trans_start, last_rx; ++ unsigned long last_tx, last_rx; + struct list_head *iter; + struct slave *slave; + int commit = 0; +@@ -3455,9 +3470,9 @@ static int bond_ab_arp_inspect(struct bonding *bond) + * - (more than missed_max*delta since receive AND + * the bond has an IP address) + */ +- trans_start = dev_trans_start(slave->dev); ++ last_tx = slave_last_tx(slave); + if (bond_is_active_slave(slave) && +- (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) || ++ (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || + !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) { + bond_propose_link_state(slave, BOND_LINK_DOWN); + commit++; +@@ -3474,8 +3489,8 @@ static int bond_ab_arp_inspect(struct bonding *bond) + */ + static void bond_ab_arp_commit(struct bonding *bond) + { +- unsigned long trans_start; + struct list_head *iter; ++ unsigned long last_tx; + struct slave *slave; + + bond_for_each_slave(bond, slave, iter) { +@@ -3484,10 +3499,10 @@ static void bond_ab_arp_commit(struct bonding *bond) + continue; + + case BOND_LINK_UP: +- trans_start = dev_trans_start(slave->dev); ++ last_tx = slave_last_tx(slave); + if (rtnl_dereference(bond->curr_active_slave) != slave || + (!rtnl_dereference(bond->curr_active_slave) && +- bond_time_in_interval(bond, trans_start, 1))) { ++ bond_time_in_interval(bond, last_tx, 1))) { + struct slave *current_arp_slave; + + current_arp_slave = rtnl_dereference(bond->current_arp_slave); +diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c +index 6439b56f381f9..517bc3922ee24 100644 +--- a/drivers/net/dsa/ocelot/felix_vsc9959.c ++++ b/drivers/net/dsa/ocelot/felix_vsc9959.c +@@ -16,11 +16,13 @@ + #include + #include + #include 
++#include + #include "felix.h" + + #define VSC9959_NUM_PORTS 6 + + #define VSC9959_TAS_GCL_ENTRY_MAX 63 ++#define VSC9959_TAS_MIN_GATE_LEN_NS 33 + #define VSC9959_VCAP_POLICER_BASE 63 + #define VSC9959_VCAP_POLICER_MAX 383 + #define VSC9959_SWITCH_PCI_BAR 4 +@@ -1410,6 +1412,23 @@ static void vsc9959_mdio_bus_free(struct ocelot *ocelot) + mdiobus_free(felix->imdio); + } + ++/* The switch considers any frame (regardless of size) as eligible for ++ * transmission if the traffic class gate is open for at least 33 ns. ++ * Overruns are prevented by cropping an interval at the end of the gate time ++ * slot for which egress scheduling is blocked, but we need to still keep 33 ns ++ * available for one packet to be transmitted, otherwise the port tc will hang. ++ * This function returns the size of a gate interval that remains available for ++ * setting the guard band, after reserving the space for one egress frame. ++ */ ++static u64 vsc9959_tas_remaining_gate_len_ps(u64 gate_len_ns) ++{ ++ /* Gate always open */ ++ if (gate_len_ns == U64_MAX) ++ return U64_MAX; ++ ++ return (gate_len_ns - VSC9959_TAS_MIN_GATE_LEN_NS) * PSEC_PER_NSEC; ++} ++ + /* Extract shortest continuous gate open intervals in ns for each traffic class + * of a cyclic tc-taprio schedule. If a gate is always open, the duration is + * considered U64_MAX. If the gate is always closed, it is considered 0. +@@ -1471,6 +1490,65 @@ static void vsc9959_tas_min_gate_lengths(struct tc_taprio_qopt_offload *taprio, + min_gate_len[tc] = 0; + } + ++/* ocelot_write_rix is a macro that concatenates QSYS_MAXSDU_CFG_* with _RSZ, ++ * so we need to spell out the register access to each traffic class in helper ++ * functions, to simplify callers ++ */ ++static void vsc9959_port_qmaxsdu_set(struct ocelot *ocelot, int port, int tc, ++ u32 max_sdu) ++{ ++ switch (tc) { ++ case 0: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_0, ++ port); ++ break; ++ case 1: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_1, ++ port); ++ break; ++ case 2: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_2, ++ port); ++ break; ++ case 3: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_3, ++ port); ++ break; ++ case 4: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_4, ++ port); ++ break; ++ case 5: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_5, ++ port); ++ break; ++ case 6: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_6, ++ port); ++ break; ++ case 7: ++ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_7, ++ port); ++ break; ++ } ++} ++ ++static u32 vsc9959_port_qmaxsdu_get(struct ocelot *ocelot, int port, int tc) ++{ ++ switch (tc) { ++ case 0: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_0, port); ++ case 1: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_1, port); ++ case 2: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_2, port); ++ case 3: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_3, port); ++ case 4: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_4, port); ++ case 5: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_5, port); ++ case 6: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_6, port); ++ case 7: return ocelot_read_rix(ocelot, QSYS_QMAXSDU_CFG_7, port); ++ default: ++ return 0; ++ } ++} ++ + /* Update QSYS_PORT_MAX_SDU to make sure the static guard bands added by the + * switch (see the ALWAYS_GUARD_BAND_SCH_Q comment) are correct at all MTU + * values (the default value is 1518). 
Also, for traffic class windows smaller +@@ -1527,11 +1605,16 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) + + vsc9959_tas_min_gate_lengths(ocelot_port->taprio, min_gate_len); + ++ mutex_lock(&ocelot->fwd_domain_lock); ++ + for (tc = 0; tc < OCELOT_NUM_TC; tc++) { ++ u64 remaining_gate_len_ps; + u32 max_sdu; + +- if (min_gate_len[tc] == U64_MAX /* Gate always open */ || +- min_gate_len[tc] * 1000 > needed_bit_time_ps) { ++ remaining_gate_len_ps = ++ vsc9959_tas_remaining_gate_len_ps(min_gate_len[tc]); ++ ++ if (remaining_gate_len_ps > needed_bit_time_ps) { + /* Setting QMAXSDU_CFG to 0 disables oversized frame + * dropping. + */ +@@ -1544,9 +1627,15 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) + /* If traffic class doesn't support a full MTU sized + * frame, make sure to enable oversize frame dropping + * for frames larger than the smallest that would fit. ++ * ++ * However, the exact same register, QSYS_QMAXSDU_CFG_*, ++ * controls not only oversized frame dropping, but also ++ * per-tc static guard band lengths, so it reduces the ++ * useful gate interval length. Therefore, be careful ++ * to calculate a guard band (and therefore max_sdu) ++ * that still leaves 33 ns available in the time slot. + */ +- max_sdu = div_u64(min_gate_len[tc] * 1000, +- picos_per_byte); ++ max_sdu = div_u64(remaining_gate_len_ps, picos_per_byte); + /* A TC gate may be completely closed, which is a + * special case where all packets are oversized. + * Any limit smaller than 64 octets accomplishes this +@@ -1569,47 +1658,14 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) + max_sdu); + } + +- /* ocelot_write_rix is a macro that concatenates +- * QSYS_MAXSDU_CFG_* with _RSZ, so we need to spell out +- * the writes to each traffic class +- */ +- switch (tc) { +- case 0: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_0, +- port); +- break; +- case 1: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_1, +- port); +- break; +- case 2: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_2, +- port); +- break; +- case 3: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_3, +- port); +- break; +- case 4: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_4, +- port); +- break; +- case 5: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_5, +- port); +- break; +- case 6: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_6, +- port); +- break; +- case 7: +- ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_7, +- port); +- break; +- } ++ vsc9959_port_qmaxsdu_set(ocelot, port, tc, max_sdu); + } + + ocelot_write_rix(ocelot, maxlen, QSYS_PORT_MAX_SDU, port); ++ ++ ocelot->ops->cut_through_fwd(ocelot); ++ ++ mutex_unlock(&ocelot->fwd_domain_lock); + } + + static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, +@@ -1636,13 +1692,13 @@ static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, + break; + } + ++ mutex_lock(&ocelot->tas_lock); ++ + ocelot_rmw_rix(ocelot, + QSYS_TAG_CONFIG_LINK_SPEED(tas_speed), + QSYS_TAG_CONFIG_LINK_SPEED_M, + QSYS_TAG_CONFIG, port); + +- mutex_lock(&ocelot->tas_lock); +- + if (ocelot_port->taprio) + vsc9959_tas_guard_bands_update(ocelot, port); + +@@ -2709,7 +2765,7 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot) + { + struct felix *felix = ocelot_to_felix(ocelot); + struct dsa_switch *ds = felix->ds; +- int port, other_port; ++ int tc, port, other_port; + + lockdep_assert_held(&ocelot->fwd_domain_lock); + +@@ -2753,19 +2809,27 @@ 
static void vsc9959_cut_through_fwd(struct ocelot *ocelot) + min_speed = other_ocelot_port->speed; + } + +- /* Enable cut-through forwarding for all traffic classes. */ +- if (ocelot_port->speed == min_speed) ++ /* Enable cut-through forwarding for all traffic classes that ++ * don't have oversized dropping enabled, since this check is ++ * bypassed in cut-through mode. ++ */ ++ if (ocelot_port->speed == min_speed) { + val = GENMASK(7, 0); + ++ for (tc = 0; tc < OCELOT_NUM_TC; tc++) ++ if (vsc9959_port_qmaxsdu_get(ocelot, port, tc)) ++ val &= ~BIT(tc); ++ } ++ + set: + tmp = ocelot_read_rix(ocelot, ANA_CUT_THRU_CFG, port); + if (tmp == val) + continue; + + dev_dbg(ocelot->dev, +- "port %d fwd mask 0x%lx speed %d min_speed %d, %s cut-through forwarding\n", ++ "port %d fwd mask 0x%lx speed %d min_speed %d, %s cut-through forwarding on TC mask 0x%x\n", + port, mask, ocelot_port->speed, min_speed, +- val ? "enabling" : "disabling"); ++ val ? "enabling" : "disabling", val); + + ocelot_write_rix(ocelot, val, ANA_CUT_THRU_CFG, port); + } +diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h +index 407fe8f340a06..c5b61bc80f783 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e.h ++++ b/drivers/net/ethernet/intel/i40e/i40e.h +@@ -1291,4 +1291,18 @@ int i40e_add_del_cloud_filter(struct i40e_vsi *vsi, + int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, + struct i40e_cloud_filter *filter, + bool add); ++ ++/** ++ * i40e_is_tc_mqprio_enabled - check if TC MQPRIO is enabled on PF ++ * @pf: pointer to a pf. ++ * ++ * Check and return value of flag I40E_FLAG_TC_MQPRIO. ++ * ++ * Return: I40E_FLAG_TC_MQPRIO set state. ++ **/ ++static inline u32 i40e_is_tc_mqprio_enabled(struct i40e_pf *pf) ++{ ++ return pf->flags & I40E_FLAG_TC_MQPRIO; ++} ++ + #endif /* _I40E_H_ */ +diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c +index ea2bb0140a6eb..10d7a982a5b9b 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_client.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_client.c +@@ -177,6 +177,10 @@ void i40e_notify_client_of_netdev_close(struct i40e_vsi *vsi, bool reset) + "Cannot locate client instance close routine\n"); + return; + } ++ if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) { ++ dev_dbg(&pf->pdev->dev, "Client is not open, abort close\n"); ++ return; ++ } + cdev->client->ops->close(&cdev->lan_info, cdev->client, reset); + clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); + i40e_client_release_qvlist(&cdev->lan_info); +@@ -429,7 +433,6 @@ void i40e_client_subtask(struct i40e_pf *pf) + /* Remove failed client instance */ + clear_bit(__I40E_CLIENT_INSTANCE_OPENED, + &cdev->state); +- i40e_client_del_instance(pf); + return; + } + } +diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +index 22a61802a4027..ed9984f1e1b9f 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +@@ -4931,7 +4931,7 @@ static int i40e_set_channels(struct net_device *dev, + /* We do not support setting channels via ethtool when TCs are + * configured through mqprio + */ +- if (pf->flags & I40E_FLAG_TC_MQPRIO) ++ if (i40e_is_tc_mqprio_enabled(pf)) + return -EINVAL; + + /* verify they are not requesting separate vectors */ +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index 71a8e1698ed48..1aaf0c5ddf6cf 100644 +--- 
a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -5339,7 +5339,7 @@ static u8 i40e_pf_get_num_tc(struct i40e_pf *pf) + u8 num_tc = 0; + struct i40e_dcbx_config *dcbcfg = &hw->local_dcbx_config; + +- if (pf->flags & I40E_FLAG_TC_MQPRIO) ++ if (i40e_is_tc_mqprio_enabled(pf)) + return pf->vsi[pf->lan_vsi]->mqprio_qopt.qopt.num_tc; + + /* If neither MQPRIO nor DCB is enabled, then always use single TC */ +@@ -5371,7 +5371,7 @@ static u8 i40e_pf_get_num_tc(struct i40e_pf *pf) + **/ + static u8 i40e_pf_get_tc_map(struct i40e_pf *pf) + { +- if (pf->flags & I40E_FLAG_TC_MQPRIO) ++ if (i40e_is_tc_mqprio_enabled(pf)) + return i40e_mqprio_get_enabled_tc(pf); + + /* If neither MQPRIO nor DCB is enabled for this PF then just return +@@ -5468,7 +5468,7 @@ static int i40e_vsi_configure_bw_alloc(struct i40e_vsi *vsi, u8 enabled_tc, + int i; + + /* There is no need to reset BW when mqprio mode is on. */ +- if (pf->flags & I40E_FLAG_TC_MQPRIO) ++ if (i40e_is_tc_mqprio_enabled(pf)) + return 0; + if (!vsi->mqprio_qopt.qopt.hw && !(pf->flags & I40E_FLAG_DCB_ENABLED)) { + ret = i40e_set_bw_limit(vsi, vsi->seid, 0); +@@ -5540,7 +5540,7 @@ static void i40e_vsi_config_netdev_tc(struct i40e_vsi *vsi, u8 enabled_tc) + vsi->tc_config.tc_info[i].qoffset); + } + +- if (pf->flags & I40E_FLAG_TC_MQPRIO) ++ if (i40e_is_tc_mqprio_enabled(pf)) + return; + + /* Assign UP2TC map for the VSI */ +@@ -5701,7 +5701,7 @@ static int i40e_vsi_config_tc(struct i40e_vsi *vsi, u8 enabled_tc) + ctxt.vf_num = 0; + ctxt.uplink_seid = vsi->uplink_seid; + ctxt.info = vsi->info; +- if (vsi->back->flags & I40E_FLAG_TC_MQPRIO) { ++ if (i40e_is_tc_mqprio_enabled(pf)) { + ret = i40e_vsi_setup_queue_map_mqprio(vsi, &ctxt, enabled_tc); + if (ret) + goto out; +@@ -6425,7 +6425,7 @@ int i40e_create_queue_channel(struct i40e_vsi *vsi, + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + + if (vsi->type == I40E_VSI_MAIN) { +- if (pf->flags & I40E_FLAG_TC_MQPRIO) ++ if (i40e_is_tc_mqprio_enabled(pf)) + i40e_do_reset(pf, I40E_PF_RESET_FLAG, true); + else + i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); +@@ -6536,6 +6536,9 @@ static int i40e_configure_queue_channels(struct i40e_vsi *vsi) + vsi->tc_seid_map[i] = ch->seid; + } + } ++ ++ /* reset to reconfigure TX queue contexts */ ++ i40e_do_reset(vsi->back, I40E_PF_RESET_FLAG, true); + return ret; + + err_free: +@@ -7819,7 +7822,7 @@ static void *i40e_fwd_add(struct net_device *netdev, struct net_device *vdev) + netdev_info(netdev, "Macvlans are not supported when DCB is enabled\n"); + return ERR_PTR(-EINVAL); + } +- if ((pf->flags & I40E_FLAG_TC_MQPRIO)) { ++ if (i40e_is_tc_mqprio_enabled(pf)) { + netdev_info(netdev, "Macvlans are not supported when HW TC offload is on\n"); + return ERR_PTR(-EINVAL); + } +@@ -8072,7 +8075,7 @@ config_tc: + /* Quiesce VSI queues */ + i40e_quiesce_vsi(vsi); + +- if (!hw && !(pf->flags & I40E_FLAG_TC_MQPRIO)) ++ if (!hw && !i40e_is_tc_mqprio_enabled(pf)) + i40e_remove_queue_channels(vsi); + + /* Configure VSI for enabled TCs */ +@@ -8096,7 +8099,7 @@ config_tc: + "Setup channel (id:%u) utilizing num_queues %d\n", + vsi->seid, vsi->tc_config.tc_info[0].qcount); + +- if (pf->flags & I40E_FLAG_TC_MQPRIO) { ++ if (i40e_is_tc_mqprio_enabled(pf)) { + if (vsi->mqprio_qopt.max_rate[0]) { + u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; + +@@ -10750,7 +10753,7 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) + * unless I40E_FLAG_TC_MQPRIO was enabled or DCB + * is not supported with new link speed + */ +- if 
(pf->flags & I40E_FLAG_TC_MQPRIO) { ++ if (i40e_is_tc_mqprio_enabled(pf)) { + i40e_aq_set_dcb_parameters(hw, false, NULL); + } else { + if (I40E_IS_X710TL_DEVICE(hw->device_id) && +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index af69ccc6e8d2f..07f1e209d524d 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -3689,7 +3689,8 @@ u16 i40e_lan_select_queue(struct net_device *netdev, + u8 prio; + + /* is DCB enabled at all? */ +- if (vsi->tc_config.numtc == 1) ++ if (vsi->tc_config.numtc == 1 || ++ i40e_is_tc_mqprio_enabled(vsi->back)) + return netdev_pick_tx(netdev, skb, sb_dev); + + prio = skb->priority; +diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c +index 6d159334da9ec..981c43b204ff4 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_main.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_main.c +@@ -2789,6 +2789,11 @@ static void iavf_reset_task(struct work_struct *work) + int i = 0, err; + bool running; + ++ /* Detach interface to avoid subsequent NDO callbacks */ ++ rtnl_lock(); ++ netif_device_detach(netdev); ++ rtnl_unlock(); ++ + /* When device is being removed it doesn't make sense to run the reset + * task, just return in such a case. + */ +@@ -2796,7 +2801,7 @@ static void iavf_reset_task(struct work_struct *work) + if (adapter->state != __IAVF_REMOVE) + queue_work(iavf_wq, &adapter->reset_task); + +- return; ++ goto reset_finish; + } + + while (!mutex_trylock(&adapter->client_lock)) +@@ -2866,7 +2871,6 @@ continue_reset: + + if (running) { + netif_carrier_off(netdev); +- netif_tx_stop_all_queues(netdev); + adapter->link_up = false; + iavf_napi_disable_all(adapter); + } +@@ -2996,7 +3000,7 @@ continue_reset: + mutex_unlock(&adapter->client_lock); + mutex_unlock(&adapter->crit_lock); + +- return; ++ goto reset_finish; + reset_err: + if (running) { + set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); +@@ -3007,6 +3011,10 @@ reset_err: + mutex_unlock(&adapter->client_lock); + mutex_unlock(&adapter->crit_lock); + dev_err(&adapter->pdev->dev, "failed to allocate resources during reinit\n"); ++reset_finish: ++ rtnl_lock(); ++ netif_device_attach(netdev); ++ rtnl_unlock(); + } + + /** +diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c +index 136d7911adb48..1e32438081780 100644 +--- a/drivers/net/ethernet/intel/ice/ice_base.c ++++ b/drivers/net/ethernet/intel/ice/ice_base.c +@@ -7,18 +7,6 @@ + #include "ice_dcb_lib.h" + #include "ice_sriov.h" + +-static bool ice_alloc_rx_buf_zc(struct ice_rx_ring *rx_ring) +-{ +- rx_ring->xdp_buf = kcalloc(rx_ring->count, sizeof(*rx_ring->xdp_buf), GFP_KERNEL); +- return !!rx_ring->xdp_buf; +-} +- +-static bool ice_alloc_rx_buf(struct ice_rx_ring *rx_ring) +-{ +- rx_ring->rx_buf = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_buf), GFP_KERNEL); +- return !!rx_ring->rx_buf; +-} +- + /** + * __ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI + * @qs_cfg: gathered variables needed for PF->VSI queues assignment +@@ -519,11 +507,8 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) + xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, + ring->q_index, ring->q_vector->napi.napi_id); + +- kfree(ring->rx_buf); + ring->xsk_pool = ice_xsk_pool(ring); + if (ring->xsk_pool) { +- if (!ice_alloc_rx_buf_zc(ring)) +- return -ENOMEM; + xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + + ring->rx_buf_len = +@@ -538,8 +523,6 @@ int ice_vsi_cfg_rxq(struct 
ice_rx_ring *ring) + dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + ring->q_index); + } else { +- if (!ice_alloc_rx_buf(ring)) +- return -ENOMEM; + if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) + /* coverity[check_return] */ + xdp_rxq_info_reg(&ring->xdp_rxq, +diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c +index 3d45e075204e3..4c6bb7482b362 100644 +--- a/drivers/net/ethernet/intel/ice/ice_main.c ++++ b/drivers/net/ethernet/intel/ice/ice_main.c +@@ -2898,10 +2898,18 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog, + if (xdp_ring_err) + NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Tx resources failed"); + } ++ /* reallocate Rx queues that are used for zero-copy */ ++ xdp_ring_err = ice_realloc_zc_buf(vsi, true); ++ if (xdp_ring_err) ++ NL_SET_ERR_MSG_MOD(extack, "Setting up XDP Rx resources failed"); + } else if (ice_is_xdp_ena_vsi(vsi) && !prog) { + xdp_ring_err = ice_destroy_xdp_rings(vsi); + if (xdp_ring_err) + NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Tx resources failed"); ++ /* reallocate Rx queues that were used for zero-copy */ ++ xdp_ring_err = ice_realloc_zc_buf(vsi, false); ++ if (xdp_ring_err) ++ NL_SET_ERR_MSG_MOD(extack, "Freeing XDP Rx resources failed"); + } else { + /* safe to call even when prog == vsi->xdp_prog as + * dev_xdp_install in net/core/dev.c incremented prog's +@@ -3904,7 +3912,7 @@ static int ice_init_pf(struct ice_pf *pf) + + pf->avail_rxqs = bitmap_zalloc(pf->max_pf_rxqs, GFP_KERNEL); + if (!pf->avail_rxqs) { +- devm_kfree(ice_pf_to_dev(pf), pf->avail_txqs); ++ bitmap_free(pf->avail_txqs); + pf->avail_txqs = NULL; + return -ENOMEM; + } +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c +index e48e29258450f..03ce85f6e6df8 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.c ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c +@@ -192,6 +192,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); + if (err) + return err; ++ ice_clean_rx_ring(rx_ring); + + ice_qvec_toggle_napi(vsi, q_vector, false); + ice_qp_clean_rings(vsi, q_idx); +@@ -316,6 +317,62 @@ ice_xsk_pool_enable(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) + return 0; + } + ++/** ++ * ice_realloc_rx_xdp_bufs - reallocate for either XSK or normal buffer ++ * @rx_ring: Rx ring ++ * @pool_present: is pool for XSK present ++ * ++ * Try allocating memory and return ENOMEM, if failed to allocate. ++ * If allocation was successful, substitute buffer with allocated one. ++ * Returns 0 on success, negative on failure ++ */ ++static int ++ice_realloc_rx_xdp_bufs(struct ice_rx_ring *rx_ring, bool pool_present) ++{ ++ size_t elem_size = pool_present ? sizeof(*rx_ring->xdp_buf) : ++ sizeof(*rx_ring->rx_buf); ++ void *sw_ring = kcalloc(rx_ring->count, elem_size, GFP_KERNEL); ++ ++ if (!sw_ring) ++ return -ENOMEM; ++ ++ if (pool_present) { ++ kfree(rx_ring->rx_buf); ++ rx_ring->rx_buf = NULL; ++ rx_ring->xdp_buf = sw_ring; ++ } else { ++ kfree(rx_ring->xdp_buf); ++ rx_ring->xdp_buf = NULL; ++ rx_ring->rx_buf = sw_ring; ++ } ++ ++ return 0; ++} ++ ++/** ++ * ice_realloc_zc_buf - reallocate XDP ZC queue pairs ++ * @vsi: Current VSI ++ * @zc: is zero copy set ++ * ++ * Reallocate buffer for rx_rings that might be used by XSK. ++ * XDP requires more memory, than rx_buf provides. 
++ * Returns 0 on success, negative on failure ++ */ ++int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc) ++{ ++ struct ice_rx_ring *rx_ring; ++ unsigned long q; ++ ++ for_each_set_bit(q, vsi->af_xdp_zc_qps, ++ max_t(int, vsi->alloc_txq, vsi->alloc_rxq)) { ++ rx_ring = vsi->rx_rings[q]; ++ if (ice_realloc_rx_xdp_bufs(rx_ring, zc)) ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ + /** + * ice_xsk_pool_setup - enable/disable a buffer pool region depending on its state + * @vsi: Current VSI +@@ -345,11 +402,17 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) + if_running = netif_running(vsi->netdev) && ice_is_xdp_ena_vsi(vsi); + + if (if_running) { ++ struct ice_rx_ring *rx_ring = vsi->rx_rings[qid]; ++ + ret = ice_qp_dis(vsi, qid); + if (ret) { + netdev_err(vsi->netdev, "ice_qp_dis error = %d\n", ret); + goto xsk_pool_if_up; + } ++ ++ ret = ice_realloc_rx_xdp_bufs(rx_ring, pool_present); ++ if (ret) ++ goto xsk_pool_if_up; + } + + pool_failure = pool_present ? ice_xsk_pool_enable(vsi, pool, qid) : +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h +index 21faec8e97db1..4edbe81eb6460 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.h ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.h +@@ -27,6 +27,7 @@ bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); + void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring); + void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring); + bool ice_xmit_zc(struct ice_tx_ring *xdp_ring, u32 budget, int napi_budget); ++int ice_realloc_zc_buf(struct ice_vsi *vsi, bool zc); + #else + static inline bool + ice_xmit_zc(struct ice_tx_ring __always_unused *xdp_ring, +@@ -72,5 +73,12 @@ ice_xsk_wakeup(struct net_device __always_unused *netdev, + + static inline void ice_xsk_clean_rx_ring(struct ice_rx_ring *rx_ring) { } + static inline void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring) { } ++ ++static inline int ++ice_realloc_zc_buf(struct ice_vsi __always_unused *vsi, ++ bool __always_unused zc) ++{ ++ return 0; ++} + #endif /* CONFIG_XDP_SOCKETS */ + #endif /* !_ICE_XSK_H_ */ +diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c +index dab8f3f771f84..cfe804bc8d205 100644 +--- a/drivers/net/ethernet/mediatek/mtk_ppe.c ++++ b/drivers/net/ethernet/mediatek/mtk_ppe.c +@@ -412,7 +412,7 @@ __mtk_foe_entry_clear(struct mtk_ppe *ppe, struct mtk_flow_entry *entry) + if (entry->hash != 0xffff) { + ppe->foe_table[entry->hash].ib1 &= ~MTK_FOE_IB1_STATE; + ppe->foe_table[entry->hash].ib1 |= FIELD_PREP(MTK_FOE_IB1_STATE, +- MTK_FOE_STATE_BIND); ++ MTK_FOE_STATE_UNBIND); + dma_wmb(); + } + entry->hash = 0xffff; +diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h +index 1f5cf1c9a9475..69ffce04d6306 100644 +--- a/drivers/net/ethernet/mediatek/mtk_ppe.h ++++ b/drivers/net/ethernet/mediatek/mtk_ppe.h +@@ -293,6 +293,9 @@ mtk_ppe_check_skb(struct mtk_ppe *ppe, struct sk_buff *skb, u16 hash) + if (!ppe) + return; + ++ if (hash > MTK_PPE_HASH_MASK) ++ return; ++ + now = (u16)jiffies; + diff = now - ppe->foe_check_time[hash]; + if (diff < HZ / 10) +diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c +index 73f7962a37d33..c49062ad72c6c 100644 +--- a/drivers/net/phy/meson-gxl.c ++++ b/drivers/net/phy/meson-gxl.c +@@ -243,13 +243,7 @@ static irqreturn_t meson_gxl_handle_interrupt(struct phy_device *phydev) + irq_status == INTSRC_ENERGY_DETECT) + return IRQ_HANDLED; + +- /* Give PHY some time before 
MAC starts sending data. This works +- * around an issue where network doesn't come up properly. +- */ +- if (!(irq_status & INTSRC_LINK_DOWN)) +- phy_queue_state_machine(phydev, msecs_to_jiffies(100)); +- else +- phy_trigger_machine(phydev); ++ phy_trigger_machine(phydev); + + return IRQ_HANDLED; + } +diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c +index d4c93d59bc539..8569a545e0a3f 100644 +--- a/drivers/net/phy/microchip_t1.c ++++ b/drivers/net/phy/microchip_t1.c +@@ -28,12 +28,16 @@ + + /* Interrupt Source Register */ + #define LAN87XX_INTERRUPT_SOURCE (0x18) ++#define LAN87XX_INTERRUPT_SOURCE_2 (0x08) + + /* Interrupt Mask Register */ + #define LAN87XX_INTERRUPT_MASK (0x19) + #define LAN87XX_MASK_LINK_UP (0x0004) + #define LAN87XX_MASK_LINK_DOWN (0x0002) + ++#define LAN87XX_INTERRUPT_MASK_2 (0x09) ++#define LAN87XX_MASK_COMM_RDY BIT(10) ++ + /* MISC Control 1 Register */ + #define LAN87XX_CTRL_1 (0x11) + #define LAN87XX_MASK_RGMII_TXC_DLY_EN (0x4000) +@@ -424,17 +428,55 @@ static int lan87xx_phy_config_intr(struct phy_device *phydev) + int rc, val = 0; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { +- /* unmask all source and clear them before enable */ +- rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, 0x7FFF); ++ /* clear all interrupt */ ++ rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val); ++ if (rc < 0) ++ return rc; ++ + rc = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE); +- val = LAN87XX_MASK_LINK_UP | LAN87XX_MASK_LINK_DOWN; ++ if (rc < 0) ++ return rc; ++ ++ rc = access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, ++ PHYACC_ATTR_BANK_MISC, ++ LAN87XX_INTERRUPT_MASK_2, val); ++ if (rc < 0) ++ return rc; ++ ++ rc = access_ereg(phydev, PHYACC_ATTR_MODE_READ, ++ PHYACC_ATTR_BANK_MISC, ++ LAN87XX_INTERRUPT_SOURCE_2, 0); ++ if (rc < 0) ++ return rc; ++ ++ /* enable link down and comm ready interrupt */ ++ val = LAN87XX_MASK_LINK_DOWN; + rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val); ++ if (rc < 0) ++ return rc; ++ ++ val = LAN87XX_MASK_COMM_RDY; ++ rc = access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, ++ PHYACC_ATTR_BANK_MISC, ++ LAN87XX_INTERRUPT_MASK_2, val); + } else { + rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val); +- if (rc) ++ if (rc < 0) + return rc; + + rc = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE); ++ if (rc < 0) ++ return rc; ++ ++ rc = access_ereg(phydev, PHYACC_ATTR_MODE_WRITE, ++ PHYACC_ATTR_BANK_MISC, ++ LAN87XX_INTERRUPT_MASK_2, val); ++ if (rc < 0) ++ return rc; ++ ++ rc = access_ereg(phydev, PHYACC_ATTR_MODE_READ, ++ PHYACC_ATTR_BANK_MISC, ++ LAN87XX_INTERRUPT_SOURCE_2, 0); + } + + return rc < 0 ? rc : 0; +@@ -444,6 +486,14 @@ static irqreturn_t lan87xx_handle_interrupt(struct phy_device *phydev) + { + int irq_status; + ++ irq_status = access_ereg(phydev, PHYACC_ATTR_MODE_READ, ++ PHYACC_ATTR_BANK_MISC, ++ LAN87XX_INTERRUPT_SOURCE_2, 0); ++ if (irq_status < 0) { ++ phy_error(phydev); ++ return IRQ_NONE; ++ } ++ + irq_status = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE); + if (irq_status < 0) { + phy_error(phydev); +diff --git a/drivers/net/wireless/intel/iwlegacy/4965-rs.c b/drivers/net/wireless/intel/iwlegacy/4965-rs.c +index c62f299b9e0a8..d8a5dbf89a021 100644 +--- a/drivers/net/wireless/intel/iwlegacy/4965-rs.c ++++ b/drivers/net/wireless/intel/iwlegacy/4965-rs.c +@@ -2403,7 +2403,7 @@ il4965_rs_fill_link_cmd(struct il_priv *il, struct il_lq_sta *lq_sta, + /* Repeat initial/next rate. + * For legacy IL_NUMBER_TRY == 1, this loop will not execute. + * For HT IL_HT_NUMBER_TRY == 3, this executes twice. 
*/ +- while (repeat_rate > 0) { ++ while (repeat_rate > 0 && idx < (LINK_QUAL_MAX_RETRY_NUM - 1)) { + if (is_legacy(tbl_type.lq_type)) { + if (ant_toggle_cnt < NUM_TRY_BEFORE_ANT_TOGGLE) + ant_toggle_cnt++; +@@ -2422,8 +2422,6 @@ il4965_rs_fill_link_cmd(struct il_priv *il, struct il_lq_sta *lq_sta, + cpu_to_le32(new_rate); + repeat_rate--; + idx++; +- if (idx >= LINK_QUAL_MAX_RETRY_NUM) +- goto out; + } + + il4965_rs_get_tbl_info_from_mcs(new_rate, lq_sta->band, +@@ -2468,7 +2466,6 @@ il4965_rs_fill_link_cmd(struct il_priv *il, struct il_lq_sta *lq_sta, + repeat_rate--; + } + +-out: + lq_cmd->agg_params.agg_frame_cnt_limit = LINK_QUAL_AGG_FRAME_LIMIT_DEF; + lq_cmd->agg_params.agg_dis_start_th = LINK_QUAL_AGG_DISABLE_START_DEF; + +diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c +index b0f58bcf70cb0..106c88b723b90 100644 +--- a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c ++++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c +@@ -345,7 +345,7 @@ int mt7921e_mac_reset(struct mt7921_dev *dev) + + err = mt7921e_driver_own(dev); + if (err) +- return err; ++ goto out; + + err = mt7921_run_firmware(dev); + if (err) +diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.h b/drivers/net/wireless/microchip/wilc1000/netdev.h +index a067274c20144..bf001e9def6aa 100644 +--- a/drivers/net/wireless/microchip/wilc1000/netdev.h ++++ b/drivers/net/wireless/microchip/wilc1000/netdev.h +@@ -254,6 +254,7 @@ struct wilc { + u8 *rx_buffer; + u32 rx_buffer_offset; + u8 *tx_buffer; ++ u32 *vmm_table; + + struct txq_handle txq[NQUEUES]; + int txq_entries; +diff --git a/drivers/net/wireless/microchip/wilc1000/sdio.c b/drivers/net/wireless/microchip/wilc1000/sdio.c +index 7962c11cfe848..56f924a31bc66 100644 +--- a/drivers/net/wireless/microchip/wilc1000/sdio.c ++++ b/drivers/net/wireless/microchip/wilc1000/sdio.c +@@ -27,6 +27,7 @@ struct wilc_sdio { + bool irq_gpio; + u32 block_size; + int has_thrpt_enh3; ++ u8 *cmd53_buf; + }; + + struct sdio_cmd52 { +@@ -46,6 +47,7 @@ struct sdio_cmd53 { + u32 count: 9; + u8 *buffer; + u32 block_size; ++ bool use_global_buf; + }; + + static const struct wilc_hif_func wilc_hif_sdio; +@@ -90,6 +92,8 @@ static int wilc_sdio_cmd53(struct wilc *wilc, struct sdio_cmd53 *cmd) + { + struct sdio_func *func = container_of(wilc->dev, struct sdio_func, dev); + int size, ret; ++ struct wilc_sdio *sdio_priv = wilc->bus_data; ++ u8 *buf = cmd->buffer; + + sdio_claim_host(func); + +@@ -100,12 +104,23 @@ static int wilc_sdio_cmd53(struct wilc *wilc, struct sdio_cmd53 *cmd) + else + size = cmd->count; + ++ if (cmd->use_global_buf) { ++ if (size > sizeof(u32)) ++ return -EINVAL; ++ ++ buf = sdio_priv->cmd53_buf; ++ } ++ + if (cmd->read_write) { /* write */ +- ret = sdio_memcpy_toio(func, cmd->address, +- (void *)cmd->buffer, size); ++ if (cmd->use_global_buf) ++ memcpy(buf, cmd->buffer, size); ++ ++ ret = sdio_memcpy_toio(func, cmd->address, buf, size); + } else { /* read */ +- ret = sdio_memcpy_fromio(func, (void *)cmd->buffer, +- cmd->address, size); ++ ret = sdio_memcpy_fromio(func, buf, cmd->address, size); ++ ++ if (cmd->use_global_buf) ++ memcpy(cmd->buffer, buf, size); + } + + sdio_release_host(func); +@@ -127,6 +142,12 @@ static int wilc_sdio_probe(struct sdio_func *func, + if (!sdio_priv) + return -ENOMEM; + ++ sdio_priv->cmd53_buf = kzalloc(sizeof(u32), GFP_KERNEL); ++ if (!sdio_priv->cmd53_buf) { ++ ret = -ENOMEM; ++ goto free; ++ } ++ + ret = wilc_cfg80211_init(&wilc, &func->dev, WILC_HIF_SDIO, + 
&wilc_hif_sdio); + if (ret) +@@ -160,6 +181,7 @@ dispose_irq: + irq_dispose_mapping(wilc->dev_irq_num); + wilc_netdev_cleanup(wilc); + free: ++ kfree(sdio_priv->cmd53_buf); + kfree(sdio_priv); + return ret; + } +@@ -171,6 +193,7 @@ static void wilc_sdio_remove(struct sdio_func *func) + + clk_disable_unprepare(wilc->rtc_clk); + wilc_netdev_cleanup(wilc); ++ kfree(sdio_priv->cmd53_buf); + kfree(sdio_priv); + } + +@@ -367,8 +390,9 @@ static int wilc_sdio_write_reg(struct wilc *wilc, u32 addr, u32 data) + cmd.address = WILC_SDIO_FBR_DATA_REG; + cmd.block_mode = 0; + cmd.increment = 1; +- cmd.count = 4; ++ cmd.count = sizeof(u32); + cmd.buffer = (u8 *)&data; ++ cmd.use_global_buf = true; + cmd.block_size = sdio_priv->block_size; + ret = wilc_sdio_cmd53(wilc, &cmd); + if (ret) +@@ -406,6 +430,7 @@ static int wilc_sdio_write(struct wilc *wilc, u32 addr, u8 *buf, u32 size) + nblk = size / block_size; + nleft = size % block_size; + ++ cmd.use_global_buf = false; + if (nblk > 0) { + cmd.block_mode = 1; + cmd.increment = 1; +@@ -484,8 +509,9 @@ static int wilc_sdio_read_reg(struct wilc *wilc, u32 addr, u32 *data) + cmd.address = WILC_SDIO_FBR_DATA_REG; + cmd.block_mode = 0; + cmd.increment = 1; +- cmd.count = 4; ++ cmd.count = sizeof(u32); + cmd.buffer = (u8 *)data; ++ cmd.use_global_buf = true; + + cmd.block_size = sdio_priv->block_size; + ret = wilc_sdio_cmd53(wilc, &cmd); +@@ -527,6 +553,7 @@ static int wilc_sdio_read(struct wilc *wilc, u32 addr, u8 *buf, u32 size) + nblk = size / block_size; + nleft = size % block_size; + ++ cmd.use_global_buf = false; + if (nblk > 0) { + cmd.block_mode = 1; + cmd.increment = 1; +diff --git a/drivers/net/wireless/microchip/wilc1000/wlan.c b/drivers/net/wireless/microchip/wilc1000/wlan.c +index 48441f0389ca1..0c8a571486d25 100644 +--- a/drivers/net/wireless/microchip/wilc1000/wlan.c ++++ b/drivers/net/wireless/microchip/wilc1000/wlan.c +@@ -714,7 +714,7 @@ int wilc_wlan_handle_txq(struct wilc *wilc, u32 *txq_count) + int ret = 0; + int counter; + int timeout; +- u32 vmm_table[WILC_VMM_TBL_SIZE]; ++ u32 *vmm_table = wilc->vmm_table; + u8 ac_pkt_num_to_chip[NQUEUES] = {0, 0, 0, 0}; + const struct wilc_hif_func *func; + int srcu_idx; +@@ -1251,6 +1251,8 @@ void wilc_wlan_cleanup(struct net_device *dev) + while ((rqe = wilc_wlan_rxq_remove(wilc))) + kfree(rqe); + ++ kfree(wilc->vmm_table); ++ wilc->vmm_table = NULL; + kfree(wilc->rx_buffer); + wilc->rx_buffer = NULL; + kfree(wilc->tx_buffer); +@@ -1485,6 +1487,14 @@ int wilc_wlan_init(struct net_device *dev) + goto fail; + } + ++ if (!wilc->vmm_table) ++ wilc->vmm_table = kzalloc(WILC_VMM_TBL_SIZE, GFP_KERNEL); ++ ++ if (!wilc->vmm_table) { ++ ret = -ENOBUFS; ++ goto fail; ++ } ++ + if (!wilc->tx_buffer) + wilc->tx_buffer = kmalloc(WILC_TX_BUFF_SIZE, GFP_KERNEL); + +@@ -1509,7 +1519,8 @@ int wilc_wlan_init(struct net_device *dev) + return 0; + + fail: +- ++ kfree(wilc->vmm_table); ++ wilc->vmm_table = NULL; + kfree(wilc->rx_buffer); + wilc->rx_buffer = NULL; + kfree(wilc->tx_buffer); +diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c +index 990360d75cb64..e85b3c5d4acce 100644 +--- a/drivers/net/xen-netback/xenbus.c ++++ b/drivers/net/xen-netback/xenbus.c +@@ -256,7 +256,6 @@ static void backend_disconnect(struct backend_info *be) + unsigned int queue_index; + + xen_unregister_watchers(vif); +- xenbus_rm(XBT_NIL, be->dev->nodename, "hotplug-status"); + #ifdef CONFIG_DEBUG_FS + xenvif_debugfs_delif(vif); + #endif /* CONFIG_DEBUG_FS */ +@@ -984,6 +983,7 @@ static int netback_remove(struct 
xenbus_device *dev) + struct backend_info *be = dev_get_drvdata(&dev->dev); + + unregister_hotplug_status_watch(be); ++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); + if (be->vif) { + kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); + backend_disconnect(be); +diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c +index 7a9e6ffa23429..daa0e160e1212 100644 +--- a/drivers/nvme/host/tcp.c ++++ b/drivers/nvme/host/tcp.c +@@ -121,7 +121,6 @@ struct nvme_tcp_queue { + struct mutex send_mutex; + struct llist_head req_list; + struct list_head send_list; +- bool more_requests; + + /* recv state */ + void *pdu; +@@ -318,7 +317,7 @@ static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) + static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) + { + return !list_empty(&queue->send_list) || +- !llist_empty(&queue->req_list) || queue->more_requests; ++ !llist_empty(&queue->req_list); + } + + static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, +@@ -337,9 +336,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, + */ + if (queue->io_cpu == raw_smp_processor_id() && + sync && empty && mutex_trylock(&queue->send_mutex)) { +- queue->more_requests = !last; + nvme_tcp_send_all(queue); +- queue->more_requests = false; + mutex_unlock(&queue->send_mutex); + } + +@@ -1227,7 +1224,7 @@ static void nvme_tcp_io_work(struct work_struct *w) + else if (unlikely(result < 0)) + return; + +- if (!pending) ++ if (!pending || !queue->rd_enabled) + return; + + } while (!time_after(jiffies, deadline)); /* quota is exhausted */ +diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c +index c27660a660d9a..a339719100051 100644 +--- a/drivers/nvme/target/core.c ++++ b/drivers/nvme/target/core.c +@@ -735,6 +735,8 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status) + + static void __nvmet_req_complete(struct nvmet_req *req, u16 status) + { ++ struct nvmet_ns *ns = req->ns; ++ + if (!req->sq->sqhd_disabled) + nvmet_update_sq_head(req); + req->cqe->sq_id = cpu_to_le16(req->sq->qid); +@@ -745,9 +747,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) + + trace_nvmet_req_complete(req); + +- if (req->ns) +- nvmet_put_namespace(req->ns); + req->ops->queue_response(req); ++ if (ns) ++ nvmet_put_namespace(ns); + } + + void nvmet_req_complete(struct nvmet_req *req, u16 status) +diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c +index 82b61acf7a72b..1956be87ac5ff 100644 +--- a/drivers/nvme/target/zns.c ++++ b/drivers/nvme/target/zns.c +@@ -100,6 +100,7 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) + struct nvme_id_ns_zns *id_zns; + u64 zsze; + u16 status; ++ u32 mar, mor; + + if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { + req->error_loc = offsetof(struct nvme_identify, nsid); +@@ -130,8 +131,20 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) + zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >> + req->ns->blksize_shift; + id_zns->lbafe[0].zsze = cpu_to_le64(zsze); +- id_zns->mor = cpu_to_le32(bdev_max_open_zones(req->ns->bdev)); +- id_zns->mar = cpu_to_le32(bdev_max_active_zones(req->ns->bdev)); ++ ++ mor = bdev_max_open_zones(req->ns->bdev); ++ if (!mor) ++ mor = U32_MAX; ++ else ++ mor--; ++ id_zns->mor = cpu_to_le32(mor); ++ ++ mar = bdev_max_active_zones(req->ns->bdev); ++ if (!mar) ++ mar = U32_MAX; ++ else ++ mar--; ++ id_zns->mar = cpu_to_le32(mar); + + done: + status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns)); +diff --git 
a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c +index 9be007c9420f9..f69ab90b5e22d 100644 +--- a/drivers/parisc/ccio-dma.c ++++ b/drivers/parisc/ccio-dma.c +@@ -1380,15 +1380,17 @@ ccio_init_resource(struct resource *res, char *name, void __iomem *ioaddr) + } + } + +-static void __init ccio_init_resources(struct ioc *ioc) ++static int __init ccio_init_resources(struct ioc *ioc) + { + struct resource *res = ioc->mmio_region; + char *name = kmalloc(14, GFP_KERNEL); +- ++ if (unlikely(!name)) ++ return -ENOMEM; + snprintf(name, 14, "GSC Bus [%d/]", ioc->hw_path); + + ccio_init_resource(res, name, &ioc->ioc_regs->io_io_low); + ccio_init_resource(res + 1, name, &ioc->ioc_regs->io_io_low_hv); ++ return 0; + } + + static int new_ioc_area(struct resource *res, unsigned long size, +@@ -1543,7 +1545,10 @@ static int __init ccio_probe(struct parisc_device *dev) + return -ENOMEM; + } + ccio_ioc_init(ioc); +- ccio_init_resources(ioc); ++ if (ccio_init_resources(ioc)) { ++ kfree(ioc); ++ return -ENOMEM; ++ } + hppa_dma_ops = &ccio_ops; + + hba = kzalloc(sizeof(*hba), GFP_KERNEL); +diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c +index 231d86d3949c0..1ec5baa673f92 100644 +--- a/drivers/perf/riscv_pmu_sbi.c ++++ b/drivers/perf/riscv_pmu_sbi.c +@@ -467,7 +467,7 @@ static int pmu_sbi_get_ctrinfo(int nctr) + if (!pmu_ctr_list) + return -ENOMEM; + +- for (i = 0; i <= nctr; i++) { ++ for (i = 0; i < nctr; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 0); + if (ret.error) + /* The logical counter ids are not expected to be contiguous */ +diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c +index 1e54a833f2cf0..a9daaf4d5aaab 100644 +--- a/drivers/regulator/core.c ++++ b/drivers/regulator/core.c +@@ -2732,13 +2732,18 @@ static int _regulator_do_enable(struct regulator_dev *rdev) + */ + static int _regulator_handle_consumer_enable(struct regulator *regulator) + { ++ int ret; + struct regulator_dev *rdev = regulator->rdev; + + lockdep_assert_held_once(&rdev->mutex.base); + + regulator->enable_count++; +- if (regulator->uA_load && regulator->enable_count == 1) +- return drms_uA_update(rdev); ++ if (regulator->uA_load && regulator->enable_count == 1) { ++ ret = drms_uA_update(rdev); ++ if (ret) ++ regulator->enable_count--; ++ return ret; ++ } + + return 0; + } +diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c +index 750dd1e9f2cc7..2ddc431cbd337 100644 +--- a/drivers/scsi/lpfc/lpfc_init.c ++++ b/drivers/scsi/lpfc/lpfc_init.c +@@ -8061,7 +8061,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) + /* Allocate device driver memory */ + rc = lpfc_mem_alloc(phba, SGL_ALIGN_SZ); + if (rc) +- return -ENOMEM; ++ goto out_destroy_workqueue; + + /* IF Type 2 ports get initialized now. 
*/ + if (bf_get(lpfc_sli_intf_if_type, &phba->sli4_hba.sli_intf) >= +@@ -8489,6 +8489,9 @@ out_free_bsmbx: + lpfc_destroy_bootstrap_mbox(phba); + out_free_mem: + lpfc_mem_free(phba); ++out_destroy_workqueue: ++ destroy_workqueue(phba->wq); ++ phba->wq = NULL; + return rc; + } + +diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c +index 5b5885d9732b6..3e9b2b0099c7a 100644 +--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c ++++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c +@@ -5311,7 +5311,6 @@ megasas_alloc_fusion_context(struct megasas_instance *instance) + if (!fusion->log_to_span) { + dev_err(&instance->pdev->dev, "Failed from %s %d\n", + __func__, __LINE__); +- kfree(instance->ctrl_context); + return -ENOMEM; + } + } +diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c +index 5e8887fa02c8a..e3b7ebf464244 100644 +--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c ++++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c +@@ -3670,6 +3670,7 @@ static struct fw_event_work *dequeue_next_fw_event(struct MPT3SAS_ADAPTER *ioc) + fw_event = list_first_entry(&ioc->fw_event_list, + struct fw_event_work, list); + list_del_init(&fw_event->list); ++ fw_event_work_put(fw_event); + } + spin_unlock_irqrestore(&ioc->fw_event_lock, flags); + +@@ -3751,7 +3752,6 @@ _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc) + if (cancel_work_sync(&fw_event->work)) + fw_event_work_put(fw_event); + +- fw_event_work_put(fw_event); + } + ioc->fw_events_cleanup = 0; + } +diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c +index 2b2f682883752..62666df1a59eb 100644 +--- a/drivers/scsi/qla2xxx/qla_target.c ++++ b/drivers/scsi/qla2xxx/qla_target.c +@@ -6935,14 +6935,8 @@ qlt_24xx_config_rings(struct scsi_qla_host *vha) + + if (ha->flags.msix_enabled) { + if (IS_QLA83XX(ha) || IS_QLA27XX(ha) || IS_QLA28XX(ha)) { +- if (IS_QLA2071(ha)) { +- /* 4 ports Baker: Enable Interrupt Handshake */ +- icb->msix_atio = 0; +- icb->firmware_options_2 |= cpu_to_le32(BIT_26); +- } else { +- icb->msix_atio = cpu_to_le16(msix->entry); +- icb->firmware_options_2 &= cpu_to_le32(~BIT_26); +- } ++ icb->msix_atio = cpu_to_le16(msix->entry); ++ icb->firmware_options_2 &= cpu_to_le32(~BIT_26); + ql_dbg(ql_dbg_init, vha, 0xf072, + "Registering ICB vector 0x%x for atio que.\n", + msix->entry); +diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c +index 78edb1ea4748d..f5c876d03c1ad 100644 +--- a/drivers/scsi/scsi_lib.c ++++ b/drivers/scsi/scsi_lib.c +@@ -118,7 +118,7 @@ scsi_set_blocked(struct scsi_cmnd *cmd, int reason) + } + } + +-static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd) ++static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd, unsigned long msecs) + { + struct request *rq = scsi_cmd_to_rq(cmd); + +@@ -128,7 +128,12 @@ static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd) + } else { + WARN_ON_ONCE(true); + } +- blk_mq_requeue_request(rq, true); ++ ++ if (msecs) { ++ blk_mq_requeue_request(rq, false); ++ blk_mq_delay_kick_requeue_list(rq->q, msecs); ++ } else ++ blk_mq_requeue_request(rq, true); + } + + /** +@@ -658,14 +663,6 @@ static unsigned int scsi_rq_err_bytes(const struct request *rq) + return bytes; + } + +-/* Helper for scsi_io_completion() when "reprep" action required. */ +-static void scsi_io_completion_reprep(struct scsi_cmnd *cmd, +- struct request_queue *q) +-{ +- /* A new command will be prepared and issued. 
*/ +- scsi_mq_requeue_cmd(cmd); +-} +- + static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd) + { + struct request *req = scsi_cmd_to_rq(cmd); +@@ -683,14 +680,21 @@ static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd) + return false; + } + ++/* ++ * When ALUA transition state is returned, reprep the cmd to ++ * use the ALUA handler's transition timeout. Delay the reprep ++ * 1 sec to avoid aggressive retries of the target in that ++ * state. ++ */ ++#define ALUA_TRANSITION_REPREP_DELAY 1000 ++ + /* Helper for scsi_io_completion() when special action required. */ + static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) + { +- struct request_queue *q = cmd->device->request_queue; + struct request *req = scsi_cmd_to_rq(cmd); + int level = 0; +- enum {ACTION_FAIL, ACTION_REPREP, ACTION_RETRY, +- ACTION_DELAYED_RETRY} action; ++ enum {ACTION_FAIL, ACTION_REPREP, ACTION_DELAYED_REPREP, ++ ACTION_RETRY, ACTION_DELAYED_RETRY} action; + struct scsi_sense_hdr sshdr; + bool sense_valid; + bool sense_current = true; /* false implies "deferred sense" */ +@@ -779,8 +783,8 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) + action = ACTION_DELAYED_RETRY; + break; + case 0x0a: /* ALUA state transition */ +- blk_stat = BLK_STS_TRANSPORT; +- fallthrough; ++ action = ACTION_DELAYED_REPREP; ++ break; + default: + action = ACTION_FAIL; + break; +@@ -839,7 +843,10 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) + return; + fallthrough; + case ACTION_REPREP: +- scsi_io_completion_reprep(cmd, q); ++ scsi_mq_requeue_cmd(cmd, 0); ++ break; ++ case ACTION_DELAYED_REPREP: ++ scsi_mq_requeue_cmd(cmd, ALUA_TRANSITION_REPREP_DELAY); + break; + case ACTION_RETRY: + /* Retry the same command immediately */ +@@ -933,7 +940,7 @@ static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result, + * command block will be released and the queue function will be goosed. If we + * are not done then we have to figure out what to do next: + * +- * a) We can call scsi_io_completion_reprep(). The request will be ++ * a) We can call scsi_mq_requeue_cmd(). The request will be + * unprepared and put back on the queue. Then a new command will + * be created for it. This should be used if we made forward + * progress, or if we want to switch from READ(10) to READ(6) for +@@ -949,7 +956,6 @@ static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result, + void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) + { + int result = cmd->result; +- struct request_queue *q = cmd->device->request_queue; + struct request *req = scsi_cmd_to_rq(cmd); + blk_status_t blk_stat = BLK_STS_OK; + +@@ -986,7 +992,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) + * request just queue the command up again. 
+ */ + if (likely(result == 0)) +- scsi_io_completion_reprep(cmd, q); ++ scsi_mq_requeue_cmd(cmd, 0); + else + scsi_io_completion_action(cmd, result); + } +diff --git a/drivers/soc/bcm/brcmstb/pm/pm-arm.c b/drivers/soc/bcm/brcmstb/pm/pm-arm.c +index 70ad0f3dce283..286f5d57c0cab 100644 +--- a/drivers/soc/bcm/brcmstb/pm/pm-arm.c ++++ b/drivers/soc/bcm/brcmstb/pm/pm-arm.c +@@ -684,13 +684,14 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + const struct of_device_id *of_id = NULL; + struct device_node *dn; + void __iomem *base; +- int ret, i; ++ int ret, i, s; + + /* AON ctrl registers */ + base = brcmstb_ioremap_match(aon_ctrl_dt_ids, 0, NULL); + if (IS_ERR(base)) { + pr_err("error mapping AON_CTRL\n"); +- return PTR_ERR(base); ++ ret = PTR_ERR(base); ++ goto aon_err; + } + ctrl.aon_ctrl_base = base; + +@@ -700,8 +701,10 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + /* Assume standard offset */ + ctrl.aon_sram = ctrl.aon_ctrl_base + + AON_CTRL_SYSTEM_DATA_RAM_OFS; ++ s = 0; + } else { + ctrl.aon_sram = base; ++ s = 1; + } + + writel_relaxed(0, ctrl.aon_sram + AON_REG_PANIC); +@@ -711,7 +714,8 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + (const void **)&ddr_phy_data); + if (IS_ERR(base)) { + pr_err("error mapping DDR PHY\n"); +- return PTR_ERR(base); ++ ret = PTR_ERR(base); ++ goto ddr_phy_err; + } + ctrl.support_warm_boot = ddr_phy_data->supports_warm_boot; + ctrl.pll_status_offset = ddr_phy_data->pll_status_offset; +@@ -731,17 +735,20 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + for_each_matching_node(dn, ddr_shimphy_dt_ids) { + i = ctrl.num_memc; + if (i >= MAX_NUM_MEMC) { ++ of_node_put(dn); + pr_warn("too many MEMCs (max %d)\n", MAX_NUM_MEMC); + break; + } + + base = of_io_request_and_map(dn, 0, dn->full_name); + if (IS_ERR(base)) { ++ of_node_put(dn); + if (!ctrl.support_warm_boot) + break; + + pr_err("error mapping DDR SHIMPHY %d\n", i); +- return PTR_ERR(base); ++ ret = PTR_ERR(base); ++ goto ddr_shimphy_err; + } + ctrl.memcs[i].ddr_shimphy_base = base; + ctrl.num_memc++; +@@ -752,14 +759,18 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + for_each_matching_node(dn, brcmstb_memc_of_match) { + base = of_iomap(dn, 0); + if (!base) { ++ of_node_put(dn); + pr_err("error mapping DDR Sequencer %d\n", i); +- return -ENOMEM; ++ ret = -ENOMEM; ++ goto brcmstb_memc_err; + } + + of_id = of_match_node(brcmstb_memc_of_match, dn); + if (!of_id) { + iounmap(base); +- return -EINVAL; ++ of_node_put(dn); ++ ret = -EINVAL; ++ goto brcmstb_memc_err; + } + + ddr_seq_data = of_id->data; +@@ -779,21 +790,24 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + dn = of_find_matching_node(NULL, sram_dt_ids); + if (!dn) { + pr_err("SRAM not found\n"); +- return -EINVAL; ++ ret = -EINVAL; ++ goto brcmstb_memc_err; + } + + ret = brcmstb_init_sram(dn); + of_node_put(dn); + if (ret) { + pr_err("error setting up SRAM for PM\n"); +- return ret; ++ goto brcmstb_memc_err; + } + + ctrl.pdev = pdev; + + ctrl.s3_params = kmalloc(sizeof(*ctrl.s3_params), GFP_KERNEL); +- if (!ctrl.s3_params) +- return -ENOMEM; ++ if (!ctrl.s3_params) { ++ ret = -ENOMEM; ++ goto s3_params_err; ++ } + ctrl.s3_params_pa = dma_map_single(&pdev->dev, ctrl.s3_params, + sizeof(*ctrl.s3_params), + DMA_TO_DEVICE); +@@ -813,7 +827,21 @@ static int brcmstb_pm_probe(struct platform_device *pdev) + + out: + kfree(ctrl.s3_params); +- ++s3_params_err: ++ iounmap(ctrl.boot_sram); ++brcmstb_memc_err: ++ for (i--; i >= 0; i--) ++ iounmap(ctrl.memcs[i].ddr_ctrl); 
++ddr_shimphy_err: ++ for (i = 0; i < ctrl.num_memc; i++) ++ iounmap(ctrl.memcs[i].ddr_shimphy_base); ++ ++ iounmap(ctrl.memcs[0].ddr_phy_base); ++ddr_phy_err: ++ iounmap(ctrl.aon_ctrl_base); ++ if (s) ++ iounmap(ctrl.aon_sram); ++aon_err: + pr_warn("PM: initialization failed with code %d\n", ret); + + return ret; +diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig +index 07d52cafbb313..fcec6ed83d5e2 100644 +--- a/drivers/soc/fsl/Kconfig ++++ b/drivers/soc/fsl/Kconfig +@@ -24,6 +24,7 @@ config FSL_MC_DPIO + tristate "QorIQ DPAA2 DPIO driver" + depends on FSL_MC_BUS + select SOC_BUS ++ select FSL_GUTS + select DIMLIB + help + Driver for the DPAA2 DPIO object. A DPIO provides queue and +diff --git a/drivers/soc/imx/gpcv2.c b/drivers/soc/imx/gpcv2.c +index 85aa86e1338af..5a3809f6a698f 100644 +--- a/drivers/soc/imx/gpcv2.c ++++ b/drivers/soc/imx/gpcv2.c +@@ -333,6 +333,8 @@ static int imx_pgc_power_up(struct generic_pm_domain *genpd) + } + } + ++ reset_control_assert(domain->reset); ++ + /* Enable reset clocks for all devices in the domain */ + ret = clk_bulk_prepare_enable(domain->num_clks, domain->clks); + if (ret) { +@@ -340,7 +342,8 @@ static int imx_pgc_power_up(struct generic_pm_domain *genpd) + goto out_regulator_disable; + } + +- reset_control_assert(domain->reset); ++ /* delays for reset to propagate */ ++ udelay(5); + + if (domain->bits.pxx) { + /* request the domain to power up */ +diff --git a/drivers/soc/imx/imx8m-blk-ctrl.c b/drivers/soc/imx/imx8m-blk-ctrl.c +index 7ebc28709e945..2782a7e0a8719 100644 +--- a/drivers/soc/imx/imx8m-blk-ctrl.c ++++ b/drivers/soc/imx/imx8m-blk-ctrl.c +@@ -242,7 +242,6 @@ static int imx8m_blk_ctrl_probe(struct platform_device *pdev) + ret = PTR_ERR(domain->power_dev); + goto cleanup_pds; + } +- dev_set_name(domain->power_dev, "%s", data->name); + + domain->genpd.name = data->name; + domain->genpd.power_on = imx8m_blk_ctrl_power_on; +diff --git a/drivers/spi/spi-bitbang-txrx.h b/drivers/spi/spi-bitbang-txrx.h +index 267342dfa7388..2dcbe166df63e 100644 +--- a/drivers/spi/spi-bitbang-txrx.h ++++ b/drivers/spi/spi-bitbang-txrx.h +@@ -116,6 +116,7 @@ bitbang_txrx_le_cpha0(struct spi_device *spi, + { + /* if (cpol == 0) this is SPI_MODE_0; else this is SPI_MODE_2 */ + ++ u8 rxbit = bits - 1; + u32 oldbit = !(word & 1); + /* clock starts at inactive polarity */ + for (; likely(bits); bits--) { +@@ -135,7 +136,7 @@ bitbang_txrx_le_cpha0(struct spi_device *spi, + /* sample LSB (from slave) on leading edge */ + word >>= 1; + if ((flags & SPI_MASTER_NO_RX) == 0) +- word |= getmiso(spi) << (bits - 1); ++ word |= getmiso(spi) << rxbit; + setsck(spi, cpol); + } + return word; +@@ -148,6 +149,7 @@ bitbang_txrx_le_cpha1(struct spi_device *spi, + { + /* if (cpol == 0) this is SPI_MODE_1; else this is SPI_MODE_3 */ + ++ u8 rxbit = bits - 1; + u32 oldbit = !(word & 1); + /* clock starts at inactive polarity */ + for (; likely(bits); bits--) { +@@ -168,7 +170,7 @@ bitbang_txrx_le_cpha1(struct spi_device *spi, + /* sample LSB (from slave) on trailing edge */ + word >>= 1; + if ((flags & SPI_MASTER_NO_RX) == 0) +- word |= getmiso(spi) << (bits - 1); ++ word |= getmiso(spi) << rxbit; + } + return word; + } +diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c +index 1175f3a46859f..27295bda3e0bd 100644 +--- a/drivers/tee/tee_shm.c ++++ b/drivers/tee/tee_shm.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include "tee_private.h" + +diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c 
b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +index 80d4e0676083a..365489bf4b8c1 100644 +--- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c ++++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +@@ -527,7 +527,7 @@ static void int3400_setup_gddv(struct int3400_thermal_priv *priv) + priv->data_vault = kmemdup(obj->package.elements[0].buffer.pointer, + obj->package.elements[0].buffer.length, + GFP_KERNEL); +- if (!priv->data_vault) ++ if (ZERO_OR_NULL_PTR(priv->data_vault)) + goto out_free; + + bin_attr_data_vault.private = priv->data_vault; +@@ -597,7 +597,7 @@ static int int3400_thermal_probe(struct platform_device *pdev) + goto free_imok; + } + +- if (priv->data_vault) { ++ if (!ZERO_OR_NULL_PTR(priv->data_vault)) { + result = sysfs_create_group(&pdev->dev.kobj, + &data_attribute_group); + if (result) +@@ -615,7 +615,8 @@ static int int3400_thermal_probe(struct platform_device *pdev) + free_sysfs: + cleanup_odvp(priv); + if (priv->data_vault) { +- sysfs_remove_group(&pdev->dev.kobj, &data_attribute_group); ++ if (!ZERO_OR_NULL_PTR(priv->data_vault)) ++ sysfs_remove_group(&pdev->dev.kobj, &data_attribute_group); + kfree(priv->data_vault); + } + free_uuid: +@@ -647,7 +648,7 @@ static int int3400_thermal_remove(struct platform_device *pdev) + if (!priv->rel_misc_dev_res) + acpi_thermal_rel_misc_device_remove(priv->adev->handle); + +- if (priv->data_vault) ++ if (!ZERO_OR_NULL_PTR(priv->data_vault)) + sysfs_remove_group(&pdev->dev.kobj, &data_attribute_group); + sysfs_remove_group(&pdev->dev.kobj, &uuid_attribute_group); + sysfs_remove_group(&pdev->dev.kobj, &imok_attribute_group); +diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c +index a51ca56a0ebe7..829da9cb14a86 100644 +--- a/drivers/ufs/core/ufshcd.c ++++ b/drivers/ufs/core/ufshcd.c +@@ -8723,6 +8723,8 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, + struct scsi_device *sdp; + unsigned long flags; + int ret, retries; ++ unsigned long deadline; ++ int32_t remaining; + + spin_lock_irqsave(hba->host->host_lock, flags); + sdp = hba->ufs_device_wlun; +@@ -8755,9 +8757,14 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, + * callbacks hence set the RQF_PM flag so that it doesn't resume the + * already suspended childs. + */ ++ deadline = jiffies + 10 * HZ; + for (retries = 3; retries > 0; --retries) { ++ ret = -ETIMEDOUT; ++ remaining = deadline - jiffies; ++ if (remaining <= 0) ++ break; + ret = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, +- START_STOP_TIMEOUT, 0, 0, RQF_PM, NULL); ++ remaining / HZ, 0, 0, RQF_PM, NULL); + if (!scsi_status_is_check_condition(ret) || + !scsi_sense_valid(&sshdr) || + sshdr.sense_key != UNIT_ATTENTION) +diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c +index c13b9290e3575..d0057d18d2f4a 100644 +--- a/drivers/vfio/vfio_iommu_type1.c ++++ b/drivers/vfio/vfio_iommu_type1.c +@@ -557,6 +557,18 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, + ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, + pages, NULL, NULL); + if (ret > 0) { ++ int i; ++ ++ /* ++ * The zero page is always resident, we don't need to pin it ++ * and it falls into our invalid/reserved test so we don't ++ * unpin in put_pfn(). Unpin all zero pages in the batch here. 
++ */ ++ for (i = 0 ; i < ret; i++) { ++ if (unlikely(is_zero_pfn(page_to_pfn(pages[i])))) ++ unpin_user_page(pages[i]); ++ } ++ + *pfn = page_to_pfn(pages[0]); + goto done; + } +diff --git a/drivers/video/fbdev/chipsfb.c b/drivers/video/fbdev/chipsfb.c +index 393894af26f84..2b00a9d554fc0 100644 +--- a/drivers/video/fbdev/chipsfb.c ++++ b/drivers/video/fbdev/chipsfb.c +@@ -430,6 +430,7 @@ static int chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent) + err_release_fb: + framebuffer_release(p); + err_disable: ++ pci_disable_device(dp); + err_out: + return rc; + } +diff --git a/drivers/video/fbdev/core/fbsysfs.c b/drivers/video/fbdev/core/fbsysfs.c +index c2a60b187467e..4d7f63892dcc4 100644 +--- a/drivers/video/fbdev/core/fbsysfs.c ++++ b/drivers/video/fbdev/core/fbsysfs.c +@@ -84,6 +84,10 @@ void framebuffer_release(struct fb_info *info) + if (WARN_ON(refcount_read(&info->count))) + return; + ++#if IS_ENABLED(CONFIG_FB_BACKLIGHT) ++ mutex_destroy(&info->bl_curve_mutex); ++#endif ++ + kfree(info->apertures); + kfree(info); + } +diff --git a/drivers/video/fbdev/omap/omapfb_main.c b/drivers/video/fbdev/omap/omapfb_main.c +index 292fcb0a24fc9..6ff237cee7f87 100644 +--- a/drivers/video/fbdev/omap/omapfb_main.c ++++ b/drivers/video/fbdev/omap/omapfb_main.c +@@ -1643,14 +1643,14 @@ static int omapfb_do_probe(struct platform_device *pdev, + goto cleanup; + } + fbdev->int_irq = platform_get_irq(pdev, 0); +- if (!fbdev->int_irq) { ++ if (fbdev->int_irq < 0) { + dev_err(&pdev->dev, "unable to get irq\n"); + r = ENXIO; + goto cleanup; + } + + fbdev->ext_irq = platform_get_irq(pdev, 1); +- if (!fbdev->ext_irq) { ++ if (fbdev->ext_irq < 0) { + dev_err(&pdev->dev, "unable to get irq\n"); + r = ENXIO; + goto cleanup; +diff --git a/fs/afs/flock.c b/fs/afs/flock.c +index c4210a3964d8b..bbcc5afd15760 100644 +--- a/fs/afs/flock.c ++++ b/fs/afs/flock.c +@@ -76,7 +76,7 @@ void afs_lock_op_done(struct afs_call *call) + if (call->error == 0) { + spin_lock(&vnode->lock); + trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0); +- vnode->locked_at = call->reply_time; ++ vnode->locked_at = call->issue_time; + afs_schedule_lock_extension(vnode); + spin_unlock(&vnode->lock); + } +diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c +index 4943413d9c5f7..7d37f63ef0f09 100644 +--- a/fs/afs/fsclient.c ++++ b/fs/afs/fsclient.c +@@ -131,7 +131,7 @@ bad: + + static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry) + { +- return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry; ++ return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry; + } + + static void xdr_decode_AFSCallBack(const __be32 **_bp, +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index a6f25d9e75b52..28bdd0387e5ea 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -137,7 +137,6 @@ struct afs_call { + bool need_attention; /* T if RxRPC poked us */ + bool async; /* T if asynchronous */ + bool upgrade; /* T to request service upgrade */ +- bool have_reply_time; /* T if have got reply_time */ + bool intr; /* T if interruptible */ + bool unmarshalling_error; /* T if an unmarshalling error occurred */ + u16 service_id; /* Actual service ID (after upgrade) */ +@@ -151,7 +150,7 @@ struct afs_call { + } __attribute__((packed)); + __be64 tmp64; + }; +- ktime_t reply_time; /* Time of first reply packet */ ++ ktime_t issue_time; /* Time of issue of operation */ + }; + + struct afs_call_type { +diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c +index a5434f3e57c68..e3de7fea36435 100644 +--- a/fs/afs/rxrpc.c ++++ 
b/fs/afs/rxrpc.c +@@ -347,6 +347,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + if (call->max_lifespan) + rxrpc_kernel_set_max_life(call->net->socket, rxcall, + call->max_lifespan); ++ call->issue_time = ktime_get_real(); + + /* send the request */ + iov[0].iov_base = call->request; +@@ -497,12 +498,6 @@ static void afs_deliver_to_call(struct afs_call *call) + return; + } + +- if (!call->have_reply_time && +- rxrpc_kernel_get_reply_time(call->net->socket, +- call->rxcall, +- &call->reply_time)) +- call->have_reply_time = true; +- + ret = call->type->deliver(call); + state = READ_ONCE(call->state); + if (ret == 0 && call->unmarshalling_error) +diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c +index fdc7d675b4b0c..11571cca86c19 100644 +--- a/fs/afs/yfsclient.c ++++ b/fs/afs/yfsclient.c +@@ -232,8 +232,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp, + struct afs_callback *cb = &scb->callback; + ktime_t cb_expiry; + +- cb_expiry = call->reply_time; +- cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100); ++ cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100); + cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC); + scb->have_cb = true; + *_bp += xdr_size(x); +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 4d8acd7e63eb5..1bbc810574f22 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1065,8 +1065,6 @@ struct btrfs_fs_info { + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; +- /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ +- wait_queue_head_t zone_finish_wait; + + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index a2505cfc6bc10..781952c5a5c23 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3173,7 +3173,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) + init_waitqueue_head(&fs_info->transaction_blocked_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->delayed_iputs_wait); +- init_waitqueue_head(&fs_info->zone_finish_wait); + + /* Usable values until the real ones are cached from the superblock */ + fs_info->nodesize = 4096; +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 61496ecb1e201..f79f8d7cffcf2 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1643,10 +1643,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + done_offset = end; + + if (done_offset == start) { +- struct btrfs_fs_info *info = inode->root->fs_info; +- +- wait_var_event(&info->zone_finish_wait, +- !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); ++ wait_on_bit_io(&inode->root->fs_info->flags, ++ BTRFS_FS_NEED_ZONE_FINISH, ++ TASK_UNINTERRUPTIBLE); + continue; + } + +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index b0c5b4738b1f7..17623e6410c5d 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) +- return SZ_1G; ++ return BTRFS_MAX_DATA_CHUNK_SIZE; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 3460fd6743807..16e01fbdcec83 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -5266,6 +5266,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, + ctl->stripe_size); + } + ++ /* Stripe size 
should not go beyond 1G. */ ++ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); ++ + /* Align to BTRFS_STRIPE_LEN */ + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); + ctl->chunk_size = ctl->stripe_size * data_stripes; +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 31cb11daa8e82..1386362fad3b8 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. ++ * ++ * With the zoned emulation, we can have non-zoned device on the zoned ++ * mode. In this case, we don't have a valid max zone append size. So, ++ * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. + */ +- zone_info->max_zone_append_size = +- min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, +- (u64)bdev_max_segments(bdev) << PAGE_SHIFT); ++ if (bdev_is_zoned(bdev)) { ++ zone_info->max_zone_append_size = min_t(u64, ++ (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, ++ (u64)bdev_max_segments(bdev) << PAGE_SHIFT); ++ } else { ++ zone_info->max_zone_append_size = ++ (u64)bdev_max_segments(bdev) << PAGE_SHIFT; ++ } + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + +@@ -1178,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) + * offset. + */ + static int calculate_alloc_pointer(struct btrfs_block_group *cache, +- u64 *offset_ret) ++ u64 *offset_ret, bool new) + { + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_root *root; +@@ -1188,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, + int ret; + u64 length; + ++ /* ++ * Avoid tree lookups for a new block group, there's no use for it. ++ * It must always be 0. ++ * ++ * Also, we have a lock chain of extent buffer lock -> chunk mutex. ++ * For new a block group, this function is called from ++ * btrfs_make_block_group() which is already taking the chunk mutex. ++ * Thus, we cannot call calculate_alloc_pointer() which takes extent ++ * buffer locks to avoid deadlock. ++ */ ++ if (new) { ++ *offset_ret = 0; ++ return 0; ++ } ++ + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +@@ -1323,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) + else + num_conventional++; + ++ /* ++ * Consider a zone as active if we can allow any number of ++ * active zones. ++ */ ++ if (!device->zone_info->max_active_zones) ++ __set_bit(i, active); ++ + if (!is_sequential) { + alloc_offsets[i] = WP_CONVENTIONAL; + continue; +@@ -1389,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) + __set_bit(i, active); + break; + } +- +- /* +- * Consider a zone as active if we can allow any number of +- * active zones. +- */ +- if (!device->zone_info->max_active_zones) +- __set_bit(i, active); + } + + if (num_sequential > 0) + cache->seq_zone = true; + + if (num_conventional > 0) { +- /* +- * Avoid calling calculate_alloc_pointer() for new BG. It +- * is no use for new BG. It must be always 0. +- * +- * Also, we have a lock chain of extent buffer lock -> +- * chunk mutex. For new BG, this function is called from +- * btrfs_make_block_group() which is already taking the +- * chunk mutex. 
Thus, we cannot call +- * calculate_alloc_pointer() which takes extent buffer +- * locks to avoid deadlock. +- */ +- + /* Zone capacity is always zone size in emulation */ + cache->zone_capacity = cache->length; +- if (new) { +- cache->alloc_offset = 0; +- goto out; +- } +- ret = calculate_alloc_pointer(cache, &last_alloc); +- if (ret || map->num_stripes == num_conventional) { +- if (!ret) +- cache->alloc_offset = last_alloc; +- else +- btrfs_err(fs_info, ++ ret = calculate_alloc_pointer(cache, &last_alloc, new); ++ if (ret) { ++ btrfs_err(fs_info, + "zoned: failed to determine allocation offset of bg %llu", +- cache->start); ++ cache->start); ++ goto out; ++ } else if (map->num_stripes == num_conventional) { ++ cache->alloc_offset = last_alloc; ++ cache->zone_is_active = 1; + goto out; + } + } +@@ -1495,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) + goto out; + } + +- if (cache->zone_is_active) { +- btrfs_get_block_group(cache); +- spin_lock(&fs_info->zone_active_bgs_lock); +- list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); +- spin_unlock(&fs_info->zone_active_bgs_lock); +- } +- + out: + if (cache->alloc_offset > fs_info->zone_size) { + btrfs_err(fs_info, +@@ -1526,10 +1528,16 @@ out: + ret = -EIO; + } + +- if (!ret) ++ if (!ret) { + cache->meta_write_pointer = cache->alloc_offset + cache->start; +- +- if (ret) { ++ if (cache->zone_is_active) { ++ btrfs_get_block_group(cache); ++ spin_lock(&fs_info->zone_active_bgs_lock); ++ list_add_tail(&cache->active_bg_list, ++ &fs_info->zone_active_bgs); ++ spin_unlock(&fs_info->zone_active_bgs_lock); ++ } ++ } else { + kfree(cache->physical_map); + cache->physical_map = NULL; + } +@@ -2007,8 +2015,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ + /* For active_bg_list */ + btrfs_put_block_group(block_group); + +- clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); +- wake_up_all(&fs_info->zone_finish_wait); ++ clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + + return 0; + } +diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c +index f5dcc4940b6da..9dfd2dd612c25 100644 +--- a/fs/cifs/smb2file.c ++++ b/fs/cifs/smb2file.c +@@ -61,7 +61,6 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, + nr_ioctl_req.Reserved = 0; + rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, + fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, +- true /* is_fsctl */, + (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), + CIFSMaxBufSize, NULL, NULL /* no return info */); + if (rc == -EOPNOTSUPP) { +diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c +index 3898ec2632dc4..e8a8daa82ed76 100644 +--- a/fs/cifs/smb2ops.c ++++ b/fs/cifs/smb2ops.c +@@ -680,7 +680,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) + struct cifs_ses *ses = tcon->ses; + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, +- FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, ++ FSCTL_QUERY_NETWORK_INTERFACE_INFO, + NULL /* no data input */, 0 /* no data input */, + CIFSMaxBufSize, (char **)&out_buf, &ret_data_len); + if (rc == -EOPNOTSUPP) { +@@ -1609,9 +1609,8 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, + struct resume_key_req *res_key; + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, +- FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, +- NULL, 0 /* no input */, CIFSMaxBufSize, +- (char **)&res_key, &ret_data_len); ++ FSCTL_SRV_REQUEST_RESUME_KEY, NULL, 0 /* no input */, ++ 
CIFSMaxBufSize, (char **)&res_key, &ret_data_len); + + if (rc == -EOPNOTSUPP) { + pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); +@@ -1753,7 +1752,7 @@ smb2_ioctl_query_info(const unsigned int xid, + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, +- qi.info_type, true, buffer, qi.output_buffer_length, ++ qi.info_type, buffer, qi.output_buffer_length, + CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); + free_req1_func = SMB2_ioctl_free; +@@ -1929,9 +1928,8 @@ smb2_copychunk_range(const unsigned int xid, + retbuf = NULL; + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, +- true /* is_fsctl */, (char *)pcchunk, +- sizeof(struct copychunk_ioctl), CIFSMaxBufSize, +- (char **)&retbuf, &ret_data_len); ++ (char *)pcchunk, sizeof(struct copychunk_ioctl), ++ CIFSMaxBufSize, (char **)&retbuf, &ret_data_len); + if (rc == 0) { + if (ret_data_len != + sizeof(struct copychunk_ioctl_rsp)) { +@@ -2091,7 +2089,6 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_SPARSE, +- true /* is_fctl */, + &setsparse, 1, CIFSMaxBufSize, NULL, NULL); + if (rc) { + tcon->broken_sparse_sup = true; +@@ -2174,7 +2171,6 @@ smb2_duplicate_extents(const unsigned int xid, + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, + FSCTL_DUPLICATE_EXTENTS_TO_FILE, +- true /* is_fsctl */, + (char *)&dup_ext_buf, + sizeof(struct duplicate_extents_to_file), + CIFSMaxBufSize, NULL, +@@ -2209,7 +2205,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, + return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SET_INTEGRITY_INFORMATION, +- true /* is_fsctl */, + (char *)&integr_info, + sizeof(struct fsctl_set_integrity_information_req), + CIFSMaxBufSize, NULL, +@@ -2262,7 +2257,6 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SRV_ENUMERATE_SNAPSHOTS, +- true /* is_fsctl */, + NULL, 0 /* no input data */, max_response_size, + (char **)&retbuf, + &ret_data_len); +@@ -2982,7 +2976,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, + do { + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_DFS_GET_REFERRALS, +- true /* is_fsctl */, + (char *)dfs_req, dfs_req_size, CIFSMaxBufSize, + (char **)&dfs_rsp, &dfs_rsp_size); + if (!is_retryable_error(rc)) +@@ -3189,8 +3182,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + + rc = SMB2_ioctl_init(tcon, server, + &rqst[1], fid.persistent_fid, +- fid.volatile_fid, FSCTL_GET_REPARSE_POINT, +- true /* is_fctl */, NULL, 0, ++ fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); +@@ -3370,8 +3362,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, + + rc = SMB2_ioctl_init(tcon, server, + &rqst[1], COMPOUND_FID, +- COMPOUND_FID, FSCTL_GET_REPARSE_POINT, +- true /* is_fctl */, NULL, 0, ++ COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); +@@ -3599,26 +3590,43 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, + return pntsd; + } + ++static long 
smb3_zero_data(struct file *file, struct cifs_tcon *tcon, ++ loff_t offset, loff_t len, unsigned int xid) ++{ ++ struct cifsFileInfo *cfile = file->private_data; ++ struct file_zero_data_information fsctl_buf; ++ ++ cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len); ++ ++ fsctl_buf.FileOffset = cpu_to_le64(offset); ++ fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); ++ ++ return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, ++ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, ++ (char *)&fsctl_buf, ++ sizeof(struct file_zero_data_information), ++ 0, NULL, NULL); ++} ++ + static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len, bool keep_size) + { + struct cifs_ses *ses = tcon->ses; +- struct inode *inode; +- struct cifsInodeInfo *cifsi; ++ struct inode *inode = file_inode(file); ++ struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifsFileInfo *cfile = file->private_data; +- struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + __le64 eof; + + xid = get_xid(); + +- inode = d_inode(cfile->dentry); +- cifsi = CIFS_I(inode); +- + trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); + ++ inode_lock(inode); ++ filemap_invalidate_lock(inode->i_mapping); ++ + /* + * We zero the range through ioctl, so we need remove the page caches + * first, otherwise the data may be inconsistent with the server. +@@ -3626,26 +3634,12 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, + truncate_pagecache_range(inode, offset, offset + len - 1); + + /* if file not oplocked can't be sure whether asking to extend size */ +- if (!CIFS_CACHE_READ(cifsi)) +- if (keep_size == false) { +- rc = -EOPNOTSUPP; +- trace_smb3_zero_err(xid, cfile->fid.persistent_fid, +- tcon->tid, ses->Suid, offset, len, rc); +- free_xid(xid); +- return rc; +- } +- +- cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len); +- +- fsctl_buf.FileOffset = cpu_to_le64(offset); +- fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); ++ rc = -EOPNOTSUPP; ++ if (keep_size == false && !CIFS_CACHE_READ(cifsi)) ++ goto zero_range_exit; + +- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, +- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true, +- (char *)&fsctl_buf, +- sizeof(struct file_zero_data_information), +- 0, NULL, NULL); +- if (rc) ++ rc = smb3_zero_data(file, tcon, offset, len, xid); ++ if (rc < 0) + goto zero_range_exit; + + /* +@@ -3658,6 +3652,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, + } + + zero_range_exit: ++ filemap_invalidate_unlock(inode->i_mapping); ++ inode_unlock(inode); + free_xid(xid); + if (rc) + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, +@@ -3702,7 +3698,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, +- true /* is_fctl */, (char *)&fsctl_buf, ++ (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), + CIFSMaxBufSize, NULL, NULL); + filemap_invalidate_unlock(inode->i_mapping); +@@ -3764,7 +3760,7 @@ static int smb3_simple_fallocate_range(unsigned int xid, + in_data.length = cpu_to_le64(len); + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, +- FSCTL_QUERY_ALLOCATED_RANGES, true, ++ FSCTL_QUERY_ALLOCATED_RANGES, + (char *)&in_data, sizeof(in_data), + 1024 * sizeof(struct file_allocated_range_buffer), + (char **)&out_data, &out_data_len); +@@ -4085,7 +4081,7 @@ 
static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, +- FSCTL_QUERY_ALLOCATED_RANGES, true, ++ FSCTL_QUERY_ALLOCATED_RANGES, + (char *)&in_data, sizeof(in_data), + sizeof(struct file_allocated_range_buffer), + (char **)&out_data, &out_data_len); +@@ -4145,7 +4141,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, +- FSCTL_QUERY_ALLOCATED_RANGES, true, ++ FSCTL_QUERY_ALLOCATED_RANGES, + (char *)&in_data, sizeof(in_data), + 1024 * sizeof(struct file_allocated_range_buffer), + (char **)&out_data, &out_data_len); +diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c +index ba58d7fd54f9e..31d37afae741f 100644 +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -1174,7 +1174,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) + } + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, +- FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, ++ FSCTL_VALIDATE_NEGOTIATE_INFO, + (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize, + (char **)&pneg_rsp, &rsplen); + if (rc == -EOPNOTSUPP) { +@@ -3053,7 +3053,7 @@ int + SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, +- bool is_fsctl, char *in_data, u32 indatalen, ++ char *in_data, u32 indatalen, + __u32 max_response_size) + { + struct smb2_ioctl_req *req; +@@ -3128,10 +3128,8 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, + req->hdr.CreditCharge = + cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size), + SMB2_MAX_BUFFER_SIZE)); +- if (is_fsctl) +- req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); +- else +- req->Flags = 0; ++ /* always an FSCTL (for now) */ ++ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); + + /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ + if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) +@@ -3158,9 +3156,9 @@ SMB2_ioctl_free(struct smb_rqst *rqst) + */ + int + SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, +- u64 volatile_fid, u32 opcode, bool is_fsctl, +- char *in_data, u32 indatalen, u32 max_out_data_len, +- char **out_data, u32 *plen /* returned data len */) ++ u64 volatile_fid, u32 opcode, char *in_data, u32 indatalen, ++ u32 max_out_data_len, char **out_data, ++ u32 *plen /* returned data len */) + { + struct smb_rqst rqst; + struct smb2_ioctl_rsp *rsp = NULL; +@@ -3202,7 +3200,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + + rc = SMB2_ioctl_init(tcon, server, + &rqst, persistent_fid, volatile_fid, opcode, +- is_fsctl, in_data, indatalen, max_out_data_len); ++ in_data, indatalen, max_out_data_len); + if (rc) + goto ioctl_exit; + +@@ -3294,7 +3292,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + + rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, +- FSCTL_SET_COMPRESSION, true /* is_fsctl */, ++ FSCTL_SET_COMPRESSION, + (char *)&fsctl_input /* data input */, + 2 /* in data len */, CIFSMaxBufSize /* max out data */, + &ret_data /* out data */, NULL); +diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h +index a69f1eed1cfe5..d57d7202dc367 100644 +--- a/fs/cifs/smb2proto.h ++++ b/fs/cifs/smb2proto.h +@@ -147,13 +147,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, + extern void SMB2_open_free(struct smb_rqst 
*rqst); + extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 opcode, +- bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen, ++ char *in_data, u32 indatalen, u32 maxoutlen, + char **out_data, u32 *plen /* returned data len */); + extern int SMB2_ioctl_init(struct cifs_tcon *tcon, + struct TCP_Server_Info *server, + struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, +- bool is_fsctl, char *in_data, u32 indatalen, ++ char *in_data, u32 indatalen, + __u32 max_response_size); + extern void SMB2_ioctl_free(struct smb_rqst *rqst); + extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, +diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c +index 3dcf0b8b4e932..232cfdf095aeb 100644 +--- a/fs/debugfs/inode.c ++++ b/fs/debugfs/inode.c +@@ -744,6 +744,28 @@ void debugfs_remove(struct dentry *dentry) + } + EXPORT_SYMBOL_GPL(debugfs_remove); + ++/** ++ * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it ++ * @name: a pointer to a string containing the name of the item to look up. ++ * @parent: a pointer to the parent dentry of the item. ++ * ++ * This is the equlivant of doing something like ++ * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting ++ * handled for the directory being looked up. ++ */ ++void debugfs_lookup_and_remove(const char *name, struct dentry *parent) ++{ ++ struct dentry *dentry; ++ ++ dentry = debugfs_lookup(name, parent); ++ if (!dentry) ++ return; ++ ++ debugfs_remove(dentry); ++ dput(dentry); ++} ++EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); ++ + /** + * debugfs_rename - rename a file/directory in the debugfs filesystem + * @old_dir: a pointer to the parent dentry for the renamed object. This +diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c +index 8e01d89c3319e..b5fd9d71e67f1 100644 +--- a/fs/erofs/fscache.c ++++ b/fs/erofs/fscache.c +@@ -222,8 +222,10 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) + + rreq = erofs_fscache_alloc_request(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); +- if (IS_ERR(rreq)) ++ if (IS_ERR(rreq)) { ++ ret = PTR_ERR(rreq); + goto out; ++ } + + return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, mdev.m_pa); +@@ -301,8 +303,10 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) + + rreq = erofs_fscache_alloc_request(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); +- if (IS_ERR(rreq)) ++ if (IS_ERR(rreq)) { ++ ret = PTR_ERR(rreq); + goto out_unlock; ++ } + + pstart = mdev.m_pa + (pos - map.m_la); + return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, +diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h +index cfee49d33b95a..a01cc82795a25 100644 +--- a/fs/erofs/internal.h ++++ b/fs/erofs/internal.h +@@ -195,7 +195,6 @@ struct erofs_workgroup { + atomic_t refcount; + }; + +-#if defined(CONFIG_SMP) + static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, + int val) + { +@@ -224,34 +223,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) + return atomic_cond_read_relaxed(&grp->refcount, + VAL != EROFS_LOCKED_MAGIC); + } +-#else +-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, +- int val) +-{ +- preempt_disable(); +- /* no need to spin on UP platforms, let's just disable preemption. 
*/ +- if (val != atomic_read(&grp->refcount)) { +- preempt_enable(); +- return false; +- } +- return true; +-} +- +-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, +- int orig_val) +-{ +- preempt_enable(); +-} +- +-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) +-{ +- int v = atomic_read(&grp->refcount); +- +- /* workgroup is never freezed on uniprocessor systems */ +- DBG_BUGON(v == EROFS_LOCKED_MAGIC); +- return v; +-} +-#endif /* !CONFIG_SMP */ + #endif /* !CONFIG_EROFS_FS_ZIP */ + + /* we strictly follow PAGE_SIZE and no buffer head yet */ +diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c +index 81d26abf486fa..da85b39791957 100644 +--- a/fs/tracefs/inode.c ++++ b/fs/tracefs/inode.c +@@ -141,6 +141,8 @@ struct tracefs_mount_opts { + kuid_t uid; + kgid_t gid; + umode_t mode; ++ /* Opt_* bitfield. */ ++ unsigned int opts; + }; + + enum { +@@ -241,6 +243,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) + kgid_t gid; + char *p; + ++ opts->opts = 0; + opts->mode = TRACEFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { +@@ -275,24 +278,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) + * but traditionally tracefs has ignored all mount options + */ + } ++ ++ opts->opts |= BIT(token); + } + + return 0; + } + +-static int tracefs_apply_options(struct super_block *sb) ++static int tracefs_apply_options(struct super_block *sb, bool remount) + { + struct tracefs_fs_info *fsi = sb->s_fs_info; + struct inode *inode = d_inode(sb->s_root); + struct tracefs_mount_opts *opts = &fsi->mount_opts; + +- inode->i_mode &= ~S_IALLUGO; +- inode->i_mode |= opts->mode; ++ /* ++ * On remount, only reset mode/uid/gid if they were provided as mount ++ * options. ++ */ ++ ++ if (!remount || opts->opts & BIT(Opt_mode)) { ++ inode->i_mode &= ~S_IALLUGO; ++ inode->i_mode |= opts->mode; ++ } + +- inode->i_uid = opts->uid; ++ if (!remount || opts->opts & BIT(Opt_uid)) ++ inode->i_uid = opts->uid; + +- /* Set all the group ids to the mount option */ +- set_gid(sb->s_root, opts->gid); ++ if (!remount || opts->opts & BIT(Opt_gid)) { ++ /* Set all the group ids to the mount option */ ++ set_gid(sb->s_root, opts->gid); ++ } + + return 0; + } +@@ -307,7 +322,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data) + if (err) + goto fail; + +- tracefs_apply_options(sb); ++ tracefs_apply_options(sb, true); + + fail: + return err; +@@ -359,7 +374,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent) + + sb->s_op = &tracefs_super_operations; + +- tracefs_apply_options(sb); ++ tracefs_apply_options(sb, false); + + return 0; + +diff --git a/include/kunit/test.h b/include/kunit/test.h +index 8ffcd7de96070..648dbb00a3008 100644 +--- a/include/kunit/test.h ++++ b/include/kunit/test.h +@@ -863,7 +863,7 @@ do { \ + + #define KUNIT_EXPECT_LE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_INT_ASSERTION(test, \ +- KUNIT_ASSERTION, \ ++ KUNIT_EXPECTATION, \ + left, <=, right, \ + fmt, \ + ##__VA_ARGS__) +@@ -1153,7 +1153,7 @@ do { \ + + #define KUNIT_ASSERT_LT_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_INT_ASSERTION(test, \ +- KUNIT_EXPECTATION, \ ++ KUNIT_ASSERTION, \ + left, <, right, \ + fmt, \ + ##__VA_ARGS__) +@@ -1194,7 +1194,7 @@ do { \ + + #define KUNIT_ASSERT_GT_MSG(test, left, right, fmt, ...) 
\ + KUNIT_BINARY_INT_ASSERTION(test, \ +- KUNIT_EXPECTATION, \ ++ KUNIT_ASSERTION, \ + left, >, right, \ + fmt, \ + ##__VA_ARGS__) +diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h +index badcc0e3418f2..262664107b839 100644 +--- a/include/linux/buffer_head.h ++++ b/include/linux/buffer_head.h +@@ -136,6 +136,17 @@ BUFFER_FNS(Defer_Completion, defer_completion) + + static __always_inline void set_buffer_uptodate(struct buffer_head *bh) + { ++ /* ++ * If somebody else already set this uptodate, they will ++ * have done the memory barrier, and a reader will thus ++ * see *some* valid buffer state. ++ * ++ * Any other serialization (with IO errors or whatever that ++ * might clear the bit) has to come from other state (eg BH_Lock). ++ */ ++ if (test_bit(BH_Uptodate, &bh->b_state)) ++ return; ++ + /* + * make it consistent with folio_mark_uptodate + * pairs with smp_load_acquire in buffer_uptodate +diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h +index c869f1e73d755..f60674692d365 100644 +--- a/include/linux/debugfs.h ++++ b/include/linux/debugfs.h +@@ -91,6 +91,8 @@ struct dentry *debugfs_create_automount(const char *name, + void debugfs_remove(struct dentry *dentry); + #define debugfs_remove_recursive debugfs_remove + ++void debugfs_lookup_and_remove(const char *name, struct dentry *parent); ++ + const struct file_operations *debugfs_real_fops(const struct file *filp); + + int debugfs_file_get(struct dentry *dentry); +@@ -225,6 +227,10 @@ static inline void debugfs_remove(struct dentry *dentry) + static inline void debugfs_remove_recursive(struct dentry *dentry) + { } + ++static inline void debugfs_lookup_and_remove(const char *name, ++ struct dentry *parent) ++{ } ++ + const struct file_operations *debugfs_real_fops(const struct file *filp); + + static inline int debugfs_file_get(struct dentry *dentry) +diff --git a/include/linux/dmar.h b/include/linux/dmar.h +index cbd714a198a0a..f3a3d95df5325 100644 +--- a/include/linux/dmar.h ++++ b/include/linux/dmar.h +@@ -69,6 +69,7 @@ struct dmar_pci_notify_info { + + extern struct rw_semaphore dmar_global_lock; + extern struct list_head dmar_drhd_units; ++extern int intel_iommu_enabled; + + #define for_each_drhd_unit(drhd) \ + list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ +@@ -92,7 +93,8 @@ extern struct list_head dmar_drhd_units; + static inline bool dmar_rcu_check(void) + { + return rwsem_is_locked(&dmar_global_lock) || +- system_state == SYSTEM_BOOTING; ++ system_state == SYSTEM_BOOTING || ++ (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); + } + + #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) +diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h +index eafa1d2489fda..4e94755098f19 100644 +--- a/include/linux/lsm_hook_defs.h ++++ b/include/linux/lsm_hook_defs.h +@@ -406,4 +406,5 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) + #ifdef CONFIG_IO_URING + LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) + LSM_HOOK(int, 0, uring_sqpoll, void) ++LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) + #endif /* CONFIG_IO_URING */ +diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h +index 91c8146649f59..b681cfce6190a 100644 +--- a/include/linux/lsm_hooks.h ++++ b/include/linux/lsm_hooks.h +@@ -1575,6 +1575,9 @@ + * Check whether the current task is allowed to spawn a io_uring polling + * thread (IORING_SETUP_SQPOLL). 
+ * ++ * @uring_cmd: ++ * Check whether the file_operations uring_cmd is allowed to run. ++ * + */ + union security_list_options { + #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); +diff --git a/include/linux/security.h b/include/linux/security.h +index 7fc4e9f49f542..3cc127bb5bfd4 100644 +--- a/include/linux/security.h ++++ b/include/linux/security.h +@@ -2051,6 +2051,7 @@ static inline int security_perf_event_write(struct perf_event *event) + #ifdef CONFIG_SECURITY + extern int security_uring_override_creds(const struct cred *new); + extern int security_uring_sqpoll(void); ++extern int security_uring_cmd(struct io_uring_cmd *ioucmd); + #else + static inline int security_uring_override_creds(const struct cred *new) + { +@@ -2060,6 +2061,10 @@ static inline int security_uring_sqpoll(void) + { + return 0; + } ++static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) ++{ ++ return 0; ++} + #endif /* CONFIG_SECURITY */ + #endif /* CONFIG_IO_URING */ + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 2f41364a6791e..63d0a21b63162 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -2528,6 +2528,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb) + return skb_headlen(skb) + __skb_pagelen(skb); + } + ++static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, ++ int i, struct page *page, ++ int off, int size) ++{ ++ skb_frag_t *frag = &shinfo->frags[i]; ++ ++ /* ++ * Propagate page pfmemalloc to the skb if we can. The problem is ++ * that not all callers have unique ownership of the page but rely ++ * on page_is_pfmemalloc doing the right thing(tm). ++ */ ++ frag->bv_page = page; ++ frag->bv_offset = off; ++ skb_frag_size_set(frag, size); ++} ++ + /** + * __skb_fill_page_desc - initialise a paged fragment in an skb + * @skb: buffer containing fragment to be initialised +@@ -2544,17 +2560,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb) + static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, + struct page *page, int off, int size) + { +- skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; +- +- /* +- * Propagate page pfmemalloc to the skb if we can. The problem is +- * that not all callers have unique ownership of the page but rely +- * on page_is_pfmemalloc doing the right thing(tm). +- */ +- frag->bv_page = page; +- frag->bv_offset = off; +- skb_frag_size_set(frag, size); +- ++ __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); + page = compound_head(page); + if (page_is_pfmemalloc(page)) + skb->pfmemalloc = true; +@@ -2581,6 +2587,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, + skb_shinfo(skb)->nr_frags = i + 1; + } + ++/** ++ * skb_fill_page_desc_noacc - initialise a paged fragment in an skb ++ * @skb: buffer containing fragment to be initialised ++ * @i: paged fragment index to initialise ++ * @page: the page to use for this fragment ++ * @off: the offset to the data with @page ++ * @size: the length of the data ++ * ++ * Variant of skb_fill_page_desc() which does not deal with ++ * pfmemalloc, if page is not owned by us. 
++ */ ++static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, ++ struct page *page, int off, ++ int size) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ ++ __skb_fill_page_desc_noacc(shinfo, i, page, off, size); ++ shinfo->nr_frags = i + 1; ++} ++ + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, + int size, unsigned int truesize); + +diff --git a/include/linux/time64.h b/include/linux/time64.h +index 81b9686a20799..2fb8232cff1d5 100644 +--- a/include/linux/time64.h ++++ b/include/linux/time64.h +@@ -20,6 +20,9 @@ struct itimerspec64 { + struct timespec64 it_value; + }; + ++/* Parameters used to convert the timespec values: */ ++#define PSEC_PER_NSEC 1000L ++ + /* Located here for timespec[64]_valid_strict */ + #define TIME64_MAX ((s64)~((u64)1 << 63)) + #define TIME64_MIN (-TIME64_MAX - 1) +diff --git a/include/linux/udp.h b/include/linux/udp.h +index 254a2654400f8..e96da4157d04d 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -70,6 +70,7 @@ struct udp_sock { + * For encapsulation sockets. + */ + int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); ++ void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); + void (*encap_destroy)(struct sock *sk); + +diff --git a/include/net/bonding.h b/include/net/bonding.h +index cb904d356e31e..3b816ae8b1f3b 100644 +--- a/include/net/bonding.h ++++ b/include/net/bonding.h +@@ -161,8 +161,9 @@ struct slave { + struct net_device *dev; /* first - useful for panic debug */ + struct bonding *bond; /* our master */ + int delay; +- /* all three in jiffies */ ++ /* all 4 in jiffies */ + unsigned long last_link_up; ++ unsigned long last_tx; + unsigned long last_rx; + unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; + s8 link; /* one of BOND_LINK_XXXX */ +@@ -539,6 +540,16 @@ static inline unsigned long slave_last_rx(struct bonding *bond, + return slave->last_rx; + } + ++static inline void slave_update_last_tx(struct slave *slave) ++{ ++ WRITE_ONCE(slave->last_tx, jiffies); ++} ++ ++static inline unsigned long slave_last_tx(struct slave *slave) ++{ ++ return READ_ONCE(slave->last_tx); ++} ++ + #ifdef CONFIG_NET_POLL_CONTROLLER + static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, + struct sk_buff *skb) +diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h +index afc7ce713657b..72394f441dad8 100644 +--- a/include/net/udp_tunnel.h ++++ b/include/net/udp_tunnel.h +@@ -67,6 +67,9 @@ static inline int udp_sock_create(struct net *net, + typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); + typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, + struct sk_buff *skb); ++typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, ++ struct sk_buff *skb, ++ unsigned int udp_offset); + typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); + typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, + struct list_head *head, +@@ -80,6 +83,7 @@ struct udp_tunnel_sock_cfg { + __u8 encap_type; + udp_tunnel_encap_rcv_t encap_rcv; + udp_tunnel_encap_err_lookup_t encap_err_lookup; ++ udp_tunnel_encap_err_rcv_t encap_err_rcv; + udp_tunnel_encap_destroy_t encap_destroy; + udp_tunnel_gro_receive_t gro_receive; + udp_tunnel_gro_complete_t gro_complete; +diff --git a/include/soc/at91/sama7-ddr.h b/include/soc/at91/sama7-ddr.h +index 9e17247474fa9..6ce3bd22f6c69 100644 +--- a/include/soc/at91/sama7-ddr.h ++++ 
b/include/soc/at91/sama7-ddr.h +@@ -38,6 +38,14 @@ + #define DDR3PHY_DSGCR_ODTPDD_ODT0 (1 << 20) /* ODT[0] Power Down Driver */ + + #define DDR3PHY_ZQ0SR0 (0x188) /* ZQ status register 0 */ ++#define DDR3PHY_ZQ0SR0_PDO_OFF (0) /* Pull-down output impedance select offset */ ++#define DDR3PHY_ZQ0SR0_PUO_OFF (5) /* Pull-up output impedance select offset */ ++#define DDR3PHY_ZQ0SR0_PDODT_OFF (10) /* Pull-down on-die termination impedance select offset */ ++#define DDR3PHY_ZQ0SRO_PUODT_OFF (15) /* Pull-up on-die termination impedance select offset */ ++ ++#define DDR3PHY_DX0DLLCR (0x1CC) /* DDR3PHY DATX8 DLL Control Register */ ++#define DDR3PHY_DX1DLLCR (0x20C) /* DDR3PHY DATX8 DLL Control Register */ ++#define DDR3PHY_DXDLLCR_DLLDIS (1 << 31) /* DLL Disable */ + + /* UDDRC */ + #define UDDRC_STAT (0x04) /* UDDRC Operating Mode Status Register */ +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index cd155b7e1346d..48833d0edd089 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -4878,6 +4878,10 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + ++ ret = security_uring_cmd(ioucmd); ++ if (ret) ++ return ret; ++ + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) +@@ -8260,6 +8264,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) + + switch (io_arm_poll_handler(req, 0)) { + case IO_APOLL_READY: ++ io_kbuf_recycle(req, 0); + io_req_task_queue(req); + break; + case IO_APOLL_ABORTED: +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index ce95aee05e8ae..e702ca368539a 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -2346,6 +2346,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) + } + EXPORT_SYMBOL_GPL(task_cgroup_path); + ++/** ++ * cgroup_attach_lock - Lock for ->attach() ++ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem ++ * ++ * cgroup migration sometimes needs to stabilize threadgroups against forks and ++ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() ++ * implementations (e.g. cpuset), also need to disable CPU hotplug. ++ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can ++ * lead to deadlocks. ++ * ++ * Bringing up a CPU may involve creating and destroying tasks which requires ++ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside ++ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while ++ * write-locking threadgroup_rwsem, the locking order is reversed and we end up ++ * waiting for an on-going CPU hotplug operation which in turn is waiting for ++ * the threadgroup_rwsem to be released to create new tasks. For more details: ++ * ++ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu ++ * ++ * Resolve the situation by always acquiring cpus_read_lock() before optionally ++ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that ++ * CPU hotplug is disabled on entry. 
++ */ ++static void cgroup_attach_lock(bool lock_threadgroup) ++{ ++ cpus_read_lock(); ++ if (lock_threadgroup) ++ percpu_down_write(&cgroup_threadgroup_rwsem); ++} ++ ++/** ++ * cgroup_attach_unlock - Undo cgroup_attach_lock() ++ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem ++ */ ++static void cgroup_attach_unlock(bool lock_threadgroup) ++{ ++ if (lock_threadgroup) ++ percpu_up_write(&cgroup_threadgroup_rwsem); ++ cpus_read_unlock(); ++} ++ + /** + * cgroup_migrate_add_task - add a migration target task to a migration context + * @task: target task +@@ -2822,8 +2863,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + } + + struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, +- bool *locked) +- __acquires(&cgroup_threadgroup_rwsem) ++ bool *threadgroup_locked) + { + struct task_struct *tsk; + pid_t pid; +@@ -2840,12 +2880,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + * Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); +- if (pid || threadgroup) { +- percpu_down_write(&cgroup_threadgroup_rwsem); +- *locked = true; +- } else { +- *locked = false; +- } ++ *threadgroup_locked = pid || threadgroup; ++ cgroup_attach_lock(*threadgroup_locked); + + rcu_read_lock(); + if (pid) { +@@ -2876,17 +2912,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + goto out_unlock_rcu; + + out_unlock_threadgroup: +- if (*locked) { +- percpu_up_write(&cgroup_threadgroup_rwsem); +- *locked = false; +- } ++ cgroup_attach_unlock(*threadgroup_locked); ++ *threadgroup_locked = false; + out_unlock_rcu: + rcu_read_unlock(); + return tsk; + } + +-void cgroup_procs_write_finish(struct task_struct *task, bool locked) +- __releases(&cgroup_threadgroup_rwsem) ++void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) + { + struct cgroup_subsys *ss; + int ssid; +@@ -2894,8 +2927,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) + /* release reference from cgroup_procs_write_start() */ + put_task_struct(task); + +- if (locked) +- percpu_up_write(&cgroup_threadgroup_rwsem); ++ cgroup_attach_unlock(threadgroup_locked); ++ + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); +@@ -2950,12 +2983,11 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; + struct css_set *src_cset; ++ bool has_tasks; + int ret; + + lockdep_assert_held(&cgroup_mutex); + +- percpu_down_write(&cgroup_threadgroup_rwsem); +- + /* look up all csses currently attached to @cgrp's subtree */ + spin_lock_irq(&css_set_lock); + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { +@@ -2966,6 +2998,15 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) + } + spin_unlock_irq(&css_set_lock); + ++ /* ++ * We need to write-lock threadgroup_rwsem while migrating tasks. ++ * However, if there are no source csets for @cgrp, changing its ++ * controllers isn't gonna produce any task migrations and the ++ * write-locking can be skipped safely. 
++ */ ++ has_tasks = !list_empty(&mgctx.preloaded_src_csets); ++ cgroup_attach_lock(has_tasks); ++ + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(&mgctx); + if (ret) +@@ -2985,7 +3026,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) + ret = cgroup_migrate_execute(&mgctx); + out_finish: + cgroup_migrate_finish(&mgctx); +- percpu_up_write(&cgroup_threadgroup_rwsem); ++ cgroup_attach_unlock(has_tasks); + return ret; + } + +@@ -4933,13 +4974,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + struct task_struct *task; + const struct cred *saved_cred; + ssize_t ret; +- bool locked; ++ bool threadgroup_locked; + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + +- task = cgroup_procs_write_start(buf, threadgroup, &locked); ++ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; +@@ -4965,7 +5006,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + ret = cgroup_attach_task(dst_cgrp, task, threadgroup); + + out_finish: +- cgroup_procs_write_finish(task, locked); ++ cgroup_procs_write_finish(task, threadgroup_locked); + out_unlock: + cgroup_kn_unlock(of->kn); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 58aadfda9b8b3..1f3a55297f39d 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + +- cpus_read_lock(); ++ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ + percpu_down_write(&cpuset_rwsem); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +@@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) + wake_up(&cpuset_attach_wq); + + percpu_up_write(&cpuset_rwsem); +- cpus_read_unlock(); + } + + /* The various types of files and directories in a cpuset file system */ +diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c +index 5830dce6081b3..ce34d50f7a9bb 100644 +--- a/kernel/dma/swiotlb.c ++++ b/kernel/dma/swiotlb.c +@@ -464,7 +464,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size + } + } + +-#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) ++static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) ++{ ++ return start + (idx << IO_TLB_SHIFT); ++} + + /* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. 
+diff --git a/kernel/fork.c b/kernel/fork.c +index 9d44f2d46c696..d587c85f35b1e 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1225,6 +1225,7 @@ void mmput_async(struct mm_struct *mm) + schedule_work(&mm->async_put_work); + } + } ++EXPORT_SYMBOL_GPL(mmput_async); + #endif + + /** +diff --git a/kernel/kprobes.c b/kernel/kprobes.c +index 08350e35aba24..ca9d834d0b843 100644 +--- a/kernel/kprobes.c ++++ b/kernel/kprobes.c +@@ -1562,6 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p, + /* Ensure it is not in reserved area nor out of text */ + if (!(core_kernel_text((unsigned long) p->addr) || + is_module_text_address((unsigned long) p->addr)) || ++ in_gate_area_no_mm((unsigned long) p->addr) || + within_kprobe_blacklist((unsigned long) p->addr) || + jump_label_text_reserved(p->addr, p->addr) || + static_call_text_reserved(p->addr, p->addr) || +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index bb3d63bdf4ae8..667876da8382d 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void) + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%d", cpu); +- debugfs_remove(debugfs_lookup(buf, sd_dentry)); ++ debugfs_lookup_and_remove(buf, sd_dentry); + d_cpu = debugfs_create_dir(buf, sd_dentry); + + i = 0; +diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c +index cb866c3141af2..918730d749325 100644 +--- a/kernel/trace/trace_events_trigger.c ++++ b/kernel/trace/trace_events_trigger.c +@@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) + { + struct event_trigger_data *data; + +- list_for_each_entry_rcu(data, &file->triggers, list) { ++ list_for_each_entry_rcu(data, &file->triggers, list, ++ lockdep_is_held(&event_mutex)) { + if (data->flags & EVENT_TRIGGER_FL_PROBE) + continue; + return true; +diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c +index 95b58bd757ce4..1e130da1b742c 100644 +--- a/kernel/trace/trace_preemptirq.c ++++ b/kernel/trace/trace_preemptirq.c +@@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) + } + + lockdep_hardirqs_on_prepare(); +- lockdep_hardirqs_on(CALLER_ADDR0); ++ lockdep_hardirqs_on(caller_addr); + } + EXPORT_SYMBOL(trace_hardirqs_on_caller); + NOKPROBE_SYMBOL(trace_hardirqs_on_caller); + + __visible void trace_hardirqs_off_caller(unsigned long caller_addr) + { +- lockdep_hardirqs_off(CALLER_ADDR0); ++ lockdep_hardirqs_off(caller_addr); + + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); +diff --git a/mm/kmemleak.c b/mm/kmemleak.c +index a182f5ddaf68b..acd7cbb82e160 100644 +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -1132,7 +1132,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); + void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, + gfp_t gfp) + { +- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) ++ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_alloc(__va(phys), size, min_count, gfp); + } + EXPORT_SYMBOL(kmemleak_alloc_phys); +@@ -1146,7 +1146,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); + */ + void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) + { +- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) ++ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_free_part(__va(phys), size); + } + EXPORT_SYMBOL(kmemleak_free_part_phys); +@@ -1158,7 +1158,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); + */ + void __ref 
kmemleak_not_leak_phys(phys_addr_t phys) + { +- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) ++ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_not_leak(__va(phys)); + } + EXPORT_SYMBOL(kmemleak_not_leak_phys); +@@ -1170,7 +1170,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); + */ + void __ref kmemleak_ignore_phys(phys_addr_t phys) + { +- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) ++ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_ignore(__va(phys)); + } + EXPORT_SYMBOL(kmemleak_ignore_phys); +diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c +index ff47790366497..f20f4373ff408 100644 +--- a/net/bridge/br_netfilter_hooks.c ++++ b/net/bridge/br_netfilter_hooks.c +@@ -384,6 +384,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ + /* - Bridged-and-DNAT'ed traffic doesn't + * require ip_forwarding. */ + if (rt->dst.dev == dev) { ++ skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + goto bridged_dnat; + } +@@ -413,6 +414,7 @@ bridged_dnat: + kfree_skb(skb); + return 0; + } ++ skb_dst_drop(skb); + skb_dst_set_noref(skb, &rt->dst); + } + +diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c +index e4e0c836c3f51..6b07f30675bb0 100644 +--- a/net/bridge/br_netfilter_ipv6.c ++++ b/net/bridge/br_netfilter_ipv6.c +@@ -197,6 +197,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc + kfree_skb(skb); + return 0; + } ++ skb_dst_drop(skb); + skb_dst_set_noref(skb, &rt->dst); + } + +diff --git a/net/core/datagram.c b/net/core/datagram.c +index 50f4faeea76cc..48e82438acb02 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -675,7 +675,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + page_ref_sub(last_head, refs); + refs = 0; + } +- skb_fill_page_desc(skb, frag++, head, start, size); ++ skb_fill_page_desc_noacc(skb, frag++, head, start, size); + } + if (refs) + page_ref_sub(last_head, refs); +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index bebf58464d667..4b2b07a9422cf 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -4179,9 +4179,8 @@ normal: + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } else { +- skb_copy_bits(head_skb, offset, +- skb_put(nskb, len), +- len); ++ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) ++ goto err; + } + continue; + } +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 3d446773ff2a5..ab03977b65781 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -1015,7 +1015,7 @@ new_segment: + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(page); +- skb_fill_page_desc(skb, i, page, offset, copy); ++ skb_fill_page_desc_noacc(skb, i, page, offset, copy); + } + + if (!(flags & MSG_NO_SHARED_FRAGS)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index e5435156e545d..c30696eafc361 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -2514,6 +2514,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp) + return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); + } + ++static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { ++ /* Hold old state until something *above* high_seq ++ * is ACKed. For Reno it is MUST to prevent false ++ * fast retransmits (RFC2582). SACK TCP is safe. 
*/ ++ if (!tcp_any_retrans_done(sk)) ++ tp->retrans_stamp = 0; ++ return true; ++ } ++ return false; ++} ++ + /* People celebrate: "We love our President!" */ + static bool tcp_try_undo_recovery(struct sock *sk) + { +@@ -2536,14 +2551,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) + } else if (tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_persist--; + } +- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { +- /* Hold old state until something *above* high_seq +- * is ACKed. For Reno it is MUST to prevent false +- * fast retransmits (RFC2582). SACK TCP is safe. */ +- if (!tcp_any_retrans_done(sk)) +- tp->retrans_stamp = 0; ++ if (tcp_is_non_sack_preventing_reopen(sk)) + return true; +- } + tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + return false; +@@ -2579,6 +2588,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUSRTOS); + inet_csk(sk)->icsk_retransmits = 0; ++ if (tcp_is_non_sack_preventing_reopen(sk)) ++ return true; + if (frto_undo || tcp_is_sack(tp)) { + tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index aa9f2ec3dc468..01e1d36bdf135 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -781,6 +781,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) + */ + if (tunnel) { + /* ...not for tunnels though: we don't have a sending socket */ ++ if (udp_sk(sk)->encap_err_rcv) ++ udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); + goto out; + } + if (!inet->recverr) { +diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c +index 8efaf8c3fe2a9..8242c8947340e 100644 +--- a/net/ipv4/udp_tunnel_core.c ++++ b/net/ipv4/udp_tunnel_core.c +@@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + + udp_sk(sk)->encap_type = cfg->encap_type; + udp_sk(sk)->encap_rcv = cfg->encap_rcv; ++ udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; + udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; + udp_sk(sk)->encap_destroy = cfg->encap_destroy; + udp_sk(sk)->gro_receive = cfg->gro_receive; +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c +index b738eb7e1cae8..04cf06866e765 100644 +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -3557,11 +3557,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, + fallthrough; + case NETDEV_UP: + case NETDEV_CHANGE: +- if (dev->flags & IFF_SLAVE) ++ if (idev && idev->cnf.disable_ipv6) + break; + +- if (idev && idev->cnf.disable_ipv6) ++ if (dev->flags & IFF_SLAVE) { ++ if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && ++ dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) ++ ipv6_mc_up(idev); + break; ++ } + + if (event == NETDEV_UP) { + /* restore routes for permanent addresses */ +diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c +index 73aaabf0e9665..0b0e34ddc64e0 100644 +--- a/net/ipv6/seg6.c ++++ b/net/ipv6/seg6.c +@@ -191,6 +191,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) + goto out_unlock; + } + ++ if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { ++ err = -EINVAL; ++ goto out_unlock; ++ } ++ + if (hinfo) { + err = seg6_hmac_info_del(net, hmackeyid); + if (err) +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index e2f2e087a7531..40074bc7274ea 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -616,8 +616,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + } + + /* Tunnels don't have an application socket: don't pass errors back */ +- 
if (tunnel) ++ if (tunnel) { ++ if (udp_sk(sk)->encap_err_rcv) ++ udp_sk(sk)->encap_err_rcv(sk, skb, offset); + goto out; ++ } + + if (!np->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) +diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c +index 1796c456ac98b..992decbcaa5c1 100644 +--- a/net/netfilter/nf_conntrack_irc.c ++++ b/net/netfilter/nf_conntrack_irc.c +@@ -194,8 +194,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, + + /* dcc_ip can be the internal OR external (NAT'ed) IP */ + tuple = &ct->tuplehash[dir].tuple; +- if (tuple->src.u3.ip != dcc_ip && +- tuple->dst.u3.ip != dcc_ip) { ++ if ((tuple->src.u3.ip != dcc_ip && ++ ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) || ++ dcc_port == 0) { + net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", + &tuple->src.u3.ip, + &dcc_ip, dcc_port); +diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c +index a63b51dceaf2c..a634c72b1ffcf 100644 +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -655,6 +655,37 @@ static bool tcp_in_window(struct nf_conn *ct, + tn->tcp_be_liberal) + res = true; + if (!res) { ++ bool seq_ok = before(seq, sender->td_maxend + 1); ++ ++ if (!seq_ok) { ++ u32 overshot = end - sender->td_maxend + 1; ++ bool ack_ok; ++ ++ ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); ++ ++ if (in_recv_win && ++ ack_ok && ++ overshot <= receiver->td_maxwin && ++ before(sack, receiver->td_end + 1)) { ++ /* Work around TCPs that send more bytes than allowed by ++ * the receive window. ++ * ++ * If the (marked as invalid) packet is allowed to pass by ++ * the ruleset and the peer acks this data, then its possible ++ * all future packets will trigger 'ACK is over upper bound' check. ++ * ++ * Thus if only the sequence check fails then do update td_end so ++ * possible ACK for this data can update internal state. ++ */ ++ sender->td_end = end; ++ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; ++ ++ nf_ct_l4proto_log_invalid(skb, ct, hook_state, ++ "%u bytes more than expected", overshot); ++ return res; ++ } ++ } ++ + nf_ct_l4proto_log_invalid(skb, ct, hook_state, + "%s", + before(seq, sender->td_maxend + 1) ? 
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index bc690238a3c56..848cc81d69926 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -2166,8 +2166,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, + chain->flags |= NFT_CHAIN_BASE | flags; + basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && +- !nft_chain_offload_support(basechain)) ++ !nft_chain_offload_support(basechain)) { ++ list_splice_init(&basechain->hook_list, &hook->list); + return -EOPNOTSUPP; ++ } + + flow_block_init(&basechain->flow_block); + +diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h +index 571436064cd6f..62c70709d7980 100644 +--- a/net/rxrpc/ar-internal.h ++++ b/net/rxrpc/ar-internal.h +@@ -982,6 +982,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); + /* + * peer_event.c + */ ++void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void rxrpc_error_report(struct sock *); + void rxrpc_peer_keepalive_worker(struct work_struct *); + +diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c +index 96ecb7356c0fe..79bb02eb67b2b 100644 +--- a/net/rxrpc/local_object.c ++++ b/net/rxrpc/local_object.c +@@ -137,6 +137,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) + + tuncfg.encap_type = UDP_ENCAP_RXRPC; + tuncfg.encap_rcv = rxrpc_input_packet; ++ tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; + tuncfg.sk_user_data = local; + setup_udp_tunnel_sock(net, local->socket, &tuncfg); + +diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c +index be032850ae8ca..32561e9567fe3 100644 +--- a/net/rxrpc/peer_event.c ++++ b/net/rxrpc/peer_event.c +@@ -16,22 +16,105 @@ + #include + #include + #include ++#include + #include "ar-internal.h" + ++static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); + static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); + static void rxrpc_distribute_error(struct rxrpc_peer *, int, + enum rxrpc_call_completion); + + /* +- * Find the peer associated with an ICMP packet. ++ * Find the peer associated with an ICMPv4 packet. + */ + static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, +- const struct sk_buff *skb, ++ struct sk_buff *skb, ++ unsigned int udp_offset, ++ unsigned int *info, + struct sockaddr_rxrpc *srx) + { +- struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); ++ struct iphdr *ip, *ip0 = ip_hdr(skb); ++ struct icmphdr *icmp = icmp_hdr(skb); ++ struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); + +- _enter(""); ++ _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); ++ ++ switch (icmp->type) { ++ case ICMP_DEST_UNREACH: ++ *info = ntohs(icmp->un.frag.mtu); ++ fallthrough; ++ case ICMP_TIME_EXCEEDED: ++ case ICMP_PARAMETERPROB: ++ ip = (struct iphdr *)((void *)icmp + 8); ++ break; ++ default: ++ return NULL; ++ } ++ ++ memset(srx, 0, sizeof(*srx)); ++ srx->transport_type = local->srx.transport_type; ++ srx->transport_len = local->srx.transport_len; ++ srx->transport.family = local->srx.transport.family; ++ ++ /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice ++ * versa? 
++ */ ++ switch (srx->transport.family) { ++ case AF_INET: ++ srx->transport_len = sizeof(srx->transport.sin); ++ srx->transport.family = AF_INET; ++ srx->transport.sin.sin_port = udp->dest; ++ memcpy(&srx->transport.sin.sin_addr, &ip->daddr, ++ sizeof(struct in_addr)); ++ break; ++ ++#ifdef CONFIG_AF_RXRPC_IPV6 ++ case AF_INET6: ++ srx->transport_len = sizeof(srx->transport.sin); ++ srx->transport.family = AF_INET; ++ srx->transport.sin.sin_port = udp->dest; ++ memcpy(&srx->transport.sin.sin_addr, &ip->daddr, ++ sizeof(struct in_addr)); ++ break; ++#endif ++ ++ default: ++ WARN_ON_ONCE(1); ++ return NULL; ++ } ++ ++ _net("ICMP {%pISp}", &srx->transport); ++ return rxrpc_lookup_peer_rcu(local, srx); ++} ++ ++#ifdef CONFIG_AF_RXRPC_IPV6 ++/* ++ * Find the peer associated with an ICMPv6 packet. ++ */ ++static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, ++ struct sk_buff *skb, ++ unsigned int udp_offset, ++ unsigned int *info, ++ struct sockaddr_rxrpc *srx) ++{ ++ struct icmp6hdr *icmp = icmp6_hdr(skb); ++ struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); ++ struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); ++ ++ _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); ++ ++ switch (icmp->icmp6_type) { ++ case ICMPV6_DEST_UNREACH: ++ *info = ntohl(icmp->icmp6_mtu); ++ fallthrough; ++ case ICMPV6_PKT_TOOBIG: ++ case ICMPV6_TIME_EXCEED: ++ case ICMPV6_PARAMPROB: ++ ip = (struct ipv6hdr *)((void *)icmp + 8); ++ break; ++ default: ++ return NULL; ++ } + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; +@@ -41,6 +124,165 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ ++ switch (srx->transport.family) { ++ case AF_INET: ++ _net("Rx ICMP6 on v4 sock"); ++ srx->transport_len = sizeof(srx->transport.sin); ++ srx->transport.family = AF_INET; ++ srx->transport.sin.sin_port = udp->dest; ++ memcpy(&srx->transport.sin.sin_addr, ++ &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); ++ break; ++ case AF_INET6: ++ _net("Rx ICMP6"); ++ srx->transport.sin.sin_port = udp->dest; ++ memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, ++ sizeof(struct in6_addr)); ++ break; ++ default: ++ WARN_ON_ONCE(1); ++ return NULL; ++ } ++ ++ _net("ICMP {%pISp}", &srx->transport); ++ return rxrpc_lookup_peer_rcu(local, srx); ++} ++#endif /* CONFIG_AF_RXRPC_IPV6 */ ++ ++/* ++ * Handle an error received on the local endpoint as a tunnel. 
++ */ ++void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, ++ unsigned int udp_offset) ++{ ++ struct sock_extended_err ee; ++ struct sockaddr_rxrpc srx; ++ struct rxrpc_local *local; ++ struct rxrpc_peer *peer; ++ unsigned int info = 0; ++ int err; ++ u8 version = ip_hdr(skb)->version; ++ u8 type = icmp_hdr(skb)->type; ++ u8 code = icmp_hdr(skb)->code; ++ ++ rcu_read_lock(); ++ local = rcu_dereference_sk_user_data(sk); ++ if (unlikely(!local)) { ++ rcu_read_unlock(); ++ return; ++ } ++ ++ rxrpc_new_skb(skb, rxrpc_skb_received); ++ ++ switch (ip_hdr(skb)->version) { ++ case IPVERSION: ++ peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, ++ &info, &srx); ++ break; ++#ifdef CONFIG_AF_RXRPC_IPV6 ++ case 6: ++ peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, ++ &info, &srx); ++ break; ++#endif ++ default: ++ rcu_read_unlock(); ++ return; ++ } ++ ++ if (peer && !rxrpc_get_peer_maybe(peer)) ++ peer = NULL; ++ if (!peer) { ++ rcu_read_unlock(); ++ return; ++ } ++ ++ memset(&ee, 0, sizeof(ee)); ++ ++ switch (version) { ++ case IPVERSION: ++ switch (type) { ++ case ICMP_DEST_UNREACH: ++ switch (code) { ++ case ICMP_FRAG_NEEDED: ++ rxrpc_adjust_mtu(peer, info); ++ rcu_read_unlock(); ++ rxrpc_put_peer(peer); ++ return; ++ default: ++ break; ++ } ++ ++ err = EHOSTUNREACH; ++ if (code <= NR_ICMP_UNREACH) { ++ /* Might want to do something different with ++ * non-fatal errors ++ */ ++ //harderr = icmp_err_convert[code].fatal; ++ err = icmp_err_convert[code].errno; ++ } ++ break; ++ ++ case ICMP_TIME_EXCEEDED: ++ err = EHOSTUNREACH; ++ break; ++ default: ++ err = EPROTO; ++ break; ++ } ++ ++ ee.ee_origin = SO_EE_ORIGIN_ICMP; ++ ee.ee_type = type; ++ ee.ee_code = code; ++ ee.ee_errno = err; ++ break; ++ ++#ifdef CONFIG_AF_RXRPC_IPV6 ++ case 6: ++ switch (type) { ++ case ICMPV6_PKT_TOOBIG: ++ rxrpc_adjust_mtu(peer, info); ++ rcu_read_unlock(); ++ rxrpc_put_peer(peer); ++ return; ++ } ++ ++ icmpv6_err_convert(type, code, &err); ++ ++ if (err == EACCES) ++ err = EHOSTUNREACH; ++ ++ ee.ee_origin = SO_EE_ORIGIN_ICMP6; ++ ee.ee_type = type; ++ ee.ee_code = code; ++ ee.ee_errno = err; ++ break; ++#endif ++ } ++ ++ trace_rxrpc_rx_icmp(peer, &ee, &srx); ++ ++ rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); ++ rcu_read_unlock(); ++ rxrpc_put_peer(peer); ++} ++ ++/* ++ * Find the peer associated with a local error. ++ */ ++static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, ++ const struct sk_buff *skb, ++ struct sockaddr_rxrpc *srx) ++{ ++ struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); ++ ++ _enter(""); ++ ++ memset(srx, 0, sizeof(*srx)); ++ srx->transport_type = local->srx.transport_type; ++ srx->transport_len = local->srx.transport_len; ++ srx->transport.family = local->srx.transport.family; ++ + switch (srx->transport.family) { + case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); +@@ -104,10 +346,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, + /* + * Handle an MTU/fragmentation problem. 
+ */ +-static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) ++static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) + { +- u32 mtu = serr->ee.ee_info; +- + _net("Rx ICMP Fragmentation Needed (%d)", mtu); + + /* wind down the local interface MTU */ +@@ -148,7 +388,7 @@ void rxrpc_error_report(struct sock *sk) + struct sock_exterr_skb *serr; + struct sockaddr_rxrpc srx; + struct rxrpc_local *local; +- struct rxrpc_peer *peer; ++ struct rxrpc_peer *peer = NULL; + struct sk_buff *skb; + + rcu_read_lock(); +@@ -172,41 +412,20 @@ void rxrpc_error_report(struct sock *sk) + } + rxrpc_new_skb(skb, rxrpc_skb_received); + serr = SKB_EXT_ERR(skb); +- if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { +- _leave("UDP empty message"); +- rcu_read_unlock(); +- rxrpc_free_skb(skb, rxrpc_skb_freed); +- return; +- } + +- peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); +- if (peer && !rxrpc_get_peer_maybe(peer)) +- peer = NULL; +- if (!peer) { +- rcu_read_unlock(); +- rxrpc_free_skb(skb, rxrpc_skb_freed); +- _leave(" [no peer]"); +- return; +- } +- +- trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); +- +- if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && +- serr->ee.ee_type == ICMP_DEST_UNREACH && +- serr->ee.ee_code == ICMP_FRAG_NEEDED)) { +- rxrpc_adjust_mtu(peer, serr); +- rcu_read_unlock(); +- rxrpc_free_skb(skb, rxrpc_skb_freed); +- rxrpc_put_peer(peer); +- _leave(" [MTU update]"); +- return; ++ if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { ++ peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); ++ if (peer && !rxrpc_get_peer_maybe(peer)) ++ peer = NULL; ++ if (peer) { ++ trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); ++ rxrpc_store_error(peer, serr); ++ } + } + +- rxrpc_store_error(peer, serr); + rcu_read_unlock(); + rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_put_peer(peer); +- + _leave(""); + } + +diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c +index 08aab5c01437d..db47844f4ac99 100644 +--- a/net/rxrpc/rxkad.c ++++ b/net/rxrpc/rxkad.c +@@ -540,7 +540,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + * directly into the target buffer. 
+ */ + sg = _sg; +- nsg = skb_shinfo(skb)->nr_frags; ++ nsg = skb_shinfo(skb)->nr_frags + 1; + if (nsg <= 4) { + nsg = 4; + } else { +diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c +index 3d061a13d7ed2..2829455211f8c 100644 +--- a/net/sched/sch_sfb.c ++++ b/net/sched/sch_sfb.c +@@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) + } + } + +-static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) ++static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) + { + u32 sfbhash; + +- sfbhash = sfb_hash(skb, 0); ++ sfbhash = cb->hashes[0]; + if (sfbhash) + increment_one_qlen(sfbhash, 0, q); + +- sfbhash = sfb_hash(skb, 1); ++ sfbhash = cb->hashes[1]; + if (sfbhash) + increment_one_qlen(sfbhash, 1, q); + } +@@ -281,8 +281,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + { + + struct sfb_sched_data *q = qdisc_priv(sch); ++ unsigned int len = qdisc_pkt_len(skb); + struct Qdisc *child = q->qdisc; + struct tcf_proto *fl; ++ struct sfb_skb_cb cb; + int i; + u32 p_min = ~0; + u32 minqlen = ~0; +@@ -399,11 +401,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + } + + enqueue: ++ memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); + ret = qdisc_enqueue(skb, child, to_free); + if (likely(ret == NET_XMIT_SUCCESS)) { +- qdisc_qstats_backlog_inc(sch, skb); ++ sch->qstats.backlog += len; + sch->q.qlen++; +- increment_qlen(skb, q); ++ increment_qlen(&cb, q); + } else if (net_xmit_drop_count(ret)) { + q->stats.childdrop++; + qdisc_qstats_drop(sch); +diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c +index b9c71a304d399..0b941dd63d268 100644 +--- a/net/sched/sch_taprio.c ++++ b/net/sched/sch_taprio.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,7 +177,7 @@ static ktime_t get_interval_end_time(struct sched_gate_list *sched, + + static int length_to_duration(struct taprio_sched *q, int len) + { +- return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); ++ return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); + } + + /* Returns the entry corresponding to next available interval. 
If +@@ -551,7 +552,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) + static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) + { + atomic_set(&entry->budget, +- div64_u64((u64)entry->interval * 1000, ++ div64_u64((u64)entry->interval * PSEC_PER_NSEC, + atomic64_read(&q->picos_per_byte))); + } + +diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c +index f40f6ed0fbdb4..1f3bb1f6b1f7b 100644 +--- a/net/smc/smc_core.c ++++ b/net/smc/smc_core.c +@@ -755,6 +755,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + lnk->lgr = lgr; + smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ + lnk->link_idx = link_idx; ++ lnk->wr_rx_id_compl = 0; + smc_ibdev_cnt_inc(lnk); + smcr_copy_dev_info_to_link(lnk); + atomic_set(&lnk->conn_cnt, 0); +diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h +index 4cb03e9423648..7b43a78c7f73a 100644 +--- a/net/smc/smc_core.h ++++ b/net/smc/smc_core.h +@@ -115,8 +115,10 @@ struct smc_link { + dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ + u64 wr_rx_id; /* seq # of last recv WR */ ++ u64 wr_rx_id_compl; /* seq # of last completed WR */ + u32 wr_rx_cnt; /* number of WR recv buffers */ + unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ ++ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ + + struct ib_reg_wr wr_reg; /* WR register memory region */ + wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ +diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c +index 26f8f240d9e84..b0678a417e09d 100644 +--- a/net/smc/smc_wr.c ++++ b/net/smc/smc_wr.c +@@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) + + for (i = 0; i < num; i++) { + link = wc[i].qp->qp_context; ++ link->wr_rx_id_compl = wc[i].wr_id; + if (wc[i].status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(&wc[i]); +@@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); ++ if (link->wr_rx_id_compl == link->wr_rx_id) ++ wake_up(&link->wr_rx_empty_wait); + break; + default: + smc_wr_rx_post(link); /* refill WR RX */ +@@ -639,6 +642,7 @@ void smc_wr_free_link(struct smc_link *lnk) + return; + ibdev = lnk->smcibdev->ibdev; + ++ smc_wr_drain_cq(lnk); + smc_wr_wakeup_reg_wait(lnk); + smc_wr_wakeup_tx_wait(lnk); + +@@ -889,6 +893,7 @@ int smc_wr_create_link(struct smc_link *lnk) + atomic_set(&lnk->wr_tx_refcnt, 0); + init_waitqueue_head(&lnk->wr_reg_wait); + atomic_set(&lnk->wr_reg_refcnt, 0); ++ init_waitqueue_head(&lnk->wr_rx_empty_wait); + return rc; + + dma_unmap: +diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h +index a54e90a1110fd..45e9b894d3f8a 100644 +--- a/net/smc/smc_wr.h ++++ b/net/smc/smc_wr.h +@@ -73,6 +73,11 @@ static inline void smc_wr_tx_link_put(struct smc_link *link) + wake_up_all(&link->wr_tx_wait); + } + ++static inline void smc_wr_drain_cq(struct smc_link *lnk) ++{ ++ wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id); ++} ++ + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) + { + wake_up_all(&lnk->wr_tx_wait); +diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c +index 2f4d23238a7e3..9618e4429f0fe 100644 +--- a/net/tipc/monitor.c ++++ b/net/tipc/monitor.c +@@ -160,7 +160,7 @@ static void map_set(u64 *up_map, int i, unsigned int v) + + static int map_get(u64 up_map, int i) + { +- return (up_map & (1 << i)) >> i; ++ 
return (up_map & (1ULL << i)) >> i; + } + + static struct tipc_peer *peer_prev(struct tipc_peer *peer) +diff --git a/security/security.c b/security/security.c +index 188b8f7822206..8b62654ff3f97 100644 +--- a/security/security.c ++++ b/security/security.c +@@ -2654,4 +2654,8 @@ int security_uring_sqpoll(void) + { + return call_int_hook(uring_sqpoll, 0); + } ++int security_uring_cmd(struct io_uring_cmd *ioucmd) ++{ ++ return call_int_hook(uring_cmd, 0, ioucmd); ++} + #endif /* CONFIG_IO_URING */ +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 1bbd53321d133..e90dfa36f79aa 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -91,6 +91,7 @@ + #include + #include + #include ++#include + + #include "avc.h" + #include "objsec.h" +@@ -6990,6 +6991,28 @@ static int selinux_uring_sqpoll(void) + return avc_has_perm(&selinux_state, sid, sid, + SECCLASS_IO_URING, IO_URING__SQPOLL, NULL); + } ++ ++/** ++ * selinux_uring_cmd - check if IORING_OP_URING_CMD is allowed ++ * @ioucmd: the io_uring command structure ++ * ++ * Check to see if the current domain is allowed to execute an ++ * IORING_OP_URING_CMD against the device/file specified in @ioucmd. ++ * ++ */ ++static int selinux_uring_cmd(struct io_uring_cmd *ioucmd) ++{ ++ struct file *file = ioucmd->file; ++ struct inode *inode = file_inode(file); ++ struct inode_security_struct *isec = selinux_inode(inode); ++ struct common_audit_data ad; ++ ++ ad.type = LSM_AUDIT_DATA_FILE; ++ ad.u.file = file; ++ ++ return avc_has_perm(&selinux_state, current_sid(), isec->sid, ++ SECCLASS_IO_URING, IO_URING__CMD, &ad); ++} + #endif /* CONFIG_IO_URING */ + + /* +@@ -7234,6 +7257,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { + #ifdef CONFIG_IO_URING + LSM_HOOK_INIT(uring_override_creds, selinux_uring_override_creds), + LSM_HOOK_INIT(uring_sqpoll, selinux_uring_sqpoll), ++ LSM_HOOK_INIT(uring_cmd, selinux_uring_cmd), + #endif + + /* +diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h +index ff757ae5f2537..1c2f41ff4e551 100644 +--- a/security/selinux/include/classmap.h ++++ b/security/selinux/include/classmap.h +@@ -253,7 +253,7 @@ const struct security_class_mapping secclass_map[] = { + { "anon_inode", + { COMMON_FILE_PERMS, NULL } }, + { "io_uring", +- { "override_creds", "sqpoll", NULL } }, ++ { "override_creds", "sqpoll", "cmd", NULL } }, + { NULL } + }; + +diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c +index 6207762dbdb13..b30e20f64471c 100644 +--- a/security/smack/smack_lsm.c ++++ b/security/smack/smack_lsm.c +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + #include "smack.h" + + #define TRANS_TRUE "TRUE" +@@ -4739,6 +4740,36 @@ static int smack_uring_sqpoll(void) + return -EPERM; + } + ++/** ++ * smack_uring_cmd - check on file operations for io_uring ++ * @ioucmd: the command in question ++ * ++ * Make a best guess about whether a io_uring "command" should ++ * be allowed. Use the same logic used for determining if the ++ * file could be opened for read in the absence of better criteria. 
++ */ ++static int smack_uring_cmd(struct io_uring_cmd *ioucmd) ++{ ++ struct file *file = ioucmd->file; ++ struct smk_audit_info ad; ++ struct task_smack *tsp; ++ struct inode *inode; ++ int rc; ++ ++ if (!file) ++ return -EINVAL; ++ ++ tsp = smack_cred(file->f_cred); ++ inode = file_inode(file); ++ ++ smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH); ++ smk_ad_setfield_u_fs_path(&ad, file->f_path); ++ rc = smk_tskacc(tsp, smk_of_inode(inode), MAY_READ, &ad); ++ rc = smk_bu_credfile(file->f_cred, file, MAY_READ, rc); ++ ++ return rc; ++} ++ + #endif /* CONFIG_IO_URING */ + + struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = { +@@ -4896,6 +4927,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { + #ifdef CONFIG_IO_URING + LSM_HOOK_INIT(uring_override_creds, smack_uring_override_creds), + LSM_HOOK_INIT(uring_sqpoll, smack_uring_sqpoll), ++ LSM_HOOK_INIT(uring_cmd, smack_uring_cmd), + #endif + }; + +diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c +index 55b3c49ba61de..244afc38ddcaa 100644 +--- a/sound/core/memalloc.c ++++ b/sound/core/memalloc.c +@@ -535,10 +535,13 @@ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) + dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, + sg_dma_address(sgt->sgl)); + p = dma_vmap_noncontiguous(dmab->dev.dev, size, sgt); +- if (p) ++ if (p) { + dmab->private_data = sgt; +- else ++ /* store the first page address for convenience */ ++ dmab->addr = snd_sgbuf_get_addr(dmab, 0); ++ } else { + dma_free_noncontiguous(dmab->dev.dev, size, sgt, dmab->dev.dir); ++ } + return p; + } + +@@ -772,6 +775,8 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) + if (!p) + goto error; + dmab->private_data = sgbuf; ++ /* store the first page address for convenience */ ++ dmab->addr = snd_sgbuf_get_addr(dmab, 0); + return p; + + error: +diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c +index 90c3a367d7de9..02df915eb3c66 100644 +--- a/sound/core/oss/pcm_oss.c ++++ b/sound/core/oss/pcm_oss.c +@@ -1672,14 +1672,14 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file) + runtime = substream->runtime; + if (atomic_read(&substream->mmap_count)) + goto __direct; +- err = snd_pcm_oss_make_ready(substream); +- if (err < 0) +- return err; + atomic_inc(&runtime->oss.rw_ref); + if (mutex_lock_interruptible(&runtime->oss.params_lock)) { + atomic_dec(&runtime->oss.rw_ref); + return -ERESTARTSYS; + } ++ err = snd_pcm_oss_make_ready_locked(substream); ++ if (err < 0) ++ goto unlock; + format = snd_pcm_oss_format_from(runtime->oss.format); + width = snd_pcm_format_physical_width(format); + if (runtime->oss.buffer_used > 0) { +diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c +index 9b4a7cdb103ad..12f12a294df5a 100644 +--- a/sound/drivers/aloop.c ++++ b/sound/drivers/aloop.c +@@ -605,17 +605,18 @@ static unsigned int loopback_jiffies_timer_pos_update + cable->streams[SNDRV_PCM_STREAM_PLAYBACK]; + struct loopback_pcm *dpcm_capt = + cable->streams[SNDRV_PCM_STREAM_CAPTURE]; +- unsigned long delta_play = 0, delta_capt = 0; ++ unsigned long delta_play = 0, delta_capt = 0, cur_jiffies; + unsigned int running, count1, count2; + ++ cur_jiffies = jiffies; + running = cable->running ^ cable->pause; + if (running & (1 << SNDRV_PCM_STREAM_PLAYBACK)) { +- delta_play = jiffies - dpcm_play->last_jiffies; ++ delta_play = cur_jiffies - dpcm_play->last_jiffies; + dpcm_play->last_jiffies += delta_play; + } + + if (running & (1 << SNDRV_PCM_STREAM_CAPTURE)) { +- delta_capt 
= jiffies - dpcm_capt->last_jiffies; ++ delta_capt = cur_jiffies - dpcm_capt->last_jiffies; + dpcm_capt->last_jiffies += delta_capt; + } + +diff --git a/sound/pci/emu10k1/emupcm.c b/sound/pci/emu10k1/emupcm.c +index b2701a4452d86..48af77ae8020f 100644 +--- a/sound/pci/emu10k1/emupcm.c ++++ b/sound/pci/emu10k1/emupcm.c +@@ -124,7 +124,7 @@ static int snd_emu10k1_pcm_channel_alloc(struct snd_emu10k1_pcm * epcm, int voic + epcm->voices[0]->epcm = epcm; + if (voices > 1) { + for (i = 1; i < voices; i++) { +- epcm->voices[i] = &epcm->emu->voices[epcm->voices[0]->number + i]; ++ epcm->voices[i] = &epcm->emu->voices[(epcm->voices[0]->number + i) % NUM_G]; + epcm->voices[i]->epcm = epcm; + } + } +diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c +index a77165bd92a98..b20694fd69dea 100644 +--- a/sound/pci/hda/hda_intel.c ++++ b/sound/pci/hda/hda_intel.c +@@ -1817,7 +1817,7 @@ static int azx_create(struct snd_card *card, struct pci_dev *pci, + + /* use the non-cached pages in non-snoop mode */ + if (!azx_snoop(chip)) +- azx_bus(chip)->dma_type = SNDRV_DMA_TYPE_DEV_WC; ++ azx_bus(chip)->dma_type = SNDRV_DMA_TYPE_DEV_WC_SG; + + if (chip->driver_type == AZX_DRIVER_NVIDIA) { + dev_dbg(chip->card->dev, "Enable delay in RIRB handling\n"); +diff --git a/sound/soc/atmel/mchp-spdiftx.c b/sound/soc/atmel/mchp-spdiftx.c +index d243800464352..bcca1cf3cd7b6 100644 +--- a/sound/soc/atmel/mchp-spdiftx.c ++++ b/sound/soc/atmel/mchp-spdiftx.c +@@ -196,8 +196,7 @@ struct mchp_spdiftx_dev { + struct clk *pclk; + struct clk *gclk; + unsigned int fmt; +- const struct mchp_i2s_caps *caps; +- int gclk_enabled:1; ++ unsigned int gclk_enabled:1; + }; + + static inline int mchp_spdiftx_is_running(struct mchp_spdiftx_dev *dev) +@@ -766,8 +765,6 @@ static const struct of_device_id mchp_spdiftx_dt_ids[] = { + MODULE_DEVICE_TABLE(of, mchp_spdiftx_dt_ids); + static int mchp_spdiftx_probe(struct platform_device *pdev) + { +- struct device_node *np = pdev->dev.of_node; +- const struct of_device_id *match; + struct mchp_spdiftx_dev *dev; + struct resource *mem; + struct regmap *regmap; +@@ -781,11 +778,6 @@ static int mchp_spdiftx_probe(struct platform_device *pdev) + if (!dev) + return -ENOMEM; + +- /* Get hardware capabilities. */ +- match = of_match_node(mchp_spdiftx_dt_ids, np); +- if (match) +- dev->caps = match->data; +- + /* Map I/O registers. 
*/ + base = devm_platform_get_and_ioremap_resource(pdev, 0, &mem); + if (IS_ERR(base)) +diff --git a/sound/soc/codecs/cs42l42.c b/sound/soc/codecs/cs42l42.c +index 4fade23887972..8cba3015398b7 100644 +--- a/sound/soc/codecs/cs42l42.c ++++ b/sound/soc/codecs/cs42l42.c +@@ -1618,7 +1618,6 @@ static irqreturn_t cs42l42_irq_thread(int irq, void *data) + unsigned int current_plug_status; + unsigned int current_button_status; + unsigned int i; +- int report = 0; + + mutex_lock(&cs42l42->irq_lock); + if (cs42l42->suspended) { +@@ -1713,13 +1712,15 @@ static irqreturn_t cs42l42_irq_thread(int irq, void *data) + + if (current_button_status & CS42L42_M_DETECT_TF_MASK) { + dev_dbg(cs42l42->dev, "Button released\n"); +- report = 0; ++ snd_soc_jack_report(cs42l42->jack, 0, ++ SND_JACK_BTN_0 | SND_JACK_BTN_1 | ++ SND_JACK_BTN_2 | SND_JACK_BTN_3); + } else if (current_button_status & CS42L42_M_DETECT_FT_MASK) { +- report = cs42l42_handle_button_press(cs42l42); +- ++ snd_soc_jack_report(cs42l42->jack, ++ cs42l42_handle_button_press(cs42l42), ++ SND_JACK_BTN_0 | SND_JACK_BTN_1 | ++ SND_JACK_BTN_2 | SND_JACK_BTN_3); + } +- snd_soc_jack_report(cs42l42->jack, report, SND_JACK_BTN_0 | SND_JACK_BTN_1 | +- SND_JACK_BTN_2 | SND_JACK_BTN_3); + } + } + +diff --git a/sound/soc/qcom/sm8250.c b/sound/soc/qcom/sm8250.c +index 6e1184c8b672a..c48ac107810d4 100644 +--- a/sound/soc/qcom/sm8250.c ++++ b/sound/soc/qcom/sm8250.c +@@ -270,6 +270,7 @@ static int sm8250_platform_probe(struct platform_device *pdev) + if (!card) + return -ENOMEM; + ++ card->owner = THIS_MODULE; + /* Allocate the private data */ + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) +diff --git a/sound/soc/sof/Kconfig b/sound/soc/sof/Kconfig +index 4542868cd730f..39216c09f1597 100644 +--- a/sound/soc/sof/Kconfig ++++ b/sound/soc/sof/Kconfig +@@ -196,6 +196,7 @@ config SND_SOC_SOF_DEBUG_ENABLE_FIRMWARE_TRACE + + config SND_SOC_SOF_DEBUG_IPC_FLOOD_TEST + tristate "SOF enable IPC flood test" ++ depends on SND_SOC_SOF + select SND_SOC_SOF_CLIENT + help + This option enables a separate client device for IPC flood test +@@ -214,6 +215,7 @@ config SND_SOC_SOF_DEBUG_IPC_FLOOD_TEST_NUM + + config SND_SOC_SOF_DEBUG_IPC_MSG_INJECTOR + tristate "SOF enable IPC message injector" ++ depends on SND_SOC_SOF + select SND_SOC_SOF_CLIENT + help + This option enables the IPC message injector which can be used to send +diff --git a/sound/usb/card.c b/sound/usb/card.c +index d356743de2ff9..706d249a9ad6b 100644 +--- a/sound/usb/card.c ++++ b/sound/usb/card.c +@@ -699,7 +699,7 @@ static bool check_delayed_register_option(struct snd_usb_audio *chip, int iface) + if (delayed_register[i] && + sscanf(delayed_register[i], "%x:%x", &id, &inum) == 2 && + id == chip->usb_id) +- return inum != iface; ++ return iface < inum; + } + + return false; +diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c +index f9c921683948d..ff2aa13b7b26f 100644 +--- a/sound/usb/endpoint.c ++++ b/sound/usb/endpoint.c +@@ -758,7 +758,8 @@ bool snd_usb_endpoint_compatible(struct snd_usb_audio *chip, + * The endpoint needs to be closed via snd_usb_endpoint_close() later. + * + * Note that this function doesn't configure the endpoint. The substream +- * needs to set it up later via snd_usb_endpoint_configure(). ++ * needs to set it up later via snd_usb_endpoint_set_params() and ++ * snd_usb_endpoint_prepare(). 
+ */ + struct snd_usb_endpoint * + snd_usb_endpoint_open(struct snd_usb_audio *chip, +@@ -924,6 +925,8 @@ void snd_usb_endpoint_close(struct snd_usb_audio *chip, + endpoint_set_interface(chip, ep, false); + + if (!--ep->opened) { ++ if (ep->clock_ref && !atomic_read(&ep->clock_ref->locked)) ++ ep->clock_ref->rate = 0; + ep->iface = 0; + ep->altsetting = 0; + ep->cur_audiofmt = NULL; +@@ -1290,12 +1293,13 @@ out_of_memory: + /* + * snd_usb_endpoint_set_params: configure an snd_usb_endpoint + * ++ * It's called either from hw_params callback. + * Determine the number of URBs to be used on this endpoint. + * An endpoint must be configured before it can be started. + * An endpoint that is already running can not be reconfigured. + */ +-static int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep) ++int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep) + { + const struct audioformat *fmt = ep->cur_audiofmt; + int err; +@@ -1378,18 +1382,18 @@ static int init_sample_rate(struct snd_usb_audio *chip, + } + + /* +- * snd_usb_endpoint_configure: Configure the endpoint ++ * snd_usb_endpoint_prepare: Prepare the endpoint + * + * This function sets up the EP to be fully usable state. +- * It's called either from hw_params or prepare callback. ++ * It's called either from prepare callback. + * The function checks need_setup flag, and performs nothing unless needed, + * so it's safe to call this multiple times. + * + * This returns zero if unchanged, 1 if the configuration has changed, + * or a negative error code. + */ +-int snd_usb_endpoint_configure(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep) ++int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep) + { + bool iface_first; + int err = 0; +@@ -1410,9 +1414,6 @@ int snd_usb_endpoint_configure(struct snd_usb_audio *chip, + if (err < 0) + goto unlock; + } +- err = snd_usb_endpoint_set_params(chip, ep); +- if (err < 0) +- goto unlock; + goto done; + } + +@@ -1440,10 +1441,6 @@ int snd_usb_endpoint_configure(struct snd_usb_audio *chip, + if (err < 0) + goto unlock; + +- err = snd_usb_endpoint_set_params(chip, ep); +- if (err < 0) +- goto unlock; +- + err = snd_usb_select_mode_quirk(chip, ep->cur_audiofmt); + if (err < 0) + goto unlock; +diff --git a/sound/usb/endpoint.h b/sound/usb/endpoint.h +index 6a9af04cf175a..e67ea28faa54f 100644 +--- a/sound/usb/endpoint.h ++++ b/sound/usb/endpoint.h +@@ -17,8 +17,10 @@ snd_usb_endpoint_open(struct snd_usb_audio *chip, + bool is_sync_ep); + void snd_usb_endpoint_close(struct snd_usb_audio *chip, + struct snd_usb_endpoint *ep); +-int snd_usb_endpoint_configure(struct snd_usb_audio *chip, +- struct snd_usb_endpoint *ep); ++int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep); ++int snd_usb_endpoint_prepare(struct snd_usb_audio *chip, ++ struct snd_usb_endpoint *ep); + int snd_usb_endpoint_get_clock_rate(struct snd_usb_audio *chip, int clock); + + bool snd_usb_endpoint_compatible(struct snd_usb_audio *chip, +diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c +index e692ae04436a5..02035b545f9dd 100644 +--- a/sound/usb/pcm.c ++++ b/sound/usb/pcm.c +@@ -443,17 +443,17 @@ static int configure_endpoints(struct snd_usb_audio *chip, + if (stop_endpoints(subs, false)) + sync_pending_stops(subs); + if (subs->sync_endpoint) { +- err = snd_usb_endpoint_configure(chip, subs->sync_endpoint); ++ err = snd_usb_endpoint_prepare(chip, subs->sync_endpoint); + if (err < 0) + return 
err; + } +- err = snd_usb_endpoint_configure(chip, subs->data_endpoint); ++ err = snd_usb_endpoint_prepare(chip, subs->data_endpoint); + if (err < 0) + return err; + snd_usb_set_format_quirk(subs, subs->cur_audiofmt); + } else { + if (subs->sync_endpoint) { +- err = snd_usb_endpoint_configure(chip, subs->sync_endpoint); ++ err = snd_usb_endpoint_prepare(chip, subs->sync_endpoint); + if (err < 0) + return err; + } +@@ -551,7 +551,13 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, + subs->cur_audiofmt = fmt; + mutex_unlock(&chip->mutex); + +- ret = configure_endpoints(chip, subs); ++ if (subs->sync_endpoint) { ++ ret = snd_usb_endpoint_set_params(chip, subs->sync_endpoint); ++ if (ret < 0) ++ goto unlock; ++ } ++ ++ ret = snd_usb_endpoint_set_params(chip, subs->data_endpoint); + + unlock: + if (ret < 0) +diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c +index 9bfead5efc4c1..5b4d8f5eade20 100644 +--- a/sound/usb/quirks.c ++++ b/sound/usb/quirks.c +@@ -1764,7 +1764,7 @@ bool snd_usb_registration_quirk(struct snd_usb_audio *chip, int iface) + + for (q = registration_quirks; q->usb_id; q++) + if (chip->usb_id == q->usb_id) +- return iface != q->interface; ++ return iface < q->interface; + + /* Register as normal */ + return false; +diff --git a/sound/usb/stream.c b/sound/usb/stream.c +index ceb93d798182c..f10f4e6d3fb85 100644 +--- a/sound/usb/stream.c ++++ b/sound/usb/stream.c +@@ -495,6 +495,10 @@ static int __snd_usb_add_audio_stream(struct snd_usb_audio *chip, + return 0; + } + } ++ ++ if (chip->card->registered) ++ chip->need_delayed_register = true; ++ + /* look for an empty stream */ + list_for_each_entry(as, &chip->pcm_list, list) { + if (as->fmt_type != fp->fmt_type) +@@ -502,9 +506,6 @@ static int __snd_usb_add_audio_stream(struct snd_usb_audio *chip, + subs = &as->substream[stream]; + if (subs->ep_num) + continue; +- if (snd_device_get_state(chip->card, as->pcm) != +- SNDRV_DEV_BUILD) +- chip->need_delayed_register = true; + err = snd_pcm_new_stream(as->pcm, stream, 1); + if (err < 0) + return err; +@@ -1105,7 +1106,7 @@ static int __snd_usb_parse_audio_interface(struct snd_usb_audio *chip, + * Dallas DS4201 workaround: It presents 5 altsettings, but the last + * one misses syncpipe, and does not produce any sound. 
+ */ +- if (chip->usb_id == USB_ID(0x04fa, 0x4201)) ++ if (chip->usb_id == USB_ID(0x04fa, 0x4201) && num >= 4) + num = 4; + + for (i = 0; i < num; i++) { +diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c +index e6c98a6e3908e..6b1bafe267a42 100644 +--- a/tools/lib/perf/evlist.c ++++ b/tools/lib/perf/evlist.c +@@ -486,6 +486,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + if (ops->idx) + ops->idx(evlist, evsel, mp, idx); + ++ pr_debug("idx %d: mmapping fd %d\n", idx, *output); + if (ops->mmap(map, mp, *output, evlist_cpu) < 0) + return -1; + +@@ -494,6 +495,7 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + if (!idx) + perf_evlist__set_mmap_first(evlist, map, overwrite); + } else { ++ pr_debug("idx %d: set output fd %d -> %d\n", idx, fd, *output); + if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0) + return -1; + +@@ -519,6 +521,48 @@ mmap_per_evsel(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + return 0; + } + ++static int ++mmap_per_thread(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, ++ struct perf_mmap_param *mp) ++{ ++ int nr_threads = perf_thread_map__nr(evlist->threads); ++ int nr_cpus = perf_cpu_map__nr(evlist->all_cpus); ++ int cpu, thread, idx = 0; ++ int nr_mmaps = 0; ++ ++ pr_debug("%s: nr cpu values (may include -1) %d nr threads %d\n", ++ __func__, nr_cpus, nr_threads); ++ ++ /* per-thread mmaps */ ++ for (thread = 0; thread < nr_threads; thread++, idx++) { ++ int output = -1; ++ int output_overwrite = -1; ++ ++ if (mmap_per_evsel(evlist, ops, idx, mp, 0, thread, &output, ++ &output_overwrite, &nr_mmaps)) ++ goto out_unmap; ++ } ++ ++ /* system-wide mmaps i.e. per-cpu */ ++ for (cpu = 1; cpu < nr_cpus; cpu++, idx++) { ++ int output = -1; ++ int output_overwrite = -1; ++ ++ if (mmap_per_evsel(evlist, ops, idx, mp, cpu, 0, &output, ++ &output_overwrite, &nr_mmaps)) ++ goto out_unmap; ++ } ++ ++ if (nr_mmaps != evlist->nr_mmaps) ++ pr_err("Miscounted nr_mmaps %d vs %d\n", nr_mmaps, evlist->nr_mmaps); ++ ++ return 0; ++ ++out_unmap: ++ perf_evlist__munmap(evlist); ++ return -1; ++} ++ + static int + mmap_per_cpu(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + struct perf_mmap_param *mp) +@@ -528,6 +572,8 @@ mmap_per_cpu(struct perf_evlist *evlist, struct perf_evlist_mmap_ops *ops, + int nr_mmaps = 0; + int cpu, thread; + ++ pr_debug("%s: nr cpu values %d nr threads %d\n", __func__, nr_cpus, nr_threads); ++ + for (cpu = 0; cpu < nr_cpus; cpu++) { + int output = -1; + int output_overwrite = -1; +@@ -569,6 +615,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, + struct perf_evlist_mmap_ops *ops, + struct perf_mmap_param *mp) + { ++ const struct perf_cpu_map *cpus = evlist->all_cpus; + struct perf_evsel *evsel; + + if (!ops || !ops->get || !ops->mmap) +@@ -588,6 +635,9 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, + if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) + return -ENOMEM; + ++ if (perf_cpu_map__empty(cpus)) ++ return mmap_per_thread(evlist, ops, mp); ++ + return mmap_per_cpu(evlist, ops, mp); + } + +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index 31c719f99f66e..5d87e0b0d85f9 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -162,32 +162,34 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, + + /* + * Unfortunately these have to be hard coded because the noreturn +- * attribute isn't provided in ELF data. 
++ * attribute isn't provided in ELF data. Keep 'em sorted. + */ + static const char * const global_noreturns[] = { ++ "__invalid_creds", ++ "__module_put_and_kthread_exit", ++ "__reiserfs_panic", + "__stack_chk_fail", +- "panic", ++ "__ubsan_handle_builtin_unreachable", ++ "cpu_bringup_and_idle", ++ "cpu_startup_entry", + "do_exit", ++ "do_group_exit", + "do_task_dead", +- "kthread_exit", +- "make_task_dead", +- "__module_put_and_kthread_exit", ++ "ex_handler_msr_mce", ++ "fortify_panic", + "kthread_complete_and_exit", +- "__reiserfs_panic", ++ "kthread_exit", ++ "kunit_try_catch_throw", + "lbug_with_loc", +- "fortify_panic", +- "usercopy_abort", + "machine_real_restart", ++ "make_task_dead", ++ "panic", + "rewind_stack_and_make_dead", +- "kunit_try_catch_throw", +- "xen_start_kernel", +- "cpu_bringup_and_idle", +- "do_group_exit", ++ "sev_es_terminate", ++ "snp_abort", + "stop_this_cpu", +- "__invalid_creds", +- "cpu_startup_entry", +- "__ubsan_handle_builtin_unreachable", +- "ex_handler_msr_mce", ++ "usercopy_abort", ++ "xen_start_kernel", + }; + + if (!func) +diff --git a/tools/perf/arch/x86/util/evlist.c b/tools/perf/arch/x86/util/evlist.c +index 68f681ad54c1e..777bdf182a582 100644 +--- a/tools/perf/arch/x86/util/evlist.c ++++ b/tools/perf/arch/x86/util/evlist.c +@@ -8,8 +8,13 @@ + #define TOPDOWN_L1_EVENTS "{slots,topdown-retiring,topdown-bad-spec,topdown-fe-bound,topdown-be-bound}" + #define TOPDOWN_L2_EVENTS "{slots,topdown-retiring,topdown-bad-spec,topdown-fe-bound,topdown-be-bound,topdown-heavy-ops,topdown-br-mispredict,topdown-fetch-lat,topdown-mem-bound}" + +-int arch_evlist__add_default_attrs(struct evlist *evlist) ++int arch_evlist__add_default_attrs(struct evlist *evlist, ++ struct perf_event_attr *attrs, ++ size_t nr_attrs) + { ++ if (nr_attrs) ++ return __evlist__add_default_attrs(evlist, attrs, nr_attrs); ++ + if (!pmu_have_event("cpu", "slots")) + return 0; + +diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c +index 9a71f0330137e..68c878b4e5e4c 100644 +--- a/tools/perf/builtin-record.c ++++ b/tools/perf/builtin-record.c +@@ -1892,14 +1892,18 @@ static int record__synthesize(struct record *rec, bool tail) + + err = perf_event__synthesize_bpf_events(session, process_synthesized_event, + machine, opts); +- if (err < 0) ++ if (err < 0) { + pr_warning("Couldn't synthesize bpf events.\n"); ++ err = 0; ++ } + + if (rec->opts.synth & PERF_SYNTH_CGROUP) { + err = perf_event__synthesize_cgroups(tool, process_synthesized_event, + machine); +- if (err < 0) ++ if (err < 0) { + pr_warning("Couldn't synthesize cgroup events.\n"); ++ err = 0; ++ } + } + + if (rec->opts.nr_threads_synthesize > 1) { +diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c +index c689054002cca..26a572c160d6f 100644 +--- a/tools/perf/builtin-script.c ++++ b/tools/perf/builtin-script.c +@@ -441,6 +441,9 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) + struct perf_event_attr *attr = &evsel->core.attr; + bool allow_user_set; + ++ if (evsel__is_dummy_event(evsel)) ++ return 0; ++ + if (perf_header__has_feat(&session->header, HEADER_STAT)) + return 0; + +diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c +index 5f0333a8acd8a..82e14faecc3e4 100644 +--- a/tools/perf/builtin-stat.c ++++ b/tools/perf/builtin-stat.c +@@ -1778,6 +1778,9 @@ static int add_default_attributes(void) + (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + }; ++ ++ struct perf_event_attr default_null_attrs[] = {}; ++ + 
/* Set attrs if no event is selected and !null_run: */ + if (stat_config.null_run) + return 0; +@@ -1941,6 +1944,9 @@ setup_metrics: + free(str); + } + ++ if (!stat_config.topdown_level) ++ stat_config.topdown_level = TOPDOWN_MAX_LEVEL; ++ + if (!evsel_list->core.nr_entries) { + if (target__has_cpu(&target)) + default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK; +@@ -1957,9 +1963,8 @@ setup_metrics: + } + if (evlist__add_default_attrs(evsel_list, default_attrs1) < 0) + return -1; +- +- stat_config.topdown_level = TOPDOWN_MAX_LEVEL; +- if (arch_evlist__add_default_attrs(evsel_list) < 0) ++ /* Platform specific attrs */ ++ if (evlist__add_default_attrs(evsel_list, default_null_attrs) < 0) + return -1; + } + +diff --git a/tools/perf/dlfilters/dlfilter-show-cycles.c b/tools/perf/dlfilters/dlfilter-show-cycles.c +index 9eccc97bff82f..6d47298ebe9f6 100644 +--- a/tools/perf/dlfilters/dlfilter-show-cycles.c ++++ b/tools/perf/dlfilters/dlfilter-show-cycles.c +@@ -98,9 +98,9 @@ int filter_event_early(void *data, const struct perf_dlfilter_sample *sample, vo + static void print_vals(__u64 cycles, __u64 delta) + { + if (delta) +- printf("%10llu %10llu ", cycles, delta); ++ printf("%10llu %10llu ", (unsigned long long)cycles, (unsigned long long)delta); + else +- printf("%10llu %10s ", cycles, ""); ++ printf("%10llu %10s ", (unsigned long long)cycles, ""); + } + + int filter_event(void *data, const struct perf_dlfilter_sample *sample, void *ctx) +diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c +index 48af7d379d822..efa5f006b5c61 100644 +--- a/tools/perf/util/evlist.c ++++ b/tools/perf/util/evlist.c +@@ -342,9 +342,14 @@ int __evlist__add_default_attrs(struct evlist *evlist, struct perf_event_attr *a + return evlist__add_attrs(evlist, attrs, nr_attrs); + } + +-__weak int arch_evlist__add_default_attrs(struct evlist *evlist __maybe_unused) ++__weak int arch_evlist__add_default_attrs(struct evlist *evlist, ++ struct perf_event_attr *attrs, ++ size_t nr_attrs) + { +- return 0; ++ if (!nr_attrs) ++ return 0; ++ ++ return __evlist__add_default_attrs(evlist, attrs, nr_attrs); + } + + struct evsel *evlist__find_tracepoint_by_id(struct evlist *evlist, int id) +diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h +index 1bde9ccf4e7da..129095c0fe6d3 100644 +--- a/tools/perf/util/evlist.h ++++ b/tools/perf/util/evlist.h +@@ -107,10 +107,13 @@ static inline int evlist__add_default(struct evlist *evlist) + int __evlist__add_default_attrs(struct evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs); + ++int arch_evlist__add_default_attrs(struct evlist *evlist, ++ struct perf_event_attr *attrs, ++ size_t nr_attrs); ++ + #define evlist__add_default_attrs(evlist, array) \ +- __evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) ++ arch_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) + +-int arch_evlist__add_default_attrs(struct evlist *evlist); + struct evsel *arch_evlist__leader(struct list_head *list); + + int evlist__add_dummy(struct evlist *evlist); diff --git a/sys-kernel/pinephone-sources/files/5.19.9-10.patch b/sys-kernel/pinephone-sources/files/5.19.9-10.patch new file mode 100644 index 0000000..331692b --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5.19.9-10.patch @@ -0,0 +1,1723 @@ +diff --git a/Documentation/devicetree/bindings/iio/gyroscope/bosch,bmg160.yaml b/Documentation/devicetree/bindings/iio/gyroscope/bosch,bmg160.yaml +index b6bbc312a7cf7..1414ba9977c16 100644 +--- a/Documentation/devicetree/bindings/iio/gyroscope/bosch,bmg160.yaml 
++++ b/Documentation/devicetree/bindings/iio/gyroscope/bosch,bmg160.yaml +@@ -24,8 +24,10 @@ properties: + + interrupts: + minItems: 1 ++ maxItems: 2 + description: + Should be configured with type IRQ_TYPE_EDGE_RISING. ++ If two interrupts are provided, expected order is INT1 and INT2. + + required: + - compatible +diff --git a/Documentation/input/joydev/joystick.rst b/Documentation/input/joydev/joystick.rst +index f615906a0821b..6d721396717a2 100644 +--- a/Documentation/input/joydev/joystick.rst ++++ b/Documentation/input/joydev/joystick.rst +@@ -517,6 +517,7 @@ All I-Force devices are supported by the iforce module. This includes: + * AVB Mag Turbo Force + * AVB Top Shot Pegasus + * AVB Top Shot Force Feedback Racing Wheel ++* Boeder Force Feedback Wheel + * Logitech WingMan Force + * Logitech WingMan Force Wheel + * Guillemot Race Leader Force Feedback +diff --git a/Makefile b/Makefile +index 1f27c4bd09e67..33a9b6b547c47 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 5 + PATCHLEVEL = 19 +-SUBLEVEL = 9 ++SUBLEVEL = 10 + EXTRAVERSION = + NAME = Superb Owl + +diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig +index 62b5b07fa4e1c..ca64bf5f5b038 100644 +--- a/arch/loongarch/Kconfig ++++ b/arch/loongarch/Kconfig +@@ -36,6 +36,7 @@ config LOONGARCH + select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION ++ select ARCH_KEEP_MEMBLOCK + select ARCH_MIGHT_HAVE_PC_PARPORT + select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_SPARSEMEM_ENABLE +diff --git a/arch/loongarch/include/asm/acpi.h b/arch/loongarch/include/asm/acpi.h +index 62044cd5b7bc5..825c2519b9d1f 100644 +--- a/arch/loongarch/include/asm/acpi.h ++++ b/arch/loongarch/include/asm/acpi.h +@@ -15,7 +15,7 @@ extern int acpi_pci_disabled; + extern int acpi_noirq; + + #define acpi_os_ioremap acpi_os_ioremap +-void __init __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); ++void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); + + static inline void disable_acpi(void) + { +diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c +index bb729ee8a2370..796a24055a942 100644 +--- a/arch/loongarch/kernel/acpi.c ++++ b/arch/loongarch/kernel/acpi.c +@@ -113,7 +113,7 @@ void __init __acpi_unmap_table(void __iomem *map, unsigned long size) + early_memunmap(map, size); + } + +-void __init __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) ++void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) + { + if (!memblock_is_memory(phys)) + return ioremap(phys, size); +diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c +index 7094a68c9b832..3c3fbff0b8f86 100644 +--- a/arch/loongarch/mm/init.c ++++ b/arch/loongarch/mm/init.c +@@ -131,18 +131,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params) + return ret; + } + +-#ifdef CONFIG_NUMA +-int memory_add_physaddr_to_nid(u64 start) +-{ +- int nid; +- +- nid = pa_to_nid(start); +- return nid; +-} +-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +-#endif +- +-#ifdef CONFIG_MEMORY_HOTREMOVE + void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) + { + unsigned long start_pfn = start >> PAGE_SHIFT; +@@ -154,6 +142,16 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) + page += vmem_altmap_offset(altmap); + __remove_pages(start_pfn, nr_pages, altmap); + } ++ ++#ifdef CONFIG_NUMA 
++int memory_add_physaddr_to_nid(u64 start) ++{ ++ int nid; ++ ++ nid = pa_to_nid(start); ++ return nid; ++} ++EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + #endif + #endif + +diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c +index 356226c7ebbdc..aa1ba803659cd 100644 +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -5907,47 +5907,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + const struct kvm_memory_slot *memslot, + int start_level) + { +- bool flush = false; +- + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); +- flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, +- start_level, KVM_MAX_HUGEPAGE_LEVEL, +- false); ++ slot_handle_level(kvm, memslot, slot_rmap_write_protect, ++ start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + write_unlock(&kvm->mmu_lock); + } + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); +- flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); ++ kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + read_unlock(&kvm->mmu_lock); + } +- +- /* +- * Flush TLBs if any SPTEs had to be write-protected to ensure that +- * guest writes are reflected in the dirty bitmap before the memslot +- * update completes, i.e. before enabling dirty logging is visible to +- * userspace. +- * +- * Perform the TLB flush outside the mmu_lock to reduce the amount of +- * time the lock is held. However, this does mean that another CPU can +- * now grab mmu_lock and encounter a write-protected SPTE while CPUs +- * still have a writable mapping for the associated GFN in their TLB. +- * +- * This is safe but requires KVM to be careful when making decisions +- * based on the write-protection status of an SPTE. Specifically, KVM +- * also write-protects SPTEs to monitor changes to guest page tables +- * during shadow paging, and must guarantee no CPUs can write to those +- * page before the lock is dropped. As mentioned in the previous +- * paragraph, a write-protected SPTE is no guarantee that CPU cannot +- * perform writes. So to determine if a TLB flush is truly required, KVM +- * will clear a separate software-only bit (MMU-writable) and skip the +- * flush if-and-only-if this bit was already clear. +- * +- * See is_writable_pte() for more details. +- */ +- if (flush) +- kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); + } + + /* Must be called with the mmu_lock held in write-mode. */ +@@ -6070,32 +6041,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + const struct kvm_memory_slot *memslot) + { +- bool flush = false; +- + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + /* + * Clear dirty bits only on 4k SPTEs since the legacy MMU only + * support dirty logging at a 4k granularity. + */ +- flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); ++ slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + write_unlock(&kvm->mmu_lock); + } + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); +- flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); ++ kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + read_unlock(&kvm->mmu_lock); + } + + /* ++ * The caller will flush the TLBs after this function returns. ++ * + * It's also safe to flush TLBs out of mmu lock here as currently this + * function is only used for dirty logging, in which case flushing TLB + * out of mmu lock also guarantees no dirty pages will be lost in + * dirty_bitmap. 
+ */ +- if (flush) +- kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); + } + + void kvm_mmu_zap_all(struct kvm *kvm) +diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h +index f80dbb628df57..e09bdcf1e47c5 100644 +--- a/arch/x86/kvm/mmu/spte.h ++++ b/arch/x86/kvm/mmu/spte.h +@@ -326,7 +326,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, + } + + /* +- * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: ++ * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: + * + * 1. To intercept writes for dirty logging. KVM write-protects huge pages + * so that they can be split be split down into the dirty logging +@@ -344,8 +344,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, + * read-only memslot or guest memory backed by a read-only VMA. Writes to + * such pages are disallowed entirely. + * +- * To keep track of why a given SPTE is write-protected, KVM uses 2 +- * software-only bits in the SPTE: ++ * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this ++ * case, the SPTE is access-protected, not just write-protected! ++ * ++ * For cases #1 and #4, KVM can safely make such SPTEs writable without taking ++ * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. ++ * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits ++ * in the SPTE: + * + * shadow_mmu_writable_mask, aka MMU-writable - + * Cleared on SPTEs that KVM is currently write-protecting for shadow paging +@@ -374,7 +379,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, + * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging + * (which does not clear the MMU-writable bit), does not flush TLBs before + * dropping the lock, as it only needs to synchronize guest writes with the +- * dirty bitmap. ++ * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for ++ * access-tracking via the clear_young() MMU notifier also does not flush TLBs. + * + * So, there is the problem: clearing the MMU-writable bit can encounter a + * write-protected SPTE while CPUs still have writable mappings for that SPTE +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 55de0d1981e52..5b36866528568 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -12265,6 +12265,50 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + } else { + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); + } ++ ++ /* ++ * Unconditionally flush the TLBs after enabling dirty logging. ++ * A flush is almost always going to be necessary (see below), ++ * and unconditionally flushing allows the helpers to omit ++ * the subtly complex checks when removing write access. ++ * ++ * Do the flush outside of mmu_lock to reduce the amount of ++ * time mmu_lock is held. Flushing after dropping mmu_lock is ++ * safe as KVM only needs to guarantee the slot is fully ++ * write-protected before returning to userspace, i.e. before ++ * userspace can consume the dirty status. ++ * ++ * Flushing outside of mmu_lock requires KVM to be careful when ++ * making decisions based on writable status of an SPTE, e.g. a ++ * !writable SPTE doesn't guarantee a CPU can't perform writes. ++ * ++ * Specifically, KVM also write-protects guest page tables to ++ * monitor changes when using shadow paging, and must guarantee ++ * no CPUs can write to those page before mmu_lock is dropped. 
++ * Because CPUs may have stale TLB entries at this point, a ++ * !writable SPTE doesn't guarantee CPUs can't perform writes. ++ * ++ * KVM also allows making SPTES writable outside of mmu_lock, ++ * e.g. to allow dirty logging without taking mmu_lock. ++ * ++ * To handle these scenarios, KVM uses a separate software-only ++ * bit (MMU-writable) to track if a SPTE is !writable due to ++ * a guest page table being write-protected (KVM clears the ++ * MMU-writable flag when write-protecting for shadow paging). ++ * ++ * The use of MMU-writable is also the primary motivation for ++ * the unconditional flush. Because KVM must guarantee that a ++ * CPU doesn't contain stale, writable TLB entries for a ++ * !MMU-writable SPTE, KVM must flush if it encounters any ++ * MMU-writable SPTE regardless of whether the actual hardware ++ * writable bit was set. I.e. KVM is almost guaranteed to need ++ * to flush, while unconditionally flushing allows the "remove ++ * write access" helpers to ignore MMU-writable entirely. ++ * ++ * See is_writable_pte() for more details (the case involving ++ * access-tracked SPTEs is particularly relevant). ++ */ ++ kvm_arch_flush_remote_tlbs_memslot(kvm, new); + } + } + +diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c +index c2d4947844250..510cdec375c4d 100644 +--- a/drivers/acpi/resource.c ++++ b/drivers/acpi/resource.c +@@ -416,6 +416,16 @@ static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity, + { + int i; + ++#ifdef CONFIG_X86 ++ /* ++ * IRQ override isn't needed on modern AMD Zen systems and ++ * this override breaks active low IRQs on AMD Ryzen 6000 and ++ * newer systems. Skip it. ++ */ ++ if (boot_cpu_has(X86_FEATURE_ZEN)) ++ return false; ++#endif ++ + for (i = 0; i < ARRAY_SIZE(skip_override_table); i++) { + const struct irq_override_cmp *entry = &skip_override_table[i]; + +diff --git a/drivers/gpio/gpio-104-dio-48e.c b/drivers/gpio/gpio-104-dio-48e.c +index f118ad9bcd33d..0e95351d47d49 100644 +--- a/drivers/gpio/gpio-104-dio-48e.c ++++ b/drivers/gpio/gpio-104-dio-48e.c +@@ -271,6 +271,7 @@ static void dio48e_irq_mask(struct irq_data *data) + dio48egpio->irq_mask &= ~BIT(0); + else + dio48egpio->irq_mask &= ~BIT(1); ++ gpiochip_disable_irq(chip, offset); + + if (!dio48egpio->irq_mask) + /* disable interrupts */ +@@ -298,6 +299,7 @@ static void dio48e_irq_unmask(struct irq_data *data) + iowrite8(0x00, dio48egpio->base + 0xB); + } + ++ gpiochip_enable_irq(chip, offset); + if (offset == 19) + dio48egpio->irq_mask |= BIT(0); + else +@@ -320,12 +322,14 @@ static int dio48e_irq_set_type(struct irq_data *data, unsigned int flow_type) + return 0; + } + +-static struct irq_chip dio48e_irqchip = { ++static const struct irq_chip dio48e_irqchip = { + .name = "104-dio-48e", + .irq_ack = dio48e_irq_ack, + .irq_mask = dio48e_irq_mask, + .irq_unmask = dio48e_irq_unmask, +- .irq_set_type = dio48e_irq_set_type ++ .irq_set_type = dio48e_irq_set_type, ++ .flags = IRQCHIP_IMMUTABLE, ++ GPIOCHIP_IRQ_RESOURCE_HELPERS, + }; + + static irqreturn_t dio48e_irq_handler(int irq, void *dev_id) +@@ -414,7 +418,7 @@ static int dio48e_probe(struct device *dev, unsigned int id) + dio48egpio->chip.set_multiple = dio48e_gpio_set_multiple; + + girq = &dio48egpio->chip.irq; +- girq->chip = &dio48e_irqchip; ++ gpio_irq_chip_set_chip(girq, &dio48e_irqchip); + /* This will let us handle the parent IRQ in the driver */ + girq->parent_handler = NULL; + girq->num_parents = 0; +diff --git a/drivers/gpio/gpio-104-idio-16.c b/drivers/gpio/gpio-104-idio-16.c +index 
45f7ad8573e19..a8b7c8eafac5a 100644 +--- a/drivers/gpio/gpio-104-idio-16.c ++++ b/drivers/gpio/gpio-104-idio-16.c +@@ -150,10 +150,11 @@ static void idio_16_irq_mask(struct irq_data *data) + { + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); +- const unsigned long mask = BIT(irqd_to_hwirq(data)); ++ const unsigned long offset = irqd_to_hwirq(data); + unsigned long flags; + +- idio16gpio->irq_mask &= ~mask; ++ idio16gpio->irq_mask &= ~BIT(offset); ++ gpiochip_disable_irq(chip, offset); + + if (!idio16gpio->irq_mask) { + raw_spin_lock_irqsave(&idio16gpio->lock, flags); +@@ -168,11 +169,12 @@ static void idio_16_irq_unmask(struct irq_data *data) + { + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct idio_16_gpio *const idio16gpio = gpiochip_get_data(chip); +- const unsigned long mask = BIT(irqd_to_hwirq(data)); ++ const unsigned long offset = irqd_to_hwirq(data); + const unsigned long prev_irq_mask = idio16gpio->irq_mask; + unsigned long flags; + +- idio16gpio->irq_mask |= mask; ++ gpiochip_enable_irq(chip, offset); ++ idio16gpio->irq_mask |= BIT(offset); + + if (!prev_irq_mask) { + raw_spin_lock_irqsave(&idio16gpio->lock, flags); +@@ -193,12 +195,14 @@ static int idio_16_irq_set_type(struct irq_data *data, unsigned int flow_type) + return 0; + } + +-static struct irq_chip idio_16_irqchip = { ++static const struct irq_chip idio_16_irqchip = { + .name = "104-idio-16", + .irq_ack = idio_16_irq_ack, + .irq_mask = idio_16_irq_mask, + .irq_unmask = idio_16_irq_unmask, +- .irq_set_type = idio_16_irq_set_type ++ .irq_set_type = idio_16_irq_set_type, ++ .flags = IRQCHIP_IMMUTABLE, ++ GPIOCHIP_IRQ_RESOURCE_HELPERS, + }; + + static irqreturn_t idio_16_irq_handler(int irq, void *dev_id) +@@ -275,7 +279,7 @@ static int idio_16_probe(struct device *dev, unsigned int id) + idio16gpio->out_state = 0xFFFF; + + girq = &idio16gpio->chip.irq; +- girq->chip = &idio_16_irqchip; ++ gpio_irq_chip_set_chip(girq, &idio_16_irqchip); + /* This will let us handle the parent IRQ in the driver */ + girq->parent_handler = NULL; + girq->num_parents = 0; +diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c +index 8943cea927642..a2e505a7545cd 100644 +--- a/drivers/gpio/gpio-mockup.c ++++ b/drivers/gpio/gpio-mockup.c +@@ -373,6 +373,13 @@ static void gpio_mockup_debugfs_setup(struct device *dev, + } + } + ++static void gpio_mockup_debugfs_cleanup(void *data) ++{ ++ struct gpio_mockup_chip *chip = data; ++ ++ debugfs_remove_recursive(chip->dbg_dir); ++} ++ + static void gpio_mockup_dispose_mappings(void *data) + { + struct gpio_mockup_chip *chip = data; +@@ -455,7 +462,7 @@ static int gpio_mockup_probe(struct platform_device *pdev) + + gpio_mockup_debugfs_setup(dev, chip); + +- return 0; ++ return devm_add_action_or_reset(dev, gpio_mockup_debugfs_cleanup, chip); + } + + static const struct of_device_id gpio_mockup_of_match[] = { +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c +index ecada5eadfe35..e325150879df7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c +@@ -66,10 +66,15 @@ static bool is_fru_eeprom_supported(struct amdgpu_device *adev) + return true; + case CHIP_SIENNA_CICHLID: + if (strnstr(atom_ctx->vbios_version, "D603", ++ sizeof(atom_ctx->vbios_version))) { ++ if (strnstr(atom_ctx->vbios_version, "D603GLXE", + sizeof(atom_ctx->vbios_version))) +- return true; +- else ++ return 
false; ++ else ++ return true; ++ } else { + return false; ++ } + default: + return false; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +index 2b00f8fe15a89..b19bf0c3f3737 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +@@ -2372,7 +2372,7 @@ static int psp_load_smu_fw(struct psp_context *psp) + static bool fw_load_skip_check(struct psp_context *psp, + struct amdgpu_firmware_info *ucode) + { +- if (!ucode->fw) ++ if (!ucode->fw || !ucode->ucode_size) + return true; + + if (ucode->ucode_id == AMDGPU_UCODE_ID_SMC && +diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +index 9cde13b07dd26..d9a5209aa8433 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +@@ -382,11 +382,27 @@ static int smu_v13_0_7_append_powerplay_table(struct smu_context *smu) + return 0; + } + ++static int smu_v13_0_7_get_pptable_from_pmfw(struct smu_context *smu, ++ void **table, ++ uint32_t *size) ++{ ++ struct smu_table_context *smu_table = &smu->smu_table; ++ void *combo_pptable = smu_table->combo_pptable; ++ int ret = 0; ++ ++ ret = smu_cmn_get_combo_pptable(smu); ++ if (ret) ++ return ret; ++ ++ *table = combo_pptable; ++ *size = sizeof(struct smu_13_0_7_powerplay_table); ++ ++ return 0; ++} + + static int smu_v13_0_7_setup_pptable(struct smu_context *smu) + { + struct smu_table_context *smu_table = &smu->smu_table; +- void *combo_pptable = smu_table->combo_pptable; + struct amdgpu_device *adev = smu->adev; + int ret = 0; + +@@ -395,18 +411,11 @@ static int smu_v13_0_7_setup_pptable(struct smu_context *smu) + * be used directly by driver. To get the raw pptable, we need to + * rely on the combo pptable(and its revelant SMU message). + */ +- if (adev->scpm_enabled) { +- ret = smu_cmn_get_combo_pptable(smu); +- if (ret) +- return ret; +- +- smu->smu_table.power_play_table = combo_pptable; +- smu->smu_table.power_play_table_size = sizeof(struct smu_13_0_7_powerplay_table); +- } else { +- ret = smu_v13_0_setup_pptable(smu); +- if (ret) +- return ret; +- } ++ ret = smu_v13_0_7_get_pptable_from_pmfw(smu, ++ &smu_table->power_play_table, ++ &smu_table->power_play_table_size); ++ if (ret) ++ return ret; + + ret = smu_v13_0_7_store_powerplay_table(smu); + if (ret) +diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c +index a92ffde53f0b3..db2f847c8535f 100644 +--- a/drivers/gpu/drm/msm/msm_rd.c ++++ b/drivers/gpu/drm/msm/msm_rd.c +@@ -196,6 +196,9 @@ static int rd_open(struct inode *inode, struct file *file) + file->private_data = rd; + rd->open = true; + ++ /* Reset fifo to clear any previously unread data: */ ++ rd->fifo.head = rd->fifo.tail = 0; ++ + /* the parsing tools need to know gpu-id to know which + * register database to load. + * +diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.h b/drivers/hid/intel-ish-hid/ishtp-hid.h +index 6a5cc11aefd89..35dddc5015b37 100644 +--- a/drivers/hid/intel-ish-hid/ishtp-hid.h ++++ b/drivers/hid/intel-ish-hid/ishtp-hid.h +@@ -105,7 +105,7 @@ struct report_list { + * @multi_packet_cnt: Count of fragmented packet count + * + * This structure is used to store completion flags and per client data like +- * like report description, number of HID devices etc. ++ * report description, number of HID devices etc. 
+ */ + struct ishtp_cl_data { + /* completion flags */ +diff --git a/drivers/hid/intel-ish-hid/ishtp/client.c b/drivers/hid/intel-ish-hid/ishtp/client.c +index 405e0d5212cc8..df0a825694f52 100644 +--- a/drivers/hid/intel-ish-hid/ishtp/client.c ++++ b/drivers/hid/intel-ish-hid/ishtp/client.c +@@ -626,13 +626,14 @@ static void ishtp_cl_read_complete(struct ishtp_cl_rb *rb) + } + + /** +- * ipc_tx_callback() - IPC tx callback function ++ * ipc_tx_send() - IPC tx send function + * @prm: Pointer to client device instance + * +- * Send message over IPC either first time or on callback on previous message +- * completion ++ * Send message over IPC. Message will be split into fragments ++ * if message size is bigger than IPC FIFO size, and all ++ * fragments will be sent one by one. + */ +-static void ipc_tx_callback(void *prm) ++static void ipc_tx_send(void *prm) + { + struct ishtp_cl *cl = prm; + struct ishtp_cl_tx_ring *cl_msg; +@@ -677,32 +678,41 @@ static void ipc_tx_callback(void *prm) + list); + rem = cl_msg->send_buf.size - cl->tx_offs; + +- ishtp_hdr.host_addr = cl->host_client_id; +- ishtp_hdr.fw_addr = cl->fw_client_id; +- ishtp_hdr.reserved = 0; +- pmsg = cl_msg->send_buf.data + cl->tx_offs; ++ while (rem > 0) { ++ ishtp_hdr.host_addr = cl->host_client_id; ++ ishtp_hdr.fw_addr = cl->fw_client_id; ++ ishtp_hdr.reserved = 0; ++ pmsg = cl_msg->send_buf.data + cl->tx_offs; ++ ++ if (rem <= dev->mtu) { ++ /* Last fragment or only one packet */ ++ ishtp_hdr.length = rem; ++ ishtp_hdr.msg_complete = 1; ++ /* Submit to IPC queue with no callback */ ++ ishtp_write_message(dev, &ishtp_hdr, pmsg); ++ cl->tx_offs = 0; ++ cl->sending = 0; + +- if (rem <= dev->mtu) { +- ishtp_hdr.length = rem; +- ishtp_hdr.msg_complete = 1; +- cl->sending = 0; +- list_del_init(&cl_msg->list); /* Must be before write */ +- spin_unlock_irqrestore(&cl->tx_list_spinlock, tx_flags); +- /* Submit to IPC queue with no callback */ +- ishtp_write_message(dev, &ishtp_hdr, pmsg); +- spin_lock_irqsave(&cl->tx_free_list_spinlock, tx_free_flags); +- list_add_tail(&cl_msg->list, &cl->tx_free_list.list); +- ++cl->tx_ring_free_size; +- spin_unlock_irqrestore(&cl->tx_free_list_spinlock, +- tx_free_flags); +- } else { +- /* Send IPC fragment */ +- spin_unlock_irqrestore(&cl->tx_list_spinlock, tx_flags); +- cl->tx_offs += dev->mtu; +- ishtp_hdr.length = dev->mtu; +- ishtp_hdr.msg_complete = 0; +- ishtp_send_msg(dev, &ishtp_hdr, pmsg, ipc_tx_callback, cl); ++ break; ++ } else { ++ /* Send ipc fragment */ ++ ishtp_hdr.length = dev->mtu; ++ ishtp_hdr.msg_complete = 0; ++ /* All fregments submitted to IPC queue with no callback */ ++ ishtp_write_message(dev, &ishtp_hdr, pmsg); ++ cl->tx_offs += dev->mtu; ++ rem = cl_msg->send_buf.size - cl->tx_offs; ++ } + } ++ ++ list_del_init(&cl_msg->list); ++ spin_unlock_irqrestore(&cl->tx_list_spinlock, tx_flags); ++ ++ spin_lock_irqsave(&cl->tx_free_list_spinlock, tx_free_flags); ++ list_add_tail(&cl_msg->list, &cl->tx_free_list.list); ++ ++cl->tx_ring_free_size; ++ spin_unlock_irqrestore(&cl->tx_free_list_spinlock, ++ tx_free_flags); + } + + /** +@@ -720,7 +730,7 @@ static void ishtp_cl_send_msg_ipc(struct ishtp_device *dev, + return; + + cl->tx_offs = 0; +- ipc_tx_callback(cl); ++ ipc_tx_send(cl); + ++cl->send_msg_cnt_ipc; + } + +diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c +index d003ad864ee44..a6e5d350a94ce 100644 +--- a/drivers/infiniband/hw/irdma/uk.c ++++ b/drivers/infiniband/hw/irdma/uk.c +@@ -497,7 +497,8 @@ int irdma_uk_send(struct irdma_qp_uk *qp, struct 
irdma_post_sq_info *info, + FIELD_PREP(IRDMAQPSQ_IMMDATA, info->imm_data)); + i = 0; + } else { +- qp->wqe_ops.iw_set_fragment(wqe, 0, op_info->sg_list, ++ qp->wqe_ops.iw_set_fragment(wqe, 0, ++ frag_cnt ? op_info->sg_list : NULL, + qp->swqe_polarity); + i = 1; + } +diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c +index 08371a80fdc26..be189e0525de6 100644 +--- a/drivers/infiniband/hw/mlx5/cq.c ++++ b/drivers/infiniband/hw/mlx5/cq.c +@@ -523,6 +523,10 @@ repoll: + "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); ++ if (wc->status != IB_WC_WR_FLUSH_ERR && ++ (*cur_qp)->type == MLX5_IB_QPT_REG_UMR) ++ dev->umrc.state = MLX5_UMR_STATE_RECOVER; ++ + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); +diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c +index 63c89a72cc352..bb13164124fdb 100644 +--- a/drivers/infiniband/hw/mlx5/main.c ++++ b/drivers/infiniband/hw/mlx5/main.c +@@ -4336,7 +4336,7 @@ static int mlx5r_probe(struct auxiliary_device *adev, + dev->mdev = mdev; + dev->num_ports = num_ports; + +- if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_init_enabled(mdev)) ++ if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) + profile = &raw_eth_profile; + else + profile = &pf_profile; +diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h +index 998b67509a533..c2cca032a6ed4 100644 +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -717,13 +717,24 @@ struct mlx5_ib_umr_context { + struct completion done; + }; + ++enum { ++ MLX5_UMR_STATE_UNINIT, ++ MLX5_UMR_STATE_ACTIVE, ++ MLX5_UMR_STATE_RECOVER, ++ MLX5_UMR_STATE_ERR, ++}; ++ + struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; +- /* control access to UMR QP ++ /* Protects from UMR QP overflow + */ + struct semaphore sem; ++ /* Protects from using UMR while the UMR is not active ++ */ ++ struct mutex lock; ++ unsigned int state; + }; + + struct mlx5_cache_ent { +diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c +index 3a48364c09181..d5105b5c9979b 100644 +--- a/drivers/infiniband/hw/mlx5/umr.c ++++ b/drivers/infiniband/hw/mlx5/umr.c +@@ -176,6 +176,8 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); ++ mutex_init(&dev->umrc.lock); ++ dev->umrc.state = MLX5_UMR_STATE_ACTIVE; + + return 0; + +@@ -190,11 +192,38 @@ destroy_pd: + + void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) + { ++ if (dev->umrc.state == MLX5_UMR_STATE_UNINIT) ++ return; + ib_destroy_qp(dev->umrc.qp); + ib_free_cq(dev->umrc.cq); + ib_dealloc_pd(dev->umrc.pd); + } + ++static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) ++{ ++ struct umr_common *umrc = &dev->umrc; ++ struct ib_qp_attr attr; ++ int err; ++ ++ attr.qp_state = IB_QPS_RESET; ++ err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); ++ if (err) { ++ mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); ++ goto err; ++ } ++ ++ err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); ++ if (err) ++ goto err; ++ ++ umrc->state = MLX5_UMR_STATE_ACTIVE; ++ return 0; ++ ++err: ++ umrc->state = MLX5_UMR_STATE_ERR; ++ return err; ++} ++ + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, + struct mlx5r_umr_wqe *wqe, bool with_data) + { +@@ -231,7 +260,7 @@ static int 
mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, + + id.ib_cqe = cqe; + mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, +- MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR); ++ MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR); + + mlx5r_ring_db(qp, 1, ctrl); + +@@ -270,17 +299,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, + mlx5r_umr_init_context(&umr_context); + + down(&umrc->sem); +- err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, +- with_data); +- if (err) +- mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); +- else { +- wait_for_completion(&umr_context.done); +- if (umr_context.status != IB_WC_SUCCESS) { +- mlx5_ib_warn(dev, "reg umr failed (%u)\n", +- umr_context.status); ++ while (true) { ++ mutex_lock(&umrc->lock); ++ if (umrc->state == MLX5_UMR_STATE_ERR) { ++ mutex_unlock(&umrc->lock); + err = -EFAULT; ++ break; ++ } ++ ++ if (umrc->state == MLX5_UMR_STATE_RECOVER) { ++ mutex_unlock(&umrc->lock); ++ usleep_range(3000, 5000); ++ continue; ++ } ++ ++ err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, ++ with_data); ++ mutex_unlock(&umrc->lock); ++ if (err) { ++ mlx5_ib_warn(dev, "UMR post send failed, err %d\n", ++ err); ++ break; + } ++ ++ wait_for_completion(&umr_context.done); ++ ++ if (umr_context.status == IB_WC_SUCCESS) ++ break; ++ ++ if (umr_context.status == IB_WC_WR_FLUSH_ERR) ++ continue; ++ ++ WARN_ON_ONCE(1); ++ mlx5_ib_warn(dev, ++ "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n", ++ umr_context.status); ++ mutex_lock(&umrc->lock); ++ err = mlx5r_umr_recover(dev); ++ mutex_unlock(&umrc->lock); ++ if (err) ++ mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", ++ err); ++ err = -EFAULT; ++ break; + } + up(&umrc->sem); + return err; +diff --git a/drivers/input/joystick/iforce/iforce-main.c b/drivers/input/joystick/iforce/iforce-main.c +index b2a68bc9f0b4d..b86de1312512b 100644 +--- a/drivers/input/joystick/iforce/iforce-main.c ++++ b/drivers/input/joystick/iforce/iforce-main.c +@@ -50,6 +50,7 @@ static struct iforce_device iforce_device[] = { + { 0x046d, 0xc291, "Logitech WingMan Formula Force", btn_wheel, abs_wheel, ff_iforce }, + { 0x05ef, 0x020a, "AVB Top Shot Pegasus", btn_joystick_avb, abs_avb_pegasus, ff_iforce }, + { 0x05ef, 0x8884, "AVB Mag Turbo Force", btn_wheel, abs_wheel, ff_iforce }, ++ { 0x05ef, 0x8886, "Boeder Force Feedback Wheel", btn_wheel, abs_wheel, ff_iforce }, + { 0x05ef, 0x8888, "AVB Top Shot Force Feedback Racing Wheel", btn_wheel, abs_wheel, ff_iforce }, //? + { 0x061c, 0xc0a4, "ACT LABS Force RS", btn_wheel, abs_wheel, ff_iforce }, //? 
+ { 0x061c, 0xc084, "ACT LABS Force RS", btn_wheel, abs_wheel, ff_iforce }, +diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c +index 40ac3a78d90ef..c0464959cbcdb 100644 +--- a/drivers/iommu/intel/iommu.c ++++ b/drivers/iommu/intel/iommu.c +@@ -168,38 +168,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re) + return re->hi & VTD_PAGE_MASK; + } + +-static inline void context_clear_pasid_enable(struct context_entry *context) +-{ +- context->lo &= ~(1ULL << 11); +-} +- +-static inline bool context_pasid_enabled(struct context_entry *context) +-{ +- return !!(context->lo & (1ULL << 11)); +-} +- +-static inline void context_set_copied(struct context_entry *context) +-{ +- context->hi |= (1ull << 3); +-} +- +-static inline bool context_copied(struct context_entry *context) +-{ +- return !!(context->hi & (1ULL << 3)); +-} +- +-static inline bool __context_present(struct context_entry *context) +-{ +- return (context->lo & 1); +-} +- +-bool context_present(struct context_entry *context) +-{ +- return context_pasid_enabled(context) ? +- __context_present(context) : +- __context_present(context) && !context_copied(context); +-} +- + static inline void context_set_present(struct context_entry *context) + { + context->lo |= 1; +@@ -247,6 +215,26 @@ static inline void context_clear_entry(struct context_entry *context) + context->hi = 0; + } + ++static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) ++{ ++ if (!iommu->copied_tables) ++ return false; ++ ++ return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); ++} ++ ++static inline void ++set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) ++{ ++ set_bit(((long)bus << 8) | devfn, iommu->copied_tables); ++} ++ ++static inline void ++clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) ++{ ++ clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); ++} ++ + /* + * This domain is a statically identity mapping domain. + * 1. This domain creats a static 1:1 mapping to all usable memory. +@@ -644,6 +632,13 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, + struct context_entry *context; + u64 *entry; + ++ /* ++ * Except that the caller requested to allocate a new entry, ++ * returning a copied context entry makes no sense. ++ */ ++ if (!alloc && context_copied(iommu, bus, devfn)) ++ return NULL; ++ + entry = &root->lo; + if (sm_supported(iommu)) { + if (devfn >= 0x80) { +@@ -1770,6 +1765,11 @@ static void free_dmar_iommu(struct intel_iommu *iommu) + iommu->domain_ids = NULL; + } + ++ if (iommu->copied_tables) { ++ bitmap_free(iommu->copied_tables); ++ iommu->copied_tables = NULL; ++ } ++ + g_iommus[iommu->seq_id] = NULL; + + /* free context mapping */ +@@ -1978,7 +1978,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, + goto out_unlock; + + ret = 0; +- if (context_present(context)) ++ if (context_present(context) && !context_copied(iommu, bus, devfn)) + goto out_unlock; + + /* +@@ -1990,7 +1990,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. 
+ */ +- if (context_copied(context)) { ++ if (context_copied(iommu, bus, devfn)) { + u16 did_old = context_domain_id(context); + + if (did_old < cap_ndoms(iommu->cap)) { +@@ -2001,6 +2001,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); + } ++ ++ clear_context_copied(iommu, bus, devfn); + } + + context_clear_entry(context); +@@ -2783,32 +2785,14 @@ static int copy_context_table(struct intel_iommu *iommu, + /* Now copy the context entry */ + memcpy(&ce, old_ce + idx, sizeof(ce)); + +- if (!__context_present(&ce)) ++ if (!context_present(&ce)) + continue; + + did = context_domain_id(&ce); + if (did >= 0 && did < cap_ndoms(iommu->cap)) + set_bit(did, iommu->domain_ids); + +- /* +- * We need a marker for copied context entries. This +- * marker needs to work for the old format as well as +- * for extended context entries. +- * +- * Bit 67 of the context entry is used. In the old +- * format this bit is available to software, in the +- * extended format it is the PGE bit, but PGE is ignored +- * by HW if PASIDs are disabled (and thus still +- * available). +- * +- * So disable PASIDs first and then mark the entry +- * copied. This means that we don't copy PASID +- * translations from the old kernel, but this is fine as +- * faults there are not fatal. +- */ +- context_clear_pasid_enable(&ce); +- context_set_copied(&ce); +- ++ set_context_copied(iommu, bus, devfn); + new_ce[idx] = ce; + } + +@@ -2835,8 +2819,8 @@ static int copy_translation_tables(struct intel_iommu *iommu) + bool new_ext, ext; + + rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); +- ext = !!(rtaddr_reg & DMA_RTADDR_RTT); +- new_ext = !!ecap_ecs(iommu->ecap); ++ ext = !!(rtaddr_reg & DMA_RTADDR_SMT); ++ new_ext = !!sm_supported(iommu); + + /* + * The RTT bit can only be changed when translation is disabled, +@@ -2847,6 +2831,10 @@ static int copy_translation_tables(struct intel_iommu *iommu) + if (new_ext != ext) + return -EINVAL; + ++ iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL); ++ if (!iommu->copied_tables) ++ return -ENOMEM; ++ + old_rt_phys = rtaddr_reg & VTD_PAGE_MASK; + if (!old_rt_phys) + return -EINVAL; +diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c +index c28f8cc00d1cf..a9cc85882b315 100644 +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -18076,16 +18076,20 @@ static void tg3_shutdown(struct pci_dev *pdev) + struct net_device *dev = pci_get_drvdata(pdev); + struct tg3 *tp = netdev_priv(dev); + ++ tg3_reset_task_cancel(tp); ++ + rtnl_lock(); ++ + netif_device_detach(dev); + + if (netif_running(dev)) + dev_close(dev); + +- if (system_state == SYSTEM_POWER_OFF) +- tg3_power_down(tp); ++ tg3_power_down(tp); + + rtnl_unlock(); ++ ++ pci_disable_device(pdev); + } + + /** +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c +index cfb8bedba5124..079fa44ada71e 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c +@@ -289,6 +289,10 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) + sw_owner_id[i]); + } + ++ if (MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) && ++ dev->priv.sw_vhca_id > 0) ++ MLX5_SET(init_hca_in, in, sw_vhca_id, dev->priv.sw_vhca_id); ++ + return mlx5_cmd_exec_in(dev, init_hca, in); + } + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c +index 616207c3b187a..6c8bb74bd8fc6 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c +@@ -90,6 +90,8 @@ module_param_named(prof_sel, prof_sel, uint, 0444); + MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); + + static u32 sw_owner_id[4]; ++#define MAX_SW_VHCA_ID (BIT(__mlx5_bit_sz(cmd_hca_cap_2, sw_vhca_id)) - 1) ++static DEFINE_IDA(sw_vhca_ida); + + enum { + MLX5_ATOMIC_REQ_MODE_BE = 0x0, +@@ -499,6 +501,49 @@ static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev) + return err; + } + ++bool mlx5_is_roce_on(struct mlx5_core_dev *dev) ++{ ++ struct devlink *devlink = priv_to_devlink(dev); ++ union devlink_param_value val; ++ int err; ++ ++ err = devlink_param_driverinit_value_get(devlink, ++ DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, ++ &val); ++ ++ if (!err) ++ return val.vbool; ++ ++ mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err); ++ return MLX5_CAP_GEN(dev, roce); ++} ++EXPORT_SYMBOL(mlx5_is_roce_on); ++ ++static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx) ++{ ++ void *set_hca_cap; ++ int err; ++ ++ if (!MLX5_CAP_GEN_MAX(dev, hca_cap_2)) ++ return 0; ++ ++ err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL_2); ++ if (err) ++ return err; ++ ++ if (!MLX5_CAP_GEN_2_MAX(dev, sw_vhca_id_valid) || ++ !(dev->priv.sw_vhca_id > 0)) ++ return 0; ++ ++ set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, ++ capability); ++ memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL_2]->cur, ++ MLX5_ST_SZ_BYTES(cmd_hca_cap_2)); ++ MLX5_SET(cmd_hca_cap_2, set_hca_cap, sw_vhca_id_valid, 1); ++ ++ return set_caps(dev, set_ctx, MLX5_CAP_GENERAL_2); ++} ++ + static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) + { + struct mlx5_profile *prof = &dev->profile; +@@ -577,7 +622,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) + MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)); + + if (MLX5_CAP_GEN(dev, roce_rw_supported)) +- MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_init_enabled(dev)); ++ MLX5_SET(cmd_hca_cap, set_hca_cap, roce, ++ mlx5_is_roce_on(dev)); + + max_uc_list = max_uc_list_get_devlink_param(dev); + if (max_uc_list > 0) +@@ -603,7 +649,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) + */ + static bool is_roce_fw_disabled(struct mlx5_core_dev *dev) + { +- return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_init_enabled(dev)) || ++ return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_on(dev)) || + (!MLX5_CAP_GEN(dev, roce_rw_supported) && !MLX5_CAP_GEN(dev, roce)); + } + +@@ -669,6 +715,13 @@ static int set_hca_cap(struct mlx5_core_dev *dev) + goto out; + } + ++ memset(set_ctx, 0, set_sz); ++ err = handle_hca_cap_2(dev, set_ctx); ++ if (err) { ++ mlx5_core_err(dev, "handle_hca_cap_2 failed\n"); ++ goto out; ++ } ++ + out: + kfree(set_ctx); + return err; +@@ -1512,6 +1565,18 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) + if (err) + goto err_hca_caps; + ++ /* The conjunction of sw_vhca_id with sw_owner_id will be a global ++ * unique id per function which uses mlx5_core. ++ * Those values are supplied to FW as part of the init HCA command to ++ * be used by both driver and FW when it's applicable. 
++ */ ++ dev->priv.sw_vhca_id = ida_alloc_range(&sw_vhca_ida, 1, ++ MAX_SW_VHCA_ID, ++ GFP_KERNEL); ++ if (dev->priv.sw_vhca_id < 0) ++ mlx5_core_err(dev, "failed to allocate sw_vhca_id, err=%d\n", ++ dev->priv.sw_vhca_id); ++ + return 0; + + err_hca_caps: +@@ -1537,6 +1602,9 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) + { + struct mlx5_priv *priv = &dev->priv; + ++ if (priv->sw_vhca_id > 0) ++ ida_free(&sw_vhca_ida, dev->priv.sw_vhca_id); ++ + mlx5_hca_caps_free(dev); + mlx5_adev_cleanup(dev); + mlx5_pagealloc_cleanup(dev); +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c +index ac020cb780727..d5c3173250309 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c +@@ -1086,9 +1086,17 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + goto free; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); +- MLX5_SET(modify_nic_vport_context_in, in, +- nic_vport_context.affiliated_vhca_id, +- MLX5_CAP_GEN(master_mdev, vhca_id)); ++ if (MLX5_CAP_GEN_2(master_mdev, sw_vhca_id_valid)) { ++ MLX5_SET(modify_nic_vport_context_in, in, ++ nic_vport_context.vhca_id_type, VHCA_ID_TYPE_SW); ++ MLX5_SET(modify_nic_vport_context_in, in, ++ nic_vport_context.affiliated_vhca_id, ++ MLX5_CAP_GEN_2(master_mdev, sw_vhca_id)); ++ } else { ++ MLX5_SET(modify_nic_vport_context_in, in, ++ nic_vport_context.affiliated_vhca_id, ++ MLX5_CAP_GEN(master_mdev, vhca_id)); ++ } + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, + MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); +diff --git a/drivers/net/ieee802154/cc2520.c b/drivers/net/ieee802154/cc2520.c +index 1e1f40f628a02..c69b87d3837da 100644 +--- a/drivers/net/ieee802154/cc2520.c ++++ b/drivers/net/ieee802154/cc2520.c +@@ -504,6 +504,7 @@ cc2520_tx(struct ieee802154_hw *hw, struct sk_buff *skb) + goto err_tx; + + if (status & CC2520_STATUS_TX_UNDERFLOW) { ++ rc = -EINVAL; + dev_err(&priv->spi->dev, "cc2520 tx underflow exception\n"); + goto err_tx; + } +diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c +index 2de09ad5bac03..e11f70911acc1 100644 +--- a/drivers/net/usb/cdc_ether.c ++++ b/drivers/net/usb/cdc_ether.c +@@ -777,6 +777,13 @@ static const struct usb_device_id products[] = { + }, + #endif + ++/* Lenovo ThinkPad OneLink+ Dock (based on Realtek RTL8153) */ ++{ ++ USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3054, USB_CLASS_COMM, ++ USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), ++ .driver_info = 0, ++}, ++ + /* ThinkPad USB-C Dock (based on Realtek RTL8153) */ + { + USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3062, USB_CLASS_COMM, +diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c +index d142ac8fcf6e2..688905ea0a6d3 100644 +--- a/drivers/net/usb/r8152.c ++++ b/drivers/net/usb/r8152.c +@@ -770,6 +770,7 @@ enum rtl8152_flags { + RX_EPROTO, + }; + ++#define DEVICE_ID_THINKPAD_ONELINK_PLUS_DOCK 0x3054 + #define DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2 0x3082 + #define DEVICE_ID_THINKPAD_USB_C_DONGLE 0x720c + #define DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2 0xa387 +@@ -9581,6 +9582,7 @@ static bool rtl8152_supports_lenovo_macpassthru(struct usb_device *udev) + + if (vendor_id == VENDOR_ID_LENOVO) { + switch (product_id) { ++ case DEVICE_ID_THINKPAD_ONELINK_PLUS_DOCK: + case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2: + case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2: + case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN3: +@@ 
-9828,6 +9830,7 @@ static const struct usb_device_id rtl8152_table[] = { + REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x0927), + REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101), + REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x304f), ++ REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3054), + REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3062), + REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3069), + REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x3082), +diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c +index 73d9fcba3b1c0..9f6614f7dbeb1 100644 +--- a/drivers/nvme/host/pci.c ++++ b/drivers/nvme/host/pci.c +@@ -3517,6 +3517,8 @@ static const struct pci_device_id nvme_id_table[] = { + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, ++ { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ ++ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), +diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c +index dc3b4dc8fe08b..a3694a32f6d52 100644 +--- a/drivers/nvme/target/tcp.c ++++ b/drivers/nvme/target/tcp.c +@@ -1506,6 +1506,9 @@ static void nvmet_tcp_state_change(struct sock *sk) + goto done; + + switch (sk->sk_state) { ++ case TCP_FIN_WAIT2: ++ case TCP_LAST_ACK: ++ break; + case TCP_FIN_WAIT1: + case TCP_CLOSE_WAIT: + case TCP_CLOSE: +diff --git a/drivers/peci/cpu.c b/drivers/peci/cpu.c +index 68eb61c65d345..de4a7b3e5966e 100644 +--- a/drivers/peci/cpu.c ++++ b/drivers/peci/cpu.c +@@ -188,8 +188,6 @@ static void adev_release(struct device *dev) + { + struct auxiliary_device *adev = to_auxiliary_dev(dev); + +- auxiliary_device_uninit(adev); +- + kfree(adev->name); + kfree(adev); + } +@@ -234,6 +232,7 @@ static void unregister_adev(void *_adev) + struct auxiliary_device *adev = _adev; + + auxiliary_device_delete(adev); ++ auxiliary_device_uninit(adev); + } + + static int devm_adev_add(struct device *dev, int idx) +diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c +index 513de1f54e2d7..933b96e243b84 100644 +--- a/drivers/perf/arm_pmu_platform.c ++++ b/drivers/perf/arm_pmu_platform.c +@@ -117,7 +117,7 @@ static int pmu_parse_irqs(struct arm_pmu *pmu) + + if (num_irqs == 1) { + int irq = platform_get_irq(pdev, 0); +- if (irq && irq_is_percpu_devid(irq)) ++ if ((irq > 0) && irq_is_percpu_devid(irq)) + return pmu_parse_percpu_irq(pmu, irq); + } + +diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c +index ce2bd88feeaa8..08019c6ccc9ca 100644 +--- a/drivers/platform/surface/surface_aggregator_registry.c ++++ b/drivers/platform/surface/surface_aggregator_registry.c +@@ -556,6 +556,9 @@ static const struct acpi_device_id ssam_platform_hub_match[] = { + /* Surface Laptop Go 1 */ + { "MSHW0118", (unsigned long)ssam_node_group_slg1 }, + ++ /* Surface Laptop Go 2 */ ++ { "MSHW0290", (unsigned long)ssam_node_group_slg1 }, ++ + /* Surface Laptop Studio */ + { "MSHW0123", (unsigned long)ssam_node_group_sls }, + +diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c +index 9c6943e401a6c..0fbcaffabbfc7 100644 +--- a/drivers/platform/x86/acer-wmi.c ++++ b/drivers/platform/x86/acer-wmi.c +@@ -99,6 +99,7 @@ static const struct key_entry acer_wmi_keymap[] __initconst = { + {KE_KEY, 0x22, {KEY_PROG2} }, /* Arcade */ + {KE_KEY, 0x23, {KEY_PROG3} }, /* P_Key */ + {KE_KEY, 0x24, {KEY_PROG4} }, /* Social 
networking_Key */ ++ {KE_KEY, 0x27, {KEY_HELP} }, + {KE_KEY, 0x29, {KEY_PROG3} }, /* P_Key for TM8372 */ + {KE_IGNORE, 0x41, {KEY_MUTE} }, + {KE_IGNORE, 0x42, {KEY_PREVIOUSSONG} }, +@@ -112,7 +113,13 @@ static const struct key_entry acer_wmi_keymap[] __initconst = { + {KE_IGNORE, 0x48, {KEY_VOLUMEUP} }, + {KE_IGNORE, 0x49, {KEY_VOLUMEDOWN} }, + {KE_IGNORE, 0x4a, {KEY_VOLUMEDOWN} }, +- {KE_IGNORE, 0x61, {KEY_SWITCHVIDEOMODE} }, ++ /* ++ * 0x61 is KEY_SWITCHVIDEOMODE. Usually this is a duplicate input event ++ * with the "Video Bus" input device events. But sometimes it is not ++ * a dup. Map it to KEY_UNKNOWN instead of using KE_IGNORE so that ++ * udev/hwdb can override it on systems where it is not a dup. ++ */ ++ {KE_KEY, 0x61, {KEY_UNKNOWN} }, + {KE_IGNORE, 0x62, {KEY_BRIGHTNESSUP} }, + {KE_IGNORE, 0x63, {KEY_BRIGHTNESSDOWN} }, + {KE_KEY, 0x64, {KEY_SWITCHVIDEOMODE} }, /* Display Switch */ +diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c +index 62ce198a34631..a0f31624aee97 100644 +--- a/drivers/platform/x86/asus-wmi.c ++++ b/drivers/platform/x86/asus-wmi.c +@@ -107,7 +107,7 @@ module_param(fnlock_default, bool, 0444); + #define WMI_EVENT_MASK 0xFFFF + + #define FAN_CURVE_POINTS 8 +-#define FAN_CURVE_BUF_LEN (FAN_CURVE_POINTS * 2) ++#define FAN_CURVE_BUF_LEN 32 + #define FAN_CURVE_DEV_CPU 0x00 + #define FAN_CURVE_DEV_GPU 0x01 + /* Mask to determine if setting temperature or percentage */ +@@ -2208,8 +2208,10 @@ static int fan_curve_get_factory_default(struct asus_wmi *asus, u32 fan_dev) + curves = &asus->custom_fan_curves[fan_idx]; + err = asus_wmi_evaluate_method_buf(asus->dsts_id, fan_dev, mode, buf, + FAN_CURVE_BUF_LEN); +- if (err) ++ if (err) { ++ pr_warn("%s (0x%08x) failed: %d\n", __func__, fan_dev, err); + return err; ++ } + + fan_curve_copy_from_buf(curves, buf); + curves->device_id = fan_dev; +@@ -2227,9 +2229,6 @@ static int fan_curve_check_present(struct asus_wmi *asus, bool *available, + + err = fan_curve_get_factory_default(asus, fan_dev); + if (err) { +- pr_debug("fan_curve_get_factory_default(0x%08x) failed: %d\n", +- fan_dev, err); +- /* Don't cause probe to fail on devices without fan-curves */ + return 0; + } + +diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h +index 4051c8cd0cd8a..23ab3b048d9be 100644 +--- a/drivers/usb/storage/unusual_uas.h ++++ b/drivers/usb/storage/unusual_uas.h +@@ -62,6 +62,13 @@ UNUSUAL_DEV(0x0984, 0x0301, 0x0128, 0x0128, + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_IGNORE_UAS), + ++/* Reported-by: Tom Hu */ ++UNUSUAL_DEV(0x0b05, 0x1932, 0x0000, 0x9999, ++ "ASUS", ++ "External HDD", ++ USB_SC_DEVICE, USB_PR_DEVICE, NULL, ++ US_FL_IGNORE_UAS), ++ + /* Reported-by: David Webb */ + UNUSUAL_DEV(0x0bc2, 0x331a, 0x0000, 0x9999, + "Seagate", +diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h +index 5fcf89faa31ab..d72626d71258f 100644 +--- a/include/linux/intel-iommu.h ++++ b/include/linux/intel-iommu.h +@@ -196,7 +196,6 @@ + #define ecap_dis(e) (((e) >> 27) & 0x1) + #define ecap_nest(e) (((e) >> 26) & 0x1) + #define ecap_mts(e) (((e) >> 25) & 0x1) +-#define ecap_ecs(e) (((e) >> 24) & 0x1) + #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) + #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) + #define ecap_coherent(e) ((e) & 0x1) +@@ -264,7 +263,6 @@ + #define DMA_GSTS_CFIS (((u32)1) << 23) + + /* DMA_RTADDR_REG */ +-#define DMA_RTADDR_RTT (((u64)1) << 11) + #define DMA_RTADDR_SMT (((u64)1) << 10) + + /* CCMD_REG */ +@@ -579,6 +577,7 @@ struct 
intel_iommu { + + #ifdef CONFIG_INTEL_IOMMU + unsigned long *domain_ids; /* bitmap of domains */ ++ unsigned long *copied_tables; /* bitmap of copied tables */ + spinlock_t lock; /* protect context, domain ids */ + struct root_entry *root_entry; /* virtual address */ + +@@ -692,6 +691,11 @@ static inline int nr_pte_to_next_page(struct dma_pte *pte) + (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; + } + ++static inline bool context_present(struct context_entry *context) ++{ ++ return (context->lo & 1); ++} ++ + extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); + + extern int dmar_enable_qi(struct intel_iommu *iommu); +@@ -776,7 +780,6 @@ static inline void intel_iommu_debugfs_init(void) {} + #endif /* CONFIG_INTEL_IOMMU_DEBUGFS */ + + extern const struct attribute_group *intel_iommu_groups[]; +-bool context_present(struct context_entry *context); + struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, + u8 devfn, int alloc); + +diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h +index b0b4ac92354a2..b3ea245faa515 100644 +--- a/include/linux/mlx5/driver.h ++++ b/include/linux/mlx5/driver.h +@@ -606,6 +606,7 @@ struct mlx5_priv { + spinlock_t ctx_lock; + struct mlx5_adev **adev; + int adev_idx; ++ int sw_vhca_id; + struct mlx5_events *events; + + struct mlx5_flow_steering *steering; +@@ -1274,16 +1275,17 @@ enum { + MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, + }; + +-static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev) ++bool mlx5_is_roce_on(struct mlx5_core_dev *dev); ++ ++static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) + { +- struct devlink *devlink = priv_to_devlink(dev); +- union devlink_param_value val; +- int err; +- +- err = devlink_param_driverinit_value_get(devlink, +- DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, +- &val); +- return err ? 
MLX5_CAP_GEN(dev, roce) : val.vbool; ++ if (MLX5_CAP_GEN(dev, roce_rw_supported)) ++ return MLX5_CAP_GEN(dev, roce); ++ ++ /* If RoCE cap is read-only in FW, get RoCE state from devlink ++ * in order to support RoCE enable/disable feature ++ */ ++ return mlx5_is_roce_on(dev); + } + + #endif /* MLX5_DRIVER_H */ +diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h +index fd7d083a34d33..6d57e5ec9718d 100644 +--- a/include/linux/mlx5/mlx5_ifc.h ++++ b/include/linux/mlx5/mlx5_ifc.h +@@ -1804,7 +1804,14 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { + u8 max_reformat_remove_size[0x8]; + u8 max_reformat_remove_offset[0x8]; + +- u8 reserved_at_c0[0x740]; ++ u8 reserved_at_c0[0x160]; ++ ++ u8 reserved_at_220[0x1]; ++ u8 sw_vhca_id_valid[0x1]; ++ u8 sw_vhca_id[0xe]; ++ u8 reserved_at_230[0x10]; ++ ++ u8 reserved_at_240[0x5c0]; + }; + + enum mlx5_ifc_flow_destination_type { +@@ -3715,6 +3722,11 @@ struct mlx5_ifc_rmpc_bits { + struct mlx5_ifc_wq_bits wq; + }; + ++enum { ++ VHCA_ID_TYPE_HW = 0, ++ VHCA_ID_TYPE_SW = 1, ++}; ++ + struct mlx5_ifc_nic_vport_context_bits { + u8 reserved_at_0[0x5]; + u8 min_wqe_inline_mode[0x3]; +@@ -3731,8 +3743,8 @@ struct mlx5_ifc_nic_vport_context_bits { + u8 event_on_mc_address_change[0x1]; + u8 event_on_uc_address_change[0x1]; + +- u8 reserved_at_40[0xc]; +- ++ u8 vhca_id_type[0x1]; ++ u8 reserved_at_41[0xb]; + u8 affiliation_criteria[0x4]; + u8 affiliated_vhca_id[0x10]; + +@@ -7189,7 +7201,12 @@ struct mlx5_ifc_init_hca_in_bits { + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + +- u8 reserved_at_40[0x40]; ++ u8 reserved_at_40[0x20]; ++ ++ u8 reserved_at_60[0x2]; ++ u8 sw_vhca_id[0xe]; ++ u8 reserved_at_70[0x10]; ++ + u8 sw_owner_id[4][0x20]; + }; + +diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c +index cbdf0e2bc5ae0..d0fb74b0db1d5 100644 +--- a/net/bluetooth/mgmt.c ++++ b/net/bluetooth/mgmt.c +@@ -4420,6 +4420,22 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, + MGMT_STATUS_NOT_SUPPORTED); + } + ++static u32 get_params_flags(struct hci_dev *hdev, ++ struct hci_conn_params *params) ++{ ++ u32 flags = hdev->conn_flags; ++ ++ /* Devices using RPAs can only be programmed in the acceptlist if ++ * LL Privacy has been enable otherwise they cannot mark ++ * HCI_CONN_FLAG_REMOTE_WAKEUP. ++ */ ++ if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) && ++ hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) ++ flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP; ++ ++ return flags; ++} ++ + static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + u16 data_len) + { +@@ -4451,10 +4467,10 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + } else { + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); +- + if (!params) + goto done; + ++ supported_flags = get_params_flags(hdev, params); + current_flags = params->flags; + } + +@@ -4523,38 +4539,35 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, + bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", + &cp->addr.bdaddr, cp->addr.type); + } +- } else { +- params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, +- le_addr_type(cp->addr.type)); +- if (params) { +- /* Devices using RPAs can only be programmed in the +- * acceptlist LL Privacy has been enable otherwise they +- * cannot mark HCI_CONN_FLAG_REMOTE_WAKEUP. 
+- */ +- if ((current_flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && +- !use_ll_privacy(hdev) && +- hci_find_irk_by_addr(hdev, ¶ms->addr, +- params->addr_type)) { +- bt_dev_warn(hdev, +- "Cannot set wakeable for RPA"); +- goto unlock; +- } + +- params->flags = current_flags; +- status = MGMT_STATUS_SUCCESS; ++ goto unlock; ++ } + +- /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY +- * has been set. +- */ +- if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) +- hci_update_passive_scan(hdev); +- } else { +- bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", +- &cp->addr.bdaddr, +- le_addr_type(cp->addr.type)); +- } ++ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, ++ le_addr_type(cp->addr.type)); ++ if (!params) { ++ bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", ++ &cp->addr.bdaddr, le_addr_type(cp->addr.type)); ++ goto unlock; ++ } ++ ++ supported_flags = get_params_flags(hdev, params); ++ ++ if ((supported_flags | current_flags) != supported_flags) { ++ bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", ++ current_flags, supported_flags); ++ goto unlock; + } + ++ params->flags = current_flags; ++ status = MGMT_STATUS_SUCCESS; ++ ++ /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY ++ * has been set. ++ */ ++ if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) ++ hci_update_passive_scan(hdev); ++ + unlock: + hci_dev_unlock(hdev); + +diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c +index eb204ad36eeec..846588c0070a5 100644 +--- a/net/dsa/tag_hellcreek.c ++++ b/net/dsa/tag_hellcreek.c +@@ -45,7 +45,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { +- netdev_warn(dev, "Failed to get source port: %d\n", port); ++ netdev_warn_once(dev, "Failed to get source port: %d\n", port); + return NULL; + } + diff --git a/sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch b/sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch deleted file mode 100644 index cf365f8..0000000 --- a/sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch +++ /dev/null @@ -1,8901 +0,0 @@ -From patchwork Wed Jul 6 22:00:10 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908719 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 94C1DC43334 - for ; Wed, 6 Jul 2022 22:07:01 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 3052B6B0072; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 2B5426B0073; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 1A27C6B0074; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com - [216.40.44.11]) - by kanga.kvack.org (Postfix) with ESMTP id 0BC686B0072 - for ; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) -Received: from smtpin02.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay10.hostedemail.com (Postfix) with ESMTP id CA2056A6 - for ; Wed, 6 Jul 2022 22:07:00 +0000 (UTC) -X-FDA: 79658061000.02.A339B96 -Received: from mail-io1-f74.google.com (mail-io1-f74.google.com - [209.85.166.74]) - by imf05.hostedemail.com 
(Postfix) with ESMTP id 4E8D2100017 - for ; Wed, 6 Jul 2022 22:07:00 +0000 (UTC) -Received: by mail-io1-f74.google.com with SMTP id - bw12-20020a056602398c00b00675895c2e24so8731074iob.19 - for ; Wed, 06 Jul 2022 15:07:00 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=qRI8PXbLcM+5CLpEdu5Szvo90bsJIGjJE2jS009gkGk=; - b=ZmyxY1Zw8XSvfRWkKAW+f4mUNqqtO18FFYBy2MotiZryXwyz9ItbUh9iu4txbliGWV - 2zSpKFQCiNnOAlQ6EcsvQBLjKhLO02wKW9+/0P3DsfIXA4cNhb908dXECrznSmVA8Pnr - F13ODZZAGss1dN9dP7/zz2TweJvGgqjzlw8hpy3C9EXhkGdCEVfFUX5sYsFwHF6ph62j - YFYkt0yEeDGZ6BSKwot0UC5ZcUyd9AqPFg+XD4PWIlU21bbWaLA6eIQAr/1vyvoOUESY - RP+ZlS9AQ2JVmz3TDo8SyWa829c8OgLjNn28DmB38A4um5Ju0lB8q6j6sdVFGsj5iEvp - AFww== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=qRI8PXbLcM+5CLpEdu5Szvo90bsJIGjJE2jS009gkGk=; - b=AOUJsEidUKQJJt8SToVFGAPr3C12fTJoJElAAFE7KxYD410S32sDmau0kREt/LiYkv - dUGJshmCRHkkY36SzjWitE/xBZ1tllr3SMm51k1bORtnqWEcusXlr0UtsGdinQK9ILBh - K59jgkQzsGwJHEpe+Ll9kADhZ7o8oLcSMLrrgD3Fnx7oEtfuAHMKEcBlBkSgMPppXUTd - ulnN01wxGMVZTYZ/yA1sJwjTalfTQgS1jWfhZp1m3A1fJTF4eeWoX2ceAvv5rGOLAiJM - SW/ePavunPzzOpIPmO+dbtG0EWRHGTxBR65eiy6ov/69KgUE7bGwub0yYiOXQKWohY5Z - W3FQ== -X-Gm-Message-State: AJIora8WGdQPm0mObzFDWcXDHpPyK5u2HGFLGwaXGZ5Jsb3uO01h6wcU - F87blpjQgVwnSA0QtoTcFD+utCSKlvM= -X-Google-Smtp-Source: - AGRyM1sDSDGcRnDMwsrjVWQBGg0nLWGv8i6pEMJ10oDW9TLUnp/+bD9AQNZ2+emMVfEkLEmXVjNXufDKAmU= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a05:6638:14cd:b0:33c:c00e:898d with SMTP id - l13-20020a05663814cd00b0033cc00e898dmr26357876jak.143.1657144852078; Wed, 06 - Jul 2022 15:00:52 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:10 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-2-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 01/14] mm: x86, arm64: add arch_has_hw_pte_young() -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Barry Song , Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657145220; a=rsa-sha256; - cv=none; - b=VumvfKCjx5tf93BL/O1DNNiONuUUMaZYR4iOhULdFR4P8YOdhpBtrpKwBsHGR4wUqyMcvI - ToLran37owHd2V3ShTKRPwSH8VjFvggnLlLoA19COIyGitTG9II71uvoVW/BX9CNy0fyvU - cjZkFbkAV2gw14xwh4oA0dBJXiv4wcs= -ARC-Authentication-Results: i=1; - imf05.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=ZmyxY1Zw; - dmarc=pass (policy=reject) 
header.from=google.com; - spf=pass (imf05.hostedemail.com: domain of - 3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com - designates 209.85.166.74 as permitted sender) - smtp.mailfrom=3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657145220; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=qRI8PXbLcM+5CLpEdu5Szvo90bsJIGjJE2jS009gkGk=; - b=qXdH1Ee5JE3ufkBF1syfLTJ4Hf4+XbhNy8Ep7CdbOWtn0impShoppleSgAJd0DjZcGtBPd - BrCXlkc1QnMUpwyPi5WEIjJZZLPAAkBIhwltXoG15zc7F1kIblfi2GpbrcQSpycZKhMp2a - awra7JeixwgTaauxTH0OVnzltL0UkbU= -X-Stat-Signature: y7hoskbhfp1nq6ugnwo8zwjg458t1yhh -X-Rspam-User: -X-Rspamd-Server: rspam12 -X-Rspamd-Queue-Id: 4E8D2100017 -Authentication-Results: imf05.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=ZmyxY1Zw; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf05.hostedemail.com: domain of - 3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com - designates 209.85.166.74 as permitted sender) - smtp.mailfrom=3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com -X-HE-Tag: 1657145220-618745 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Some architectures automatically set the accessed bit in PTEs, e.g., -x86 and arm64 v8.2. On architectures that do not have this capability, -clearing the accessed bit in a PTE usually triggers a page fault -following the TLB miss of this PTE (to emulate the accessed bit). - -Being aware of this capability can help make better decisions, e.g., -whether to spread the work out over a period of time to reduce bursty -page faults when trying to clear the accessed bit in many PTEs. - -Note that theoretically this capability can be unreliable, e.g., -hotplugged CPUs might be different from builtin ones. Therefore it -should not be used in architecture-independent code that involves -correctness, e.g., to determine whether TLB flushes are required (in -combination with the accessed bit). - -Signed-off-by: Yu Zhao -Reviewed-by: Barry Song -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Acked-by: Will Deacon -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - arch/arm64/include/asm/pgtable.h | 15 ++------------- - arch/x86/include/asm/pgtable.h | 6 +++--- - include/linux/pgtable.h | 13 +++++++++++++ - mm/memory.c | 14 +------------- - 4 files changed, 19 insertions(+), 29 deletions(-) - -diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h -index 0b6632f18364..c46399c0500c 100644 ---- a/arch/arm64/include/asm/pgtable.h -+++ b/arch/arm64/include/asm/pgtable.h -@@ -1066,24 +1066,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, - * page after fork() + CoW for pfn mappings. We don't always have a - * hardware-managed access flag on arm64. 
- */ --static inline bool arch_faults_on_old_pte(void) --{ -- /* The register read below requires a stable CPU to make any sense */ -- cant_migrate(); -- -- return !cpu_has_hw_af(); --} --#define arch_faults_on_old_pte arch_faults_on_old_pte -+#define arch_has_hw_pte_young cpu_has_hw_af - - /* - * Experimentally, it's cheap to set the access flag in hardware and we - * benefit from prefaulting mappings as 'old' to start with. - */ --static inline bool arch_wants_old_prefaulted_pte(void) --{ -- return !arch_faults_on_old_pte(); --} --#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte -+#define arch_wants_old_prefaulted_pte cpu_has_hw_af - - static inline bool pud_sect_supported(void) - { -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 44e2d6f1dbaa..dc5f7d8ef68a 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_check(void) - return boot_cpu_has_bug(X86_BUG_L1TF); - } - --#define arch_faults_on_old_pte arch_faults_on_old_pte --static inline bool arch_faults_on_old_pte(void) -+#define arch_has_hw_pte_young arch_has_hw_pte_young -+static inline bool arch_has_hw_pte_young(void) - { -- return false; -+ return true; - } - - #ifdef CONFIG_PAGE_TABLE_CHECK -diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h -index 3cdc16cfd867..8eee31bc9bde 100644 ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #endif - -+#ifndef arch_has_hw_pte_young -+/* -+ * Return whether the accessed bit is supported on the local CPU. -+ * -+ * This stub assumes accessing through an old PTE triggers a page fault. -+ * Architectures that automatically set the access bit should overwrite it. -+ */ -+static inline bool arch_has_hw_pte_young(void) -+{ -+ return false; -+} -+#endif -+ - #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR - static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long address, -diff --git a/mm/memory.c b/mm/memory.c -index 7a089145cad4..49500390b91b 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly = - 2; - #endif - --#ifndef arch_faults_on_old_pte --static inline bool arch_faults_on_old_pte(void) --{ -- /* -- * Those arches which don't have hw access flag feature need to -- * implement their own helper. By default, "true" means pagefault -- * will be hit on old pte. -- */ -- return true; --} --#endif -- - #ifndef arch_wants_old_prefaulted_pte - static inline bool arch_wants_old_prefaulted_pte(void) - { -@@ -2862,7 +2850,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, - * On architectures with software "accessed" bits, we would - * take a double page fault, so mark it accessed here. 
- */ -- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { -+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { - pte_t entry; - - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - -From patchwork Wed Jul 6 22:00:11 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908700 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id CAC87C433EF - for ; Wed, 6 Jul 2022 22:00:56 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 154F66B0073; Wed, 6 Jul 2022 18:00:56 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 0B6EE6B0074; Wed, 6 Jul 2022 18:00:56 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id E729E6B0075; Wed, 6 Jul 2022 18:00:55 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0016.hostedemail.com - [216.40.44.16]) - by kanga.kvack.org (Postfix) with ESMTP id D9E9C6B0073 - for ; Wed, 6 Jul 2022 18:00:55 -0400 (EDT) -Received: from smtpin22.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay06.hostedemail.com (Postfix) with ESMTP id 9F16534610 - for ; Wed, 6 Jul 2022 22:00:55 +0000 (UTC) -X-FDA: 79658045670.22.8A546BF -Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf12.hostedemail.com (Postfix) with ESMTP id 0D92C40033 - for ; Wed, 6 Jul 2022 22:00:53 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - m68-20020a253f47000000b006683bd91962so12445167yba.0 - for ; Wed, 06 Jul 2022 15:00:53 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=Yske1o/9q5kvxCT6Do7fK+m0Z7RMKAIlwEA5dqQMV6o=; - b=ScyLwm63xSUVYY78eVpIKf7E4l6uHPJ8SKqWyYLqNgfcQS9rJpZhYXa+GvIYC8VFxz - 2VFStSncvwevlF5a8SeHX4Xsz1oxV5uuYYiB5ijS1hgFnqmnWUZ92SAkit2dsdOrKkVm - doRskpr19skWYdTit7iDaFWDHSkEjmp1FnyOwnhb4K1iob0FZUGliEmOjr11tQKlaxMl - A7gk8PUbqgtBAB5FxJW674j5ErsQXUNEF0mV9mDiI18iHiW2zTe0Jvp4coFt/YGkO03P - +mGZgU80OTVBNdIcmd9CUSdknj31pHlFfc27NA1Hoqf7YpOu3eL0SW+Jp946t/R7w6FH - wLdA== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=Yske1o/9q5kvxCT6Do7fK+m0Z7RMKAIlwEA5dqQMV6o=; - b=XVonL9oPc/dGS0Lj1bwrmAxlSwptN2oDguArTP7th8VxXdXwHpn2oGKmURoazynnWW - GaxAg33Dr4knllzhF6wCdcowLA++/AgQbdQfMwZEbDkgdPMiKz+9twLafdDp2twVELPc - mZFyE0neVCe8OAOes5N5stgxrIPJyGN+cmejA7EFYbUXD5yKaVHVWEbZ1DKvs+vkVfYH - 4I7Mc++TN9sTNUODcCZv7eNmy5ddfKdhs8ZEqmBzkQQl+6Nyi2IUxEa3YeftDVQx6pqJ - 7oPE/pTcwcjKcRm4Bn+MZj1FE7of9UClcR0Wd4ZoxSRmvPtCnOiV9G4yZyDcGkpaBUWh - +quA== -X-Gm-Message-State: AJIora81vFHYbn7du42CA/tgPDfWrrUd4KS2ldCuwHG08ccHdOGjEtup - i9hDsoVZxoG9FtGAgLFxbBinusC9kmE= -X-Google-Smtp-Source: - AGRyM1uPiyB7rEJDdAsYAVesh6XcxF7m4/NOwgKHx35NtLh0WZv9A8PqKLV1Gu8X5xOooB/DS/0V1C1QSZU= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:390:0:b0:66e:b9c7:b46c with SMTP id - 138-20020a250390000000b0066eb9c7b46cmr172371ybd.505.1657144853349; Wed, 
06 - Jul 2022 15:00:53 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:11 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-3-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Barry Song , Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144854; a=rsa-sha256; - cv=none; - b=ba2dVYmnfxk8QTCdMBgkSodQNf9QrUzHc+vrIrNl2fGKaUM0VC5LUOkOS0Uam92Z/fgIw+ - J3iBf4wOdpf9YxVZLpvnO/CvPz7LzU7dbaCIsHjkTYZyjSGj5b5H8veJBlUQe2PyEhqktl - KdZlmcrPxuSkAHBseFs2D8j/Mhzx2nw= -ARC-Authentication-Results: i=1; - imf12.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=ScyLwm63; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf12.hostedemail.com: domain of - 3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144854; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=Yske1o/9q5kvxCT6Do7fK+m0Z7RMKAIlwEA5dqQMV6o=; - b=TMdhE0VPqYoVfu/UiVKCUJJu+4spbPZ1vrUaXbX8Pa9OP/6dtZAs1KcJMK2kLgdUvhO8E9 - UX8x+Y/myW5EAlPfC2BrKGzFE9TjBUVGeJYESpLIZg2lf658PqCu5GAkoM5vFZRBG80nvF - ObTkOxqzk+IkvR3PYDfWaVzYP1bgmkU= -Authentication-Results: imf12.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=ScyLwm63; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf12.hostedemail.com: domain of - 3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com -X-Stat-Signature: u9s859meeaiiqe5mxswoozqtkc4fepwx -X-Rspamd-Queue-Id: 0D92C40033 -X-Rspamd-Server: rspam05 -X-Rspam-User: -X-HE-Tag: 1657144853-88353 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Some architectures support the accessed bit in non-leaf PMD entries, -e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it -as part of linear address translation [1]. 
Page table walkers that -clear the accessed bit may use this capability to reduce their search -space. - -Note that: -1. Although an inline function is preferable, this capability is added - as a configuration option for consistency with the existing macros. -2. Due to the little interest in other varieties, this capability was - only tested on Intel and AMD CPUs. - -Thanks to the following developers for their efforts [2][3]. - Randy Dunlap - Stephen Rothwell - -[1]: Intel 64 and IA-32 Architectures Software Developer's Manual - Volume 3 (June 2021), section 4.8 -[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/ -[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/ - -Signed-off-by: Yu Zhao -Reviewed-by: Barry Song -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - arch/Kconfig | 8 ++++++++ - arch/x86/Kconfig | 1 + - arch/x86/include/asm/pgtable.h | 3 ++- - arch/x86/mm/pgtable.c | 5 ++++- - include/linux/pgtable.h | 4 ++-- - 5 files changed, 17 insertions(+), 4 deletions(-) - -diff --git a/arch/Kconfig b/arch/Kconfig -index fcf9a41a4ef5..eaeec187bd6a 100644 ---- a/arch/Kconfig -+++ b/arch/Kconfig -@@ -1403,6 +1403,14 @@ config DYNAMIC_SIGFRAME - config HAVE_ARCH_NODE_DEV_GROUP - bool - -+config ARCH_HAS_NONLEAF_PMD_YOUNG -+ bool -+ help -+ Architectures that select this option are capable of setting the -+ accessed bit in non-leaf PMD entries when using them as part of linear -+ address translations. Page table walkers that clear the accessed bit -+ may use this capability to reduce their search space. 
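As an aside (not part of this series): one way a page-table walker might act on the capability described in the help text above. The helper name pmd_range_worth_scanning() is invented for this sketch; pmdp_test_and_clear_young() is the accessor that the hunks below make available outside CONFIG_TRANSPARENT_HUGEPAGE.

        static bool pmd_range_worth_scanning(struct vm_area_struct *vma,
                                             unsigned long addr, pmd_t *pmd)
        {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
                        return true;    /* no hint; fall back to scanning the PTEs */

                /*
                 * A clear accessed bit in the non-leaf entry means no PTE
                 * underneath has been used since the bit was last cleared,
                 * so the whole range can be skipped.
                 */
                return pmdp_test_and_clear_young(vma, addr, pmd);
        }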
-+ - source "kernel/gcov/Kconfig" - - source "scripts/gcc-plugins/Kconfig" -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index be0b95e51df6..5715111abe13 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -85,6 +85,7 @@ config X86 - select ARCH_HAS_PMEM_API if X86_64 - select ARCH_HAS_PTE_DEVMAP if X86_64 - select ARCH_HAS_PTE_SPECIAL -+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 - select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_COPY_MC if X86_64 - select ARCH_HAS_SET_MEMORY -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index dc5f7d8ef68a..5059799bebe3 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) - - static inline int pmd_bad(pmd_t pmd) - { -- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != -+ (_KERNPG_TABLE & ~_PAGE_ACCESSED); - } - - static inline unsigned long pages_to_mb(unsigned long npg) -diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c -index a932d7712d85..8525f2876fb4 100644 ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, - return ret; - } - --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) - int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) - { -@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, - - return ret; - } -+#endif -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE - int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pud_t *pudp) - { -diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h -index 8eee31bc9bde..9c57c5cc49c2 100644 ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - #endif - - #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) - static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) -@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - BUILD_BUG(); - return 0; - } --#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ - #endif - - #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH - -From patchwork Wed Jul 6 22:00:12 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908701 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 05277CCA481 - for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 7E4186B0074; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 76ECC6B0075; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 524486B0078; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org 
-Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com - [216.40.44.11]) - by kanga.kvack.org (Postfix) with ESMTP id 3E4B36B0074 - for ; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay06.hostedemail.com (Postfix) with ESMTP id 0F88934416 - for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) -X-FDA: 79658045754.31.374F01B -Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com - [209.85.128.201]) - by imf23.hostedemail.com (Postfix) with ESMTP id 7B5CF140071 - for ; Wed, 6 Jul 2022 22:00:55 +0000 (UTC) -Received: by mail-yw1-f201.google.com with SMTP id - 00721157ae682-31c9a49a1a8so63943167b3.9 - for ; Wed, 06 Jul 2022 15:00:55 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=xRTO2a+j5NrVJKtKeScRWRKKBSTrjcMS5t6hiKMzj7E=; - b=jaYbReZJ8uDDLbii1xwhzvdsu6n9p9fFeOoX3rMWV4HRFwikqu+fxkANqP9J1hGdR2 - NeJtlffRYWnnwdndS5aG1Db183fv4nEfSDNZk5Aw1GhS0DDV+irZrJ4sR+RBQ0mlRL0F - PCWg0VVitxpZ5yzJzYAkEO4uHOjww0Tjni9prrUmk4iDUdAeuQHZsQYSGRbR+cGm4i8w - k7/vbxWbkPS/YQ/tq51SCEZjr+bTsFRcUYhsaDMMVhgqvpvMmhh84viZjp9G9W/MZCVp - lhJy7B/1ym1XZ9aYTn0gi9sgQDfh0ksvuw/1a5ib9CO1DG9/pvF0LoK/EKm8nNJ/pZyy - kAfA== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=xRTO2a+j5NrVJKtKeScRWRKKBSTrjcMS5t6hiKMzj7E=; - b=idSd06MTounv54fIacoxrI0eQJJSRslStkggX8APKQWu6w4H6jnTCQKfjZteTt+gtC - gAOrWhpfQgtNIatPBQliE6gNhZivKVIon5hkSCuin7ffPXxx98Zr4xXmYyyQFey9eZEP - bYnELAeg+MpzknUvWf0bHlIZA6PwqWxz1eWAYDCV4dPEApBGDNhC+aYkMFf9EqdJdmDs - 9FEeSbXKN25LUaLiOpKh0js2kdqX4Aukk0uqlH6XX+ZNfNNe0hkK/+OhTievM+jCt5TK - p/+QW8aqDk5Qbxin3B3Tq7oWWNyUaM/L/r7o7nuPHq1VWWbGPqj0YyKzEZLz9+OsyhJD - mEbg== -X-Gm-Message-State: AJIora8tXy61OEQgPe+vKTfstDk3naLrxXDSbXHWqk7IopwvdFEH4Cef - nkdLlvLaW7zOebfRuKKHBEK+WcRl9M8= -X-Google-Smtp-Source: - AGRyM1slRLC7YoaOAJX582u+iAc9V/TxbhI1Hoxnov8FBUQD9MbHWxCLjXxj9TQ7JPp2TjaJTr9hNbuL1Ok= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a05:6902:1206:b0:66e:6e93:366c with SMTP id - s6-20020a056902120600b0066e6e93366cmr11968955ybu.59.1657144854840; Wed, 06 - Jul 2022 15:00:54 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:12 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-4-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 03/14] mm/vmscan.c: refactor shrink_node() -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Barry Song , Miaohe Lin , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - 
Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144855; a=rsa-sha256; - cv=none; - b=C5Z9j3O02mimmj7Fw9J8fsWNjRE+Y/gPxdz9C+kmFqM/2BgzBkrX0GAW0hgI7dHEM8924m - HqfJJSqt0XV/+xhpN2q4jK0T+02nB4EEXRK//o2vKS+/FvUwh/ucVjQrLyDiacFK43TXI6 - NfK2zkhL3Ol0W61EEn2HSK05MjQkolc= -ARC-Authentication-Results: i=1; - imf23.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=jaYbReZJ; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf23.hostedemail.com: domain of - 3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com - designates 209.85.128.201 as permitted sender) - smtp.mailfrom=3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144855; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=xRTO2a+j5NrVJKtKeScRWRKKBSTrjcMS5t6hiKMzj7E=; - b=8oACpQ7ksfTXalrWOXw+va/sQyeVexNkPm99hYfcs2rccyQJRt+TczrAEF/1Dx80ZM4U1c - tQU/+fYGnG8sEjmePqrrHye0U6E7JvxV6YqmuCDVUUaIEEgYqMC0KfEir3FNalMA6JhauV - vCylGdyHJmqBqvDVl9PD0HpFhXgtW3U= -Authentication-Results: imf23.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=jaYbReZJ; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf23.hostedemail.com: domain of - 3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com - designates 209.85.128.201 as permitted sender) - smtp.mailfrom=3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com -X-Stat-Signature: ogjqd7prxk1deq49x3znr781rgid8amh -X-Rspamd-Queue-Id: 7B5CF140071 -X-Rspamd-Server: rspam05 -X-Rspam-User: -X-HE-Tag: 1657144855-543148 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -This patch refactors shrink_node() to improve readability for the -upcoming changes to mm/vmscan.c. - -Signed-off-by: Yu Zhao -Reviewed-by: Barry Song -Reviewed-by: Miaohe Lin -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - mm/vmscan.c | 198 +++++++++++++++++++++++++++------------------------- - 1 file changed, 104 insertions(+), 94 deletions(-) - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index f7d9a683e3a7..fddb9bd3c6c2 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2670,6 +2670,109 @@ enum scan_balance { - SCAN_FILE, - }; - -+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) -+{ -+ unsigned long file; -+ struct lruvec *target_lruvec; -+ -+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); -+ -+ /* -+ * Flush the memory cgroup stats, so that we read accurate per-memcg -+ * lruvec stats for heuristics. -+ */ -+ mem_cgroup_flush_stats(); -+ -+ /* -+ * Determine the scan balance between anon and file LRUs. 
-+ */ -+ spin_lock_irq(&target_lruvec->lru_lock); -+ sc->anon_cost = target_lruvec->anon_cost; -+ sc->file_cost = target_lruvec->file_cost; -+ spin_unlock_irq(&target_lruvec->lru_lock); -+ -+ /* -+ * Target desirable inactive:active list ratios for the anon -+ * and file LRU lists. -+ */ -+ if (!sc->force_deactivate) { -+ unsigned long refaults; -+ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_ANON); -+ if (refaults != target_lruvec->refaults[0] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -+ sc->may_deactivate |= DEACTIVATE_ANON; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_ANON; -+ -+ /* -+ * When refaults are being observed, it means a new -+ * workingset is being established. Deactivate to get -+ * rid of any stale active pages quickly. -+ */ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_FILE); -+ if (refaults != target_lruvec->refaults[1] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -+ sc->may_deactivate |= DEACTIVATE_FILE; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_FILE; -+ } else -+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -+ -+ /* -+ * If we have plenty of inactive file pages that aren't -+ * thrashing, try to reclaim those first before touching -+ * anonymous pages. -+ */ -+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -+ sc->cache_trim_mode = 1; -+ else -+ sc->cache_trim_mode = 0; -+ -+ /* -+ * Prevent the reclaimer from falling into the cache trap: as -+ * cache pages start out inactive, every cache fault will tip -+ * the scan balance towards the file LRU. And as the file LRU -+ * shrinks, so does the window for rotation from references. -+ * This means we have a runaway feedback loop where a tiny -+ * thrashing file LRU becomes infinitely more attractive than -+ * anon pages. Try to detect this based on file LRU size. -+ */ -+ if (!cgroup_reclaim(sc)) { -+ unsigned long total_high_wmark = 0; -+ unsigned long free, anon; -+ int z; -+ -+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -+ file = node_page_state(pgdat, NR_ACTIVE_FILE) + -+ node_page_state(pgdat, NR_INACTIVE_FILE); -+ -+ for (z = 0; z < MAX_NR_ZONES; z++) { -+ struct zone *zone = &pgdat->node_zones[z]; -+ -+ if (!managed_zone(zone)) -+ continue; -+ -+ total_high_wmark += high_wmark_pages(zone); -+ } -+ -+ /* -+ * Consider anon: if that's low too, this isn't a -+ * runaway file reclaim problem, but rather just -+ * extreme pressure. Reclaim as per usual then. -+ */ -+ anon = node_page_state(pgdat, NR_INACTIVE_ANON); -+ -+ sc->file_is_tiny = -+ file + free <= total_high_wmark && -+ !(sc->may_deactivate & DEACTIVATE_ANON) && -+ anon >> sc->priority; -+ } -+} -+ - /* - * Determine how aggressively the anon and file LRU lists should be - * scanned. -@@ -3138,109 +3241,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - unsigned long nr_reclaimed, nr_scanned; - struct lruvec *target_lruvec; - bool reclaimable = false; -- unsigned long file; - - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - again: -- /* -- * Flush the memory cgroup stats, so that we read accurate per-memcg -- * lruvec stats for heuristics. -- */ -- mem_cgroup_flush_stats(); -- - memset(&sc->nr, 0, sizeof(sc->nr)); - - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; - -- /* -- * Determine the scan balance between anon and file LRUs. 
-- */ -- spin_lock_irq(&target_lruvec->lru_lock); -- sc->anon_cost = target_lruvec->anon_cost; -- sc->file_cost = target_lruvec->file_cost; -- spin_unlock_irq(&target_lruvec->lru_lock); -- -- /* -- * Target desirable inactive:active list ratios for the anon -- * and file LRU lists. -- */ -- if (!sc->force_deactivate) { -- unsigned long refaults; -- -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_ANON); -- if (refaults != target_lruvec->refaults[0] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -- sc->may_deactivate |= DEACTIVATE_ANON; -- else -- sc->may_deactivate &= ~DEACTIVATE_ANON; -- -- /* -- * When refaults are being observed, it means a new -- * workingset is being established. Deactivate to get -- * rid of any stale active pages quickly. -- */ -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_FILE); -- if (refaults != target_lruvec->refaults[1] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -- sc->may_deactivate |= DEACTIVATE_FILE; -- else -- sc->may_deactivate &= ~DEACTIVATE_FILE; -- } else -- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -- -- /* -- * If we have plenty of inactive file pages that aren't -- * thrashing, try to reclaim those first before touching -- * anonymous pages. -- */ -- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -- sc->cache_trim_mode = 1; -- else -- sc->cache_trim_mode = 0; -- -- /* -- * Prevent the reclaimer from falling into the cache trap: as -- * cache pages start out inactive, every cache fault will tip -- * the scan balance towards the file LRU. And as the file LRU -- * shrinks, so does the window for rotation from references. -- * This means we have a runaway feedback loop where a tiny -- * thrashing file LRU becomes infinitely more attractive than -- * anon pages. Try to detect this based on file LRU size. -- */ -- if (!cgroup_reclaim(sc)) { -- unsigned long total_high_wmark = 0; -- unsigned long free, anon; -- int z; -- -- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -- file = node_page_state(pgdat, NR_ACTIVE_FILE) + -- node_page_state(pgdat, NR_INACTIVE_FILE); -- -- for (z = 0; z < MAX_NR_ZONES; z++) { -- struct zone *zone = &pgdat->node_zones[z]; -- if (!managed_zone(zone)) -- continue; -- -- total_high_wmark += high_wmark_pages(zone); -- } -- -- /* -- * Consider anon: if that's low too, this isn't a -- * runaway file reclaim problem, but rather just -- * extreme pressure. Reclaim as per usual then. 
-- */ -- anon = node_page_state(pgdat, NR_INACTIVE_ANON); -- -- sc->file_is_tiny = -- file + free <= total_high_wmark && -- !(sc->may_deactivate & DEACTIVATE_ANON) && -- anon >> sc->priority; -- } -+ prepare_scan_count(pgdat, sc); - - shrink_node_memcgs(pgdat, sc); - - -From patchwork Wed Jul 6 22:00:13 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908702 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id E07C1C43334 - for ; Wed, 6 Jul 2022 22:00:59 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id E767D6B0075; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id E26676B0078; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id CC6F16B007B; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com - [216.40.44.11]) - by kanga.kvack.org (Postfix) with ESMTP id B2AAC6B0075 - for ; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) -Received: from smtpin09.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay11.hostedemail.com (Postfix) with ESMTP id 94CBE80B22 - for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) -X-FDA: 79658045754.09.169939E -Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf07.hostedemail.com (Postfix) with ESMTP id 3DA0B4002F - for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - l6-20020a25bf86000000b00668c915a3f2so12477298ybk.4 - for ; Wed, 06 Jul 2022 15:00:56 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=GyLn4alIs4ulph3wPn6kG/6c2qN7BZlAW/LU7V/wtB8=; - b=fqvYzFJd6diatCK9xOi35jB4AbI1jOxd0dc3zbIWXBRd/oZCSL2ChL+LrZ+NDYE03d - TIPGwoUneWvzbc4OXeOfpb0FtGxmdhwy/nlPnMgq+BH+J79K/39lDuK/WznYk1HI+hzN - zL7bsRal3Q8YUC5jRMId0XoVcP/vuEU/M54E4rAJ15EBntL/F6yfHEySvrSBBtWZhnt0 - 90gyXGuo//w+Jc0ez+vgTHQxHk3TDIFEvyNKpltir9acA6/j0jGHYEfhC/r1UrED+Tt8 - m1PcqYkXSdSfGsO4GbojXKICNGmqT0/82l34NKy0jmCO9o+gJUnrEIDeiTyPT8jYjdXn - eGJQ== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=GyLn4alIs4ulph3wPn6kG/6c2qN7BZlAW/LU7V/wtB8=; - b=acqAzpv5+5bfyJLdmlUD3HKE74OO89v1YBTzb7kB54GormxgfqmSVm1QB75bSQFjxy - iRQ9yyQgUru50WX+ppP3B5N0K0edy9kKYXC5SmAf9PX7QbH0T3UYfJi5KDO5H7cptgny - 6VYcZXgdQ/ammtN/NFjOwImJ1NyoXMVSMkwXMJFoH/hDV1+/EGHNkG40d3ui2cNlIGer - 8oJrkMcYwG1L0yl5Lv1F9vCPZCaUecfxMZFvc0McrXP6BtB3ww3KPyipUsx80uRBu4PN - j9j1OZPr59Q+bLCF+TDiL77E2CPhZvzXxkw0VDq91eoHiQQwNRK+035yqGfK0i2derto - 4qrg== -X-Gm-Message-State: AJIora8TvaxLFo7c8jNCAkmO2MPTlGSpywySY2xnOvzKMd8WGYtTu0Lg - H/Iowy+yks3pdn4k6EN6JEFx/cAzaSo= -X-Google-Smtp-Source: - AGRyM1vs1GQqyrE7d9mjEL5MTBaBKoftkODnUWU3nZauu0DiGFKm6nQgOePB+L8kJ6BOwhlufj40Jzp1R4k= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:d043:0:b0:66e:31d6:4606 with SMTP id - 
h64-20020a25d043000000b0066e31d64606mr25539292ybg.241.1657144856519; Wed, 06 - Jul 2022 15:00:56 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:13 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-5-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 04/14] Revert "include/linux/mm_inline.h: fold - __update_lru_size() into its sole caller" -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Miaohe Lin , Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144857; a=rsa-sha256; - cv=none; - b=VytvHlKkiiUMJbwpI1Paeu5xydng7JksWkoUmOEgZClwpKUFRcoyC2S4kA6s+p3bljEt2Y - 85v0iGMF2ImZomoiEOeODI88v8cnakz2h3vV0KOYBRLWcUp1MoAAGc9/CatS2RZcyEZAsJ - 16N+Z8RlZvTLC+lg1BEaccSkBnce95w= -ARC-Authentication-Results: i=1; - imf07.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=fqvYzFJd; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf07.hostedemail.com: domain of - 3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144857; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=GyLn4alIs4ulph3wPn6kG/6c2qN7BZlAW/LU7V/wtB8=; - b=eWydr8xqhEMOpzfboenYQahizL48uc/GlcGbQBWuOSIiHMGD4xSQylZi9Tf6qFNtPbIcfn - a/FGlP15AWqiWFjYLC/dUvNrh+6vOaN3WEAagxoFslzh2bv3QfjEGhMrlwdaaPWaec5Ive - jXzUgyTbXLI8/pbmo1scWGTYiETc8FI= -X-Rspam-User: -X-Rspamd-Server: rspam02 -X-Rspamd-Queue-Id: 3DA0B4002F -Authentication-Results: imf07.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=fqvYzFJd; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf07.hostedemail.com: domain of - 3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com -X-Stat-Signature: 6b6krqahmtizrdq5upykdpctczw885w8 -X-HE-Tag: 1657144857-930305 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -This patch undoes the following refactor: -commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller") - -The upcoming changes to 
include/linux/mm_inline.h will reuse -__update_lru_size(). - -Signed-off-by: Yu Zhao -Reviewed-by: Miaohe Lin -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - include/linux/mm_inline.h | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index 7b25b53c474a..fb8aadb81cd6 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page) - return folio_is_file_lru(page_folio(page)); - } - --static __always_inline void update_lru_size(struct lruvec *lruvec, -+static __always_inline void __update_lru_size(struct lruvec *lruvec, - enum lru_list lru, enum zone_type zid, - long nr_pages) - { -@@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); - __mod_zone_page_state(&pgdat->node_zones[zid], - NR_ZONE_LRU_BASE + lru, nr_pages); -+} -+ -+static __always_inline void update_lru_size(struct lruvec *lruvec, -+ enum lru_list lru, enum zone_type zid, -+ long nr_pages) -+{ -+ __update_lru_size(lruvec, lru, zid, nr_pages); - #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); - #endif - -From patchwork Wed Jul 6 22:00:14 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908703 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 8DA63CCA481 - for ; Wed, 6 Jul 2022 22:01:01 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 005E26B007D; Wed, 6 Jul 2022 18:01:00 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id E9B606B007B; Wed, 6 Jul 2022 18:00:59 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id CC4B46B007D; Wed, 6 Jul 2022 18:00:59 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0010.hostedemail.com - [216.40.44.10]) - by kanga.kvack.org (Postfix) with ESMTP id BA0E36B0078 - for ; Wed, 6 Jul 2022 18:00:59 -0400 (EDT) -Received: from smtpin23.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay06.hostedemail.com (Postfix) with ESMTP id 7E56E34725 - for ; Wed, 6 Jul 2022 22:00:59 +0000 (UTC) -X-FDA: 79658045838.23.084DC6A -Received: from mail-io1-f74.google.com (mail-io1-f74.google.com - [209.85.166.74]) - by imf02.hostedemail.com (Postfix) with ESMTP id B44F280009 - for ; Wed, 6 Jul 2022 22:00:58 +0000 (UTC) -Received: by mail-io1-f74.google.com with SMTP id - h7-20020a05660224c700b0067898a33ceaso3543048ioe.13 - for ; Wed, 06 Jul 2022 15:00:58 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=y74IoQ1Un60Xq7yBx41XqudQ9pnNmGNgLv+0SiGV5r4=; - b=p293qdy+AJ1NK8wVIFYa38QTJD9CsNtfxZWrFxc99swgPytMvFTFgMhkjdcKezzZie - 
yrDLuqEO4g2bHuYcfru6gtGl/vlEBzugJSUw9t9SSuHD0KPbwuSBuj6k/Z4E6o/3VSjs - nmEwp3FaQzQrq+AvQ75NBZLJcjJnu2S/L2SRP5n2jtLL27l7UQfJTw+nlDEN61Y6wnKm - cTbYVguOwFUEjdFi2ghze0M0n87A9CNsBCyQHS9wRzczRWbW6m+LMwO/fsge9KEjZcyq - WUlwLSCnJuEi3hDOUrhrpLVnbT1LO6KIzff4/TXK4ud4HZ+BORPfFQeF2zBQpAIt8foH - VdwQ== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=y74IoQ1Un60Xq7yBx41XqudQ9pnNmGNgLv+0SiGV5r4=; - b=EWxosmI9v8pE6BcCI05EJUbjYKXIh1pLe5c6N0so7Z1m/PNwBo99ulR0RkBk+8Fmsr - 84tyZLy+Pyf35k8XxobhAfMup7WIk+mjRYfkHpt7/gxC3CR3vJEm+WqI+pDZ614pUCR3 - N6ibQFwTKBqpvRJDUwhkC5n8ePOAIkrRwZz7JGXj4eiWWJSZGxUDhwqV9gi7CHQfo8Lr - yHt/gyUcmJDvTu8Fy8aP7r187IjoODs5rbqKu518ZAL20ceKmq+HT3FFv02CyDgkXObe - H8JjcI1Ovt/TvJlosala45+Ckpmt3TNX1+aCLmAaarDpkTxNHVYvWWYlylLQp+itl3t2 - Fj5w== -X-Gm-Message-State: AJIora8xQHdkFAa4pcN+RMWZYlVPfQhLR90DF0MW6/oxA9WDgXVAJA1y - HfRDCS36QboLTpSfrlTvo6hF0/eqWKQ= -X-Google-Smtp-Source: - AGRyM1vTPigokpkBMxkuw/ymV5qWW3cjnNF2AOB7Hi8viYhEQm+kOAzrEtDgBoJ1BwoaUWa5EKU0D3T6qsI= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a05:6638:2114:b0:33e:8e12:e5ee with SMTP id - n20-20020a056638211400b0033e8e12e5eemr22734068jaj.281.1657144858015; Wed, 06 - Jul 2022 15:00:58 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:14 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-6-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 05/14] mm: multi-gen LRU: groundwork -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144858; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=y74IoQ1Un60Xq7yBx41XqudQ9pnNmGNgLv+0SiGV5r4=; - b=oTaGMrRapdp+kh/gHJqHBTnDMs7aDxVAwUqI4ZEVOcywjneTv7eRea/YKenPJ3SUgTezbZ - dNcy3RS0hdffVNFHBdEbaanNSV29TaH7bgFF9LgENwvVZGMsc8+qWj5aOjTjXDI7lU9B66 - y9zhOk8IIuWtM+JWdB8zZv2w73QNkgI= -ARC-Authentication-Results: i=1; - imf02.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=p293qdy+; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf02.hostedemail.com: domain of - 3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com - designates 209.85.166.74 as permitted sender) - 
smtp.mailfrom=3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144858; a=rsa-sha256; - cv=none; - b=5jGHjx/lCilMy07izrrxr4gRusLHe7TVfK6eNjlXnYmVVS2CdTSjlX6iI6cgO5jY/6Otqp - 7rnHHbSwj6t4vUkRkbfhWehDTUsU9TXEcEaZ8NHjLgX8tJZID/D2dcfA1Z/Ae/1iB6tbQa - vuAWajsuByCUT6SlkfXfwe+TOdR4BNI= -X-Rspamd-Server: rspam11 -X-Rspam-User: -X-Stat-Signature: gxdxu955m74iz4kx3hhyt9yhtcgejep5 -X-Rspamd-Queue-Id: B44F280009 -Authentication-Results: imf02.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=p293qdy+; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf02.hostedemail.com: domain of - 3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com - designates 209.85.166.74 as permitted sender) - smtp.mailfrom=3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com -X-HE-Tag: 1657144858-162393 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Evictable pages are divided into multiple generations for each lruvec. -The youngest generation number is stored in lrugen->max_seq for both -anon and file types as they are aged on an equal footing. The oldest -generation numbers are stored in lrugen->min_seq[] separately for anon -and file types as clean file pages can be evicted regardless of swap -constraints. These three variables are monotonically increasing. - -Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits -in order to fit into the gen counter in folio->flags. Each truncated -generation number is an index to lrugen->lists[]. The sliding window -technique is used to track at least MIN_NR_GENS and at most -MAX_NR_GENS generations. The gen counter stores a value within [1, -MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it -stores 0. - -There are two conceptually independent procedures: "the aging", which -produces young generations, and "the eviction", which consumes old -generations. They form a closed-loop system, i.e., "the page reclaim". -Both procedures can be invoked from userspace for the purposes of -working set estimation and proactive reclaim. These techniques are -commonly used to optimize job scheduling (bin packing) in data -centers [1][2]. - -To avoid confusion, the terms "hot" and "cold" will be applied to the -multi-gen LRU, as a new convention; the terms "active" and "inactive" -will be applied to the active/inactive LRU, as usual. - -The protection of hot pages and the selection of cold pages are based -on page access channels and patterns. There are two access channels: -one through page tables and the other through file descriptors. The -protection of the former channel is by design stronger because: -1. The uncertainty in determining the access patterns of the former - channel is higher due to the approximation of the accessed bit. -2. The cost of evicting the former channel is higher due to the TLB - flushes required and the likelihood of encountering the dirty bit. -3. The penalty of underprotecting the former channel is higher because - applications usually do not prepare themselves for major page - faults like they do for blocked I/O. E.g., GUI applications - commonly use dedicated I/O threads to avoid blocking rendering - threads. -There are also two access patterns: one with temporal locality and the -other without. 
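To make the truncation described above concrete (gen_index() is a made-up name; the series itself provides lru_gen_from_seq() with the same body):

        static inline int gen_index(unsigned long seq)
        {
                return seq % MAX_NR_GENS;       /* MAX_NR_GENS == 4 */
        }

With max_seq == 7 and min_seq == 5, the three live generations occupy lrugen->lists[3], lists[2] and lists[1]; advancing max_seq to 8 wraps around and reuses index 0, which is what makes the window slide.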
For the reasons listed above, the former channel is -assumed to follow the former pattern unless VM_SEQ_READ or -VM_RAND_READ is present; the latter channel is assumed to follow the -latter pattern unless outlying refaults have been observed [3][4]. - -The next patch will address the "outlying refaults". Three macros, -i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are -added in this patch to make the entire patchset less diffy. - -A page is added to the youngest generation on faulting. The aging -needs to check the accessed bit at least twice before handing this -page over to the eviction. The first check takes care of the accessed -bit set on the initial fault; the second check makes sure this page -has not been used since then. This protocol, AKA second chance, -requires a minimum of two generations, hence MIN_NR_GENS. - -[1] https://dl.acm.org/doi/10.1145/3297858.3304053 -[2] https://dl.acm.org/doi/10.1145/3503222.3507731 -[3] https://lwn.net/Articles/495543/ -[4] https://lwn.net/Articles/815342/ - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - fs/fuse/dev.c | 3 +- - include/linux/mm.h | 2 + - include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++ - include/linux/mmzone.h | 100 +++++++++++++++++ - include/linux/page-flags-layout.h | 13 ++- - include/linux/page-flags.h | 4 +- - include/linux/sched.h | 4 + - kernel/bounds.c | 5 + - mm/Kconfig | 8 ++ - mm/huge_memory.c | 3 +- - mm/memcontrol.c | 2 + - mm/memory.c | 25 +++++ - mm/mm_init.c | 6 +- - mm/mmzone.c | 2 + - mm/swap.c | 9 +- - mm/vmscan.c | 75 +++++++++++++ - 16 files changed, 423 insertions(+), 13 deletions(-) - -diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c -index 0e537e580dc1..5d36015071d2 100644 ---- a/fs/fuse/dev.c -+++ b/fs/fuse/dev.c -@@ -777,7 +777,8 @@ static int fuse_check_page(struct page *page) - 1 << PG_active | - 1 << PG_workingset | - 1 << PG_reclaim | -- 1 << PG_waiters))) { -+ 1 << PG_waiters | -+ LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); - return 1; - } -diff --git a/include/linux/mm.h b/include/linux/mm.h -index cf3d0d673f6b..ed5393e5930d 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -1060,6 +1060,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); - #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) - #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) - #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) -+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) -+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) - - /* - * Define the bit shifts to access each section. 
For non-existent -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index fb8aadb81cd6..2ff703900fd0 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, - { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - -+ lockdep_assert_held(&lruvec->lru_lock); -+ WARN_ON_ONCE(nr_pages != (int)nr_pages); -+ - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); - __mod_zone_page_state(&pgdat->node_zones[zid], - NR_ZONE_LRU_BASE + lru, nr_pages); -@@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) - return lru; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static inline bool lru_gen_enabled(void) -+{ -+ return true; -+} -+ -+static inline bool lru_gen_in_fault(void) -+{ -+ return current->in_lru_fault; -+} -+ -+static inline int lru_gen_from_seq(unsigned long seq) -+{ -+ return seq % MAX_NR_GENS; -+} -+ -+static inline int folio_lru_gen(struct folio *folio) -+{ -+ unsigned long flags = READ_ONCE(folio->flags); -+ -+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) -+{ -+ unsigned long max_seq = lruvec->lrugen.max_seq; -+ -+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); -+ -+ /* see the comment on MIN_NR_GENS */ -+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); -+} -+ -+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, -+ int old_gen, int new_gen) -+{ -+ int type = folio_is_file_lru(folio); -+ int zone = folio_zonenum(folio); -+ int delta = folio_nr_pages(folio); -+ enum lru_list lru = type * LRU_INACTIVE_FILE; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); -+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); -+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); -+ -+ if (old_gen >= 0) -+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], -+ lrugen->nr_pages[old_gen][type][zone] - delta); -+ if (new_gen >= 0) -+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], -+ lrugen->nr_pages[new_gen][type][zone] + delta); -+ -+ /* addition */ -+ if (old_gen < 0) { -+ if (lru_gen_is_active(lruvec, new_gen)) -+ lru += LRU_ACTIVE; -+ __update_lru_size(lruvec, lru, zone, delta); -+ return; -+ } -+ -+ /* deletion */ -+ if (new_gen < 0) { -+ if (lru_gen_is_active(lruvec, old_gen)) -+ lru += LRU_ACTIVE; -+ __update_lru_size(lruvec, lru, zone, -delta); -+ return; -+ } -+} -+ -+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) -+{ -+ unsigned long seq; -+ unsigned long flags; -+ int gen = folio_lru_gen(folio); -+ int type = folio_is_file_lru(folio); -+ int zone = folio_zonenum(folio); -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); -+ -+ if (folio_test_unevictable(folio)) -+ return false; -+ /* -+ * There are three common cases for this page: -+ * 1. If it's hot, e.g., freshly faulted in or previously hot and -+ * migrated, add it to the youngest generation. -+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page -+ * not in swapcache or a dirty page pending writeback, add it to the -+ * second oldest generation. -+ * 3. Everything else (clean, cold) is added to the oldest generation. 
-+ */ -+ if (folio_test_active(folio)) -+ seq = lrugen->max_seq; -+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || -+ (folio_test_reclaim(folio) && -+ (folio_test_dirty(folio) || folio_test_writeback(folio)))) -+ seq = lrugen->min_seq[type] + 1; -+ else -+ seq = lrugen->min_seq[type]; -+ -+ gen = lru_gen_from_seq(seq); -+ flags = (gen + 1UL) << LRU_GEN_PGOFF; -+ /* see the comment on MIN_NR_GENS about PG_active */ -+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); -+ -+ lru_gen_update_size(lruvec, folio, -1, gen); -+ /* for folio_rotate_reclaimable() */ -+ if (reclaiming) -+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); -+ else -+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]); -+ -+ return true; -+} -+ -+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) -+{ -+ unsigned long flags; -+ int gen = folio_lru_gen(folio); -+ -+ if (gen < 0) -+ return false; -+ -+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); -+ -+ /* for folio_migrate_flags() */ -+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; -+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); -+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ -+ lru_gen_update_size(lruvec, folio, gen, -1); -+ list_del(&folio->lru); -+ -+ return true; -+} -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline bool lru_gen_enabled(void) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_in_fault(void) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static __always_inline - void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) - { - enum lru_list lru = folio_lru_list(folio); - -+ if (lru_gen_add_folio(lruvec, folio, false)) -+ return; -+ - update_lru_size(lruvec, lru, folio_zonenum(folio), - folio_nr_pages(folio)); - if (lru != LRU_UNEVICTABLE) -@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) - { - enum lru_list lru = folio_lru_list(folio); - -+ if (lru_gen_add_folio(lruvec, folio, true)) -+ return; -+ - update_lru_size(lruvec, lru, folio_zonenum(folio), - folio_nr_pages(folio)); - /* This is not expected to be used on LRU_UNEVICTABLE */ -@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) - { - enum lru_list lru = folio_lru_list(folio); - -+ if (lru_gen_del_folio(lruvec, folio, false)) -+ return; -+ - if (lru != LRU_UNEVICTABLE) - list_del(&folio->lru); - update_lru_size(lruvec, lru, folio_zonenum(folio), -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index aab70355d64f..c90c2282044e 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -314,6 +314,102 @@ enum lruvec_flags { - */ - }; - -+#endif /* !__GENERATING_BOUNDS_H */ -+ -+/* -+ * Evictable pages are divided into multiple generations. The youngest and the -+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing. -+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An -+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the -+ * corresponding generation. 
The gen counter in folio->flags stores gen+1 while -+ * a page is on one of lrugen->lists[]. Otherwise it stores 0. -+ * -+ * A page is added to the youngest generation on faulting. The aging needs to -+ * check the accessed bit at least twice before handing this page over to the -+ * eviction. The first check takes care of the accessed bit set on the initial -+ * fault; the second check makes sure this page hasn't been used since then. -+ * This process, AKA second chance, requires a minimum of two generations, -+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive -+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the -+ * rest of generations, if they exist, are considered inactive. See -+ * lru_gen_is_active(). -+ * -+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that -+ * the aging needs not to worry about it. And it's set again when a page -+ * considered active is isolated for non-reclaiming purposes, e.g., migration. -+ * See lru_gen_add_folio() and lru_gen_del_folio(). -+ * -+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the -+ * number of categories of the active/inactive LRU when keeping track of -+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits -+ * in folio->flags. -+ */ -+#define MIN_NR_GENS 2U -+#define MAX_NR_GENS 4U -+ -+#ifndef __GENERATING_BOUNDS_H -+ -+struct lruvec; -+ -+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) -+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -+ -+#ifdef CONFIG_LRU_GEN -+ -+enum { -+ LRU_GEN_ANON, -+ LRU_GEN_FILE, -+}; -+ -+/* -+ * The youngest generation number is stored in max_seq for both anon and file -+ * types as they are aged on an equal footing. The oldest generation numbers are -+ * stored in min_seq[] separately for anon and file types as clean file pages -+ * can be evicted regardless of swap constraints. -+ * -+ * Normally anon and file min_seq are in sync. But if swapping is constrained, -+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon -+ * min_seq behind. -+ * -+ * The number of pages in each generation is eventually consistent and therefore -+ * can be transiently negative. 
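(Illustrative aside, not in the patch: given the "transiently negative" caveat just above, a reader of these counters might clamp at zero, e.g.

        long n = READ_ONCE(lrugen->nr_pages[gen][type][zone]);

        total += max(n, 0L);    /* a negative snapshot just means "empty" */

rather than treat the raw value as exact.)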
-+ */ -+struct lru_gen_struct { -+ /* the aging increments the youngest generation number */ -+ unsigned long max_seq; -+ /* the eviction increments the oldest generation numbers */ -+ unsigned long min_seq[ANON_AND_FILE]; -+ /* the multi-gen LRU lists, lazily sorted on eviction */ -+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the multi-gen LRU sizes, eventually consistent */ -+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+}; -+ -+void lru_gen_init_lruvec(struct lruvec *lruvec); -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg); -+void lru_gen_exit_memcg(struct mem_cgroup *memcg); -+#endif -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_init_lruvec(struct lruvec *lruvec) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) -+{ -+} -+#endif -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct lruvec { - struct list_head lists[NR_LRU_LISTS]; - /* per lruvec lru_lock for memcg */ -@@ -331,6 +427,10 @@ struct lruvec { - unsigned long refaults[ANON_AND_FILE]; - /* Various lruvec state flags (enum lruvec_flags) */ - unsigned long flags; -+#ifdef CONFIG_LRU_GEN -+ /* evictable pages divided into generations */ -+ struct lru_gen_struct lrugen; -+#endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; - #endif -diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h -index ef1e3e736e14..240905407a18 100644 ---- a/include/linux/page-flags-layout.h -+++ b/include/linux/page-flags-layout.h -@@ -55,7 +55,8 @@ - #define SECTIONS_WIDTH 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ -+ <= BITS_PER_LONG - NR_PAGEFLAGS - #define NODES_WIDTH NODES_SHIFT - #elif defined(CONFIG_SPARSEMEM_VMEMMAP) - #error "Vmemmap: No space for nodes field in page flags" -@@ -89,8 +90,8 @@ - #define LAST_CPUPID_SHIFT 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ -- <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS - #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT - #else - #define LAST_CPUPID_WIDTH 0 -@@ -100,10 +101,12 @@ - #define LAST_CPUPID_NOT_IN_PAGE_FLAGS - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ -- > BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS - #error "Not enough bits in page flags" - #endif - -+#define LRU_REFS_WIDTH 0 -+ - #endif - #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h -index e66f7aa3191d..8d466d724852 100644 ---- a/include/linux/page-flags.h -+++ b/include/linux/page-flags.h -@@ -1059,7 +1059,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) - 1UL << PG_private | 1UL << PG_private_2 | \ - 1UL << PG_writeback | 1UL << PG_reserved | \ - 1UL << PG_slab | 1UL << PG_active | \ -- 1UL << PG_unevictable | __PG_MLOCKED) -+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) - - /* - * Flags checked when a page is prepped for return by the page allocator. 
-@@ -1070,7 +1070,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) - * alloc-free cycle to prevent from reusing the page. - */ - #define PAGE_FLAGS_CHECK_AT_PREP \ -- (PAGEFLAGS_MASK & ~__PG_HWPOISON) -+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) - - #define PAGE_FLAGS_PRIVATE \ - (1UL << PG_private | 1UL << PG_private_2) -diff --git a/include/linux/sched.h b/include/linux/sched.h -index c46f3a63b758..744340a96ace 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -912,6 +912,10 @@ struct task_struct { - #ifdef CONFIG_MEMCG - unsigned in_user_fault:1; - #endif -+#ifdef CONFIG_LRU_GEN -+ /* whether the LRU algorithm may apply to this access */ -+ unsigned in_lru_fault:1; -+#endif - #ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; - #endif -diff --git a/kernel/bounds.c b/kernel/bounds.c -index 9795d75b09b2..5ee60777d8e4 100644 ---- a/kernel/bounds.c -+++ b/kernel/bounds.c -@@ -22,6 +22,11 @@ int main(void) - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); - #endif - DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); -+#ifdef CONFIG_LRU_GEN -+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); -+#else -+ DEFINE(LRU_GEN_WIDTH, 0); -+#endif - /* End of constants */ - - return 0; -diff --git a/mm/Kconfig b/mm/Kconfig -index 169e64192e48..cee109f3128a 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -1130,6 +1130,14 @@ config PTE_MARKER_UFFD_WP - purposes. It is required to enable userfaultfd write protection on - file-backed memory types like shmem and hugetlbfs. - -+config LRU_GEN -+ bool "Multi-Gen LRU" -+ depends on MMU -+ # make sure folio->flags has enough spare bits -+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP -+ help -+ A high performance LRU implementation to overcommit memory. 
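The "make sure folio->flags has enough spare bits" dependency above exists because the generation counter is packed into otherwise unused folio->flags bits, sized as order_base_2(MAX_NR_GENS + 1) in the kernel/bounds.c hunk. A standalone sketch of that budget check follows; the NR_PAGEFLAGS and zone/node/cpupid widths are assumed example values for a typical 64-bit build, not read from any real .config, and LRU_REFS_WIDTH is 0 at this point in the series, growing to MAX_NR_TIERS - 2 in the follow-up patch further down:

  /* illustrative arithmetic only; not part of the patch */
  #include <stdio.h>

  #define MAX_NR_GENS    4      /* from the patch */
  #define LRU_GEN_WIDTH  3      /* order_base_2(MAX_NR_GENS + 1) */
  #define LRU_REFS_WIDTH 2      /* MAX_NR_TIERS - 2, added later in the series */

  int main(void)
  {
          /* assumed example layout of a 64-bit flags word */
          int bits_per_long = 64, nr_pageflags = 27;
          int zones = 2, sections = 0, nodes = 6, kasan = 0, last_cpupid = 21;
          int used = nr_pageflags + zones + sections + nodes + kasan +
                     last_cpupid + LRU_GEN_WIDTH + LRU_REFS_WIDTH;

          printf("used %d of %d flag bits -> %s\n", used, bits_per_long,
                 used <= bits_per_long ? "fits" : "no spare room");
          return 0;
  }

Configurations where the section bits crowd out the spare room are what the 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP dependency guards against; the authoritative check is the page-flags-layout.h hunk above.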
-+ - source "mm/damon/Kconfig" - - endmenu -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 834f288b3769..5500583e35b8 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -2370,7 +2370,8 @@ static void __split_huge_page_tail(struct page *head, int tail, - #ifdef CONFIG_64BIT - (1L << PG_arch_2) | - #endif -- (1L << PG_dirty))); -+ (1L << PG_dirty) | -+ LRU_GEN_MASK | LRU_REFS_MASK)); - - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 618c366a2f07..7d58e8a73ece 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5105,6 +5105,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) - - static void mem_cgroup_free(struct mem_cgroup *memcg) - { -+ lru_gen_exit_memcg(memcg); - memcg_wb_domain_exit(memcg); - __mem_cgroup_free(memcg); - } -@@ -5163,6 +5164,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) - memcg->deferred_split_queue.split_queue_len = 0; - #endif - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); -+ lru_gen_init_memcg(memcg); - return memcg; - fail: - mem_cgroup_id_remove(memcg); -diff --git a/mm/memory.c b/mm/memory.c -index 49500390b91b..85d3961c2bd5 100644 ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -5091,6 +5091,27 @@ static inline void mm_account_fault(struct pt_regs *regs, - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); - } - -+#ifdef CONFIG_LRU_GEN -+static void lru_gen_enter_fault(struct vm_area_struct *vma) -+{ -+ /* the LRU algorithm doesn't apply to sequential or random reads */ -+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); -+} -+ -+static void lru_gen_exit_fault(void) -+{ -+ current->in_lru_fault = false; -+} -+#else -+static void lru_gen_enter_fault(struct vm_area_struct *vma) -+{ -+} -+ -+static void lru_gen_exit_fault(void) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - /* - * By the time we get here, we already hold the mm semaphore - * -@@ -5122,11 +5143,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - if (flags & FAULT_FLAG_USER) - mem_cgroup_enter_user_fault(); - -+ lru_gen_enter_fault(vma); -+ - if (unlikely(is_vm_hugetlb_page(vma))) - ret = hugetlb_fault(vma->vm_mm, vma, address, flags); - else - ret = __handle_mm_fault(vma, address, flags); - -+ lru_gen_exit_fault(); -+ - if (flags & FAULT_FLAG_USER) { - mem_cgroup_exit_user_fault(); - /* -diff --git a/mm/mm_init.c b/mm/mm_init.c -index 9ddaf0e1b0ab..0d7b2bd2454a 100644 ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) - - shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH -- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; -+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", -- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", -+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", - SECTIONS_WIDTH, - NODES_WIDTH, - ZONES_WIDTH, - LAST_CPUPID_WIDTH, - KASAN_TAG_WIDTH, -+ LRU_GEN_WIDTH, -+ LRU_REFS_WIDTH, - NR_PAGEFLAGS); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", -diff --git a/mm/mmzone.c b/mm/mmzone.c -index 0ae7571e35ab..68e1511be12d 100644 ---- a/mm/mmzone.c -+++ b/mm/mmzone.c -@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec) - * Poison its list head, so that any 
operations on it would crash. - */ - list_del(&lruvec->lists[LRU_UNEVICTABLE]); -+ -+ lru_gen_init_lruvec(lruvec); - } - - #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) -diff --git a/mm/swap.c b/mm/swap.c -index 034bb24879a3..b062729b340f 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -460,6 +460,11 @@ void folio_add_lru(struct folio *folio) - VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - -+ /* see the comment in lru_gen_add_folio() */ -+ if (lru_gen_enabled() && !folio_test_unevictable(folio) && -+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) -+ folio_set_active(folio); -+ - folio_get(folio); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); -@@ -551,7 +556,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) - - static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) - { -- if (PageActive(page) && !PageUnevictable(page)) { -+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - int nr_pages = thp_nr_pages(page); - - del_page_from_lru_list(page, lruvec); -@@ -666,7 +671,7 @@ void deactivate_file_folio(struct folio *folio) - */ - void deactivate_page(struct page *page) - { -- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { -+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); -diff --git a/mm/vmscan.c b/mm/vmscan.c -index fddb9bd3c6c2..1fcc0feed985 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2992,6 +2992,81 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, - return can_demote(pgdat->node_id, sc); - } - -+#ifdef CONFIG_LRU_GEN -+ -+/****************************************************************************** -+ * shorthand helpers -+ ******************************************************************************/ -+ -+#define for_each_gen_type_zone(gen, type, zone) \ -+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ -+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) -+ -+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) -+{ -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) { -+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; -+ -+ /* for hotadd_new_pgdat() */ -+ if (!lruvec->pgdat) -+ lruvec->pgdat = pgdat; -+ -+ return lruvec; -+ } -+#endif -+ VM_WARN_ON_ONCE(!mem_cgroup_disabled()); -+ -+ return pgdat ? 
&pgdat->__lruvec : NULL; -+} -+ -+/****************************************************************************** -+ * initialization -+ ******************************************************************************/ -+ -+void lru_gen_init_lruvec(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ lrugen->max_seq = MIN_NR_GENS + 1; -+ -+ for_each_gen_type_zone(gen, type, zone) -+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+} -+ -+void lru_gen_exit_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, -+ sizeof(lruvec->lrugen.nr_pages))); -+ } -+} -+#endif -+ -+static int __init init_lru_gen(void) -+{ -+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); -+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ -+ return 0; -+}; -+late_initcall(init_lru_gen); -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - unsigned long nr[NR_LRU_LISTS]; - -From patchwork Wed Jul 6 22:00:15 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908704 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id BC3F5C43334 - for ; Wed, 6 Jul 2022 22:01:04 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 12E3C8E0002; Wed, 6 Jul 2022 18:01:02 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 0B84D8E0001; Wed, 6 Jul 2022 18:01:02 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id DAFB38E0002; Wed, 6 Jul 2022 18:01:01 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0016.hostedemail.com - [216.40.44.16]) - by kanga.kvack.org (Postfix) with ESMTP id C1CFC8E0001 - for ; Wed, 6 Jul 2022 18:01:01 -0400 (EDT) -Received: from smtpin27.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay02.hostedemail.com (Postfix) with ESMTP id 8F7AA33A54 - for ; Wed, 6 Jul 2022 22:01:01 +0000 (UTC) -X-FDA: 79658045922.27.E9F2FD6 -Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf27.hostedemail.com (Postfix) with ESMTP id C38524001B - for ; Wed, 6 Jul 2022 22:01:00 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - u17-20020a258411000000b0066dfb22644eso11129264ybk.6 - for ; Wed, 06 Jul 2022 15:01:00 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=y0ZxSqiIOv2HRYm553wZrJx5fChLkGPPbLO1qwZgmyQ=; - b=k4czIYvx4CiuCTGm0ZE5CP3ROAwcGkVPLViBUVhaVvkR7uaNKMq35oiGoZrpr9wmyA - 3m25Gt55w07/Zl+RDxl25UcbFclUuv1IhW8RxSswLcgrHkQRPfvrY4sHXWvh8Zx9tcVy - 57vPZrwMAdg5KxxrjfPcq/qdHGTF/uyJnTdFe8v4GztZ5hfTrusX1wVVySS9zGZ/5Iow - Nd9yluqy3C3Vy/90KJx2guGDz9MOF3sU6l1ICpYZ9vNR6C8Rq/+pMVqKsY9lUtmogcQ9 - 4GYcy0Nvop1G8oE5zpjlPJBv9NQtnMO9nw2qaCn4RWoOH37nG4jPSXNMIBpa8zn061RW - FgQg== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - 
h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=y0ZxSqiIOv2HRYm553wZrJx5fChLkGPPbLO1qwZgmyQ=; - b=363aZyaQrxgNeIHPuTaRjAavjP4F8EO5sILVoIfz7F8ymEnUrJ32Mjc0rSEnA9jAM9 - iqLFSFIFzIMWUuTmljy1fFNDL7A2uNdDlNrJRCZ/gZAbXFjDT2j5Dl8E8XzpIRlJl7vn - ZRDzjDR9sVo08B1nyi1AhFP4nb1L2qE8qvTpXkzENYGDSZu7h/AALKU8/CiZpj9hkDwo - lzh5wc2ycnx5mXpDF9ieinUDPgG+YeeoSleAk0FgtV+zyB5xkdIruPNpSueQff+ct/G8 - 5c1VriWHEfaNMcJkuREKnILndD3W/O2w5mkcChSKsKDm9/nd8r2q/EJUgq2xca+u+CAl - jEVQ== -X-Gm-Message-State: AJIora/mFkYkqX5X5k4lKDoDCm8/beFZvhHqMOoV25In9oaB8n7Wpsnu - TVd/VrxMHywvnAlU0/ugaxgGqWgltsw= -X-Google-Smtp-Source: - AGRyM1sRDxtae2IlBgXPvJfXEts8Wxw8Va1kZtVIMGzblX4Mg8zS6Ie6RM5yT6WBMCN4GAE5u4jJ09Jf3oM= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a81:ab4d:0:b0:31c:8655:2207 with SMTP id - d13-20020a81ab4d000000b0031c86552207mr26036050ywk.389.1657144860068; Wed, 06 - Jul 2022 15:01:00 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:15 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-7-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 06/14] mm: multi-gen LRU: minimal implementation -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144861; a=rsa-sha256; - cv=none; - b=oTd2WxyeO8ccfm0UBIeIMD+jFWftz29Vc+53VsIdewSZAb8/4ceMzzXxauEqqrmAUtsLQ4 - sWFeVaIcSbnT8ZbgPae4FumiKT2ISp4qcEqBL74ek6P+YSnzhBoTUB4RYYRJ4JqS5sa2rW - hk5QljWrRnJjE4lY/D16EloP8YSx7T8= -ARC-Authentication-Results: i=1; - imf27.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=k4czIYvx; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf27.hostedemail.com: domain of - 3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144861; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=y0ZxSqiIOv2HRYm553wZrJx5fChLkGPPbLO1qwZgmyQ=; - b=wTs1b9ocf9FcHn9gYIlmuegnIgTo09PHZ8oYnB8j0wXjHhk0Al+NkNixxAvIfaCH4rMsxI - ErhpOzYCe9rwuJ5BAQvblyNUvN2Y5/i9ASXhp2bGy5PaMkTpI8OeOqjiGL9EQQonR3t7UB - j3QLmoVYs9VO0LxxgVoIQEv9nGf6zf8= -X-Stat-Signature: 653crzffxniht38wad94goaie9ebgi3z 
-X-Rspam-User: -X-Rspamd-Server: rspam12 -X-Rspamd-Queue-Id: C38524001B -Authentication-Results: imf27.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=k4czIYvx; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf27.hostedemail.com: domain of - 3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com -X-HE-Tag: 1657144860-126552 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -To avoid confusion, the terms "promotion" and "demotion" will be -applied to the multi-gen LRU, as a new convention; the terms -"activation" and "deactivation" will be applied to the active/inactive -LRU, as usual. - -The aging produces young generations. Given an lruvec, it increments -max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging -promotes hot pages to the youngest generation when it finds them -accessed through page tables; the demotion of cold pages happens -consequently when it increments max_seq. Promotion in the aging path -does not involve any LRU list operations, only the updates of the gen -counter and lrugen->nr_pages[]; demotion, unless as the result of the -increment of max_seq, requires LRU list operations, e.g., -lru_deactivate_fn(). The aging has the complexity O(nr_hot_pages), -since it is only interested in hot pages. - -The eviction consumes old generations. Given an lruvec, it increments -min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes -empty. A feedback loop modeled after the PID controller monitors -refaults over anon and file types and decides which type to evict when -both types are available from the same generation. - -The protection of pages accessed multiple times through file -descriptors takes place in the eviction path. Each generation is -divided into multiple tiers. A page accessed N times through file -descriptors is in tier order_base_2(N). Tiers do not have dedicated -lrugen->lists[], only bits in folio->flags. The aforementioned -feedback loop also monitors refaults over all tiers and decides when -to protect pages in which tiers (N>1), using the first tier (N=0,1) as -a baseline. The first tier contains single-use unmapped clean pages, -which are most likely the best choices. In contrast to promotion in -the aging path, the protection of a page in the eviction path is -achieved by moving this page to the next generation, i.e., min_seq+1, -if the feedback loop decides so. This approach has the following -advantages: -1. It removes the cost of activation in the buffered access path by - inferring whether pages accessed multiple times through file - descriptors are statistically hot and thus worth protecting in the - eviction path. -2. It takes pages accessed through page tables into account and avoids - overprotecting pages accessed multiple times through file - descriptors. (Pages accessed through page tables are in the first - tier, since N=0.) -3. More tiers provide better protection for pages accessed more than - twice through file descriptors, when under heavy buffered I/O - workloads. 
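Before the benchmark numbers, a standalone sketch of the index arithmetic just described: generations are ring-indexed by seq % MAX_NR_GENS, and a page accessed N times through file descriptors sits in tier order_base_2(N), mirroring lru_gen_from_seq() and lru_tier_from_refs() in the diff below. The program is illustrative only; the access counts fed in are made-up examples, and the real refs counter saturates at BIT(LRU_REFS_WIDTH), which the clamp here only approximates.

  #include <stdio.h>

  #define MAX_NR_GENS  4
  #define MAX_NR_TIERS 4

  /* smallest k with 2^k >= n, i.e. the kernel's order_base_2() */
  static int order_base_2(unsigned int n)
  {
          int k = 0;

          while ((1U << k) < n)
                  k++;
          return k;
  }

  /* cf. lru_gen_from_seq(): a sliding window of seqs maps onto 4 lists */
  static int gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  /* tier of a page accessed n times through file descriptors; n=0 and
   * n=1 both land in the first tier, further accesses move it up
   * logarithmically */
  static int tier_from_accesses(unsigned int n)
  {
          int tier = n ? order_base_2(n) : 0;

          return tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;
  }

  int main(void)
  {
          unsigned long seq;
          unsigned int n;

          for (seq = 7; seq < 11; seq++)
                  printf("seq %lu -> gen %d\n", seq, gen_from_seq(seq));
          for (n = 0; n <= 8; n++)
                  printf("%u accesses -> tier %d\n", n, tier_from_accesses(n));
          return 0;
  }

Tiers have no dedicated lists; they live entirely in folio->flags, which is what keeps the buffered access path down to a couple of atomic bit operations.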
- -Server benchmark results: - Single workload: - fio (buffered I/O): +[30, 32]% - IOPS BW - 5.19-rc1: 2673k 10.2GiB/s - patch1-6: 3491k 13.3GiB/s - - Single workload: - memcached (anon): -[4, 6]% - Ops/sec KB/sec - 5.19-rc1: 1161501.04 45177.25 - patch1-6: 1106168.46 43025.04 - - Configurations: - CPU: two Xeon 6154 - Mem: total 256G - - Node 1 was only used as a ram disk to reduce the variance in the - results. - - patch drivers/block/brd.c < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE; - > page = alloc_pages_node(1, gfp_flags, 0); - EOF - - cat >>/etc/systemd/system.conf <>/etc/memcached.conf </sys/fs/cgroup/user.slice/test/memory.max - echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs - fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \ - --buffered=1 --ioengine=io_uring --iodepth=128 \ - --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ - --rw=randread --random_distribution=random --norandommap \ - --time_based --ramp_time=10m --runtime=5m --group_reporting - - cat memcached.sh - modprobe brd rd_nr=1 rd_size=113246208 - swapoff -a - mkswap /dev/ram0 - swapon /dev/ram0 - - memtier_benchmark -S /var/run/memcached/memcached.sock \ - -P memcache_binary -n allkeys --key-minimum=1 \ - --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \ - --ratio 1:0 --pipeline 8 -d 2000 - - memtier_benchmark -S /var/run/memcached/memcached.sock \ - -P memcache_binary -n allkeys --key-minimum=1 \ - --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \ - --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed - -Client benchmark results: - kswapd profiles: - 5.19-rc1 - 40.33% page_vma_mapped_walk (overhead) - 21.80% lzo1x_1_do_compress (real work) - 7.53% do_raw_spin_lock - 3.95% _raw_spin_unlock_irq - 2.52% vma_interval_tree_iter_next - 2.37% folio_referenced_one - 2.28% vma_interval_tree_subtree_search - 1.97% anon_vma_interval_tree_iter_first - 1.60% ptep_clear_flush - 1.06% __zram_bvec_write - - patch1-6 - 39.03% lzo1x_1_do_compress (real work) - 18.47% page_vma_mapped_walk (overhead) - 6.74% _raw_spin_unlock_irq - 3.97% do_raw_spin_lock - 2.49% ptep_clear_flush - 2.48% anon_vma_interval_tree_iter_first - 1.92% folio_referenced_one - 1.88% __zram_bvec_write - 1.48% memmove - 1.31% vma_interval_tree_iter_next - - Configurations: - CPU: single Snapdragon 7c - Mem: total 4G - - Chrome OS MemoryPressure [1] - -[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/ - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - include/linux/mm_inline.h | 36 ++ - include/linux/mmzone.h | 41 ++ - include/linux/page-flags-layout.h | 5 +- - kernel/bounds.c | 2 + - mm/Kconfig | 11 + - mm/swap.c | 39 ++ - mm/vmscan.c | 810 +++++++++++++++++++++++++++++- - mm/workingset.c | 110 +++- - 8 files changed, 1044 insertions(+), 10 deletions(-) - -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index 2ff703900fd0..f2b2296a42f9 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -121,6 +121,33 @@ static inline int lru_gen_from_seq(unsigned long seq) - return seq % MAX_NR_GENS; - } - -+static inline int lru_hist_from_seq(unsigned long seq) -+{ -+ return seq % NR_HIST_GENS; -+} -+ -+static inline int 
lru_tier_from_refs(int refs) -+{ -+ VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); -+ -+ /* see the comment in folio_lru_refs() */ -+ return order_base_2(refs + 1); -+} -+ -+static inline int folio_lru_refs(struct folio *folio) -+{ -+ unsigned long flags = READ_ONCE(folio->flags); -+ bool workingset = flags & BIT(PG_workingset); -+ -+ /* -+ * Return the number of accesses beyond PG_referenced, i.e., N-1 if the -+ * total number of accesses is N>1, since N=0,1 both map to the first -+ * tier. lru_tier_from_refs() will account for this off-by-one. Also see -+ * the comment on MAX_NR_TIERS. -+ */ -+ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -+} -+ - static inline int folio_lru_gen(struct folio *folio) - { - unsigned long flags = READ_ONCE(folio->flags); -@@ -173,6 +200,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli - __update_lru_size(lruvec, lru, zone, -delta); - return; - } -+ -+ /* promotion */ -+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { -+ __update_lru_size(lruvec, lru, zone, -delta); -+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ -+ /* demotion requires isolation, e.g., lru_deactivate_fn() */ -+ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); - } - - static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index c90c2282044e..0d76222501ed 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -347,6 +347,28 @@ enum lruvec_flags { - #define MIN_NR_GENS 2U - #define MAX_NR_GENS 4U - -+/* -+ * Each generation is divided into multiple tiers. A page accessed N times -+ * through file descriptors is in tier order_base_2(N). A page in the first tier -+ * (N=0,1) is marked by PG_referenced unless it was faulted in through page -+ * tables or read ahead. A page in any other tier (N>1) is marked by -+ * PG_referenced and PG_workingset. This implies a minimum of two tiers is -+ * supported without using additional bits in folio->flags. -+ * -+ * In contrast to moving across generations which requires the LRU lock, moving -+ * across tiers only involves atomic operations on folio->flags and therefore -+ * has a negligible cost in the buffered access path. In the eviction path, -+ * comparisons of refaulted/(evicted+protected) from the first tier and the -+ * rest infer whether pages accessed multiple times through file descriptors -+ * are statistically hot and thus worth protecting. -+ * -+ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the -+ * number of categories of the active/inactive LRU when keeping track of -+ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in -+ * folio->flags. -+ */ -+#define MAX_NR_TIERS 4U -+ - #ifndef __GENERATING_BOUNDS_H - - struct lruvec; -@@ -361,6 +383,16 @@ enum { - LRU_GEN_FILE, - }; - -+#define MIN_LRU_BATCH BITS_PER_LONG -+#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) -+ -+/* whether to keep historical stats from evicted generations */ -+#ifdef CONFIG_LRU_GEN_STATS -+#define NR_HIST_GENS MAX_NR_GENS -+#else -+#define NR_HIST_GENS 1U -+#endif -+ - /* - * The youngest generation number is stored in max_seq for both anon and file - * types as they are aged on an equal footing. 
The oldest generation numbers are -@@ -383,6 +415,15 @@ struct lru_gen_struct { - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the multi-gen LRU sizes, eventually consistent */ - long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the exponential moving average of refaulted */ -+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the exponential moving average of evicted+protected */ -+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the first tier doesn't need protection, hence the minus one */ -+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; -+ /* can be modified without holding the LRU lock */ -+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - }; - - void lru_gen_init_lruvec(struct lruvec *lruvec); -diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h -index 240905407a18..7d79818dc065 100644 ---- a/include/linux/page-flags-layout.h -+++ b/include/linux/page-flags-layout.h -@@ -106,7 +106,10 @@ - #error "Not enough bits in page flags" - #endif - --#define LRU_REFS_WIDTH 0 -+/* see the comment on MAX_NR_TIERS */ -+#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ -+ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ -+ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) - - #endif - #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -diff --git a/kernel/bounds.c b/kernel/bounds.c -index 5ee60777d8e4..b529182e8b04 100644 ---- a/kernel/bounds.c -+++ b/kernel/bounds.c -@@ -24,8 +24,10 @@ int main(void) - DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); - #ifdef CONFIG_LRU_GEN - DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); -+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); - #else - DEFINE(LRU_GEN_WIDTH, 0); -+ DEFINE(__LRU_REFS_WIDTH, 0); - #endif - /* End of constants */ - -diff --git a/mm/Kconfig b/mm/Kconfig -index cee109f3128a..a93478acf341 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -1130,6 +1130,7 @@ config PTE_MARKER_UFFD_WP - purposes. It is required to enable userfaultfd write protection on - file-backed memory types like shmem and hugetlbfs. - -+# multi-gen LRU { - config LRU_GEN - bool "Multi-Gen LRU" - depends on MMU -@@ -1138,6 +1139,16 @@ config LRU_GEN - help - A high performance LRU implementation to overcommit memory. - -+config LRU_GEN_STATS -+ bool "Full stats for debugging" -+ depends on LRU_GEN -+ help -+ Do not enable this option unless you plan to look at historical stats -+ from evicted generations for debugging purpose. -+ -+ This option has a per-memcg and per-node memory overhead. 
-+# } -+ - source "mm/damon/Kconfig" - - endmenu -diff --git a/mm/swap.c b/mm/swap.c -index b062729b340f..67e7962fbacc 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -405,6 +405,40 @@ static void __lru_cache_activate_folio(struct folio *folio) - local_unlock(&lru_pvecs.lock); - } - -+#ifdef CONFIG_LRU_GEN -+static void folio_inc_refs(struct folio *folio) -+{ -+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags); -+ -+ if (folio_test_unevictable(folio)) -+ return; -+ -+ if (!folio_test_referenced(folio)) { -+ folio_set_referenced(folio); -+ return; -+ } -+ -+ if (!folio_test_workingset(folio)) { -+ folio_set_workingset(folio); -+ return; -+ } -+ -+ /* see the comment on MAX_NR_TIERS */ -+ do { -+ new_flags = old_flags & LRU_REFS_MASK; -+ if (new_flags == LRU_REFS_MASK) -+ break; -+ -+ new_flags += BIT(LRU_REFS_PGOFF); -+ new_flags |= old_flags & ~LRU_REFS_MASK; -+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); -+} -+#else -+static void folio_inc_refs(struct folio *folio) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - /* - * Mark a page as having seen activity. - * -@@ -417,6 +451,11 @@ static void __lru_cache_activate_folio(struct folio *folio) - */ - void folio_mark_accessed(struct folio *folio) - { -+ if (lru_gen_enabled()) { -+ folio_inc_refs(folio); -+ return; -+ } -+ - if (!folio_test_referenced(folio)) { - folio_set_referenced(folio); - } else if (folio_test_unevictable(folio)) { -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 1fcc0feed985..f768d61e7b85 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1273,9 +1273,11 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, - - if (folio_test_swapcache(folio)) { - swp_entry_t swap = folio_swap_entry(folio); -- mem_cgroup_swapout(folio, swap); -+ -+ /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ - if (reclaimed && !mapping_exiting(mapping)) - shadow = workingset_eviction(folio, target_memcg); -+ mem_cgroup_swapout(folio, swap); - __delete_from_swap_cache(&folio->page, swap, shadow); - xa_unlock_irq(&mapping->i_pages); - put_swap_page(&folio->page, swap); -@@ -2675,6 +2677,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) - unsigned long file; - struct lruvec *target_lruvec; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - /* -@@ -2998,6 +3003,17 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, - * shorthand helpers - ******************************************************************************/ - -+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) -+ -+#define DEFINE_MAX_SEQ(lruvec) \ -+ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) -+ -+#define DEFINE_MIN_SEQ(lruvec) \ -+ unsigned long min_seq[ANON_AND_FILE] = { \ -+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ -+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ -+ } -+ - #define for_each_gen_type_zone(gen, type, zone) \ - for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ - for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -@@ -3023,6 +3039,764 @@ static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int ni - return pgdat ? 
&pgdat->__lruvec : NULL; - } - -+static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ if (!can_demote(pgdat->node_id, sc) && -+ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) -+ return 0; -+ -+ return mem_cgroup_swappiness(memcg); -+} -+ -+static int get_nr_gens(struct lruvec *lruvec, int type) -+{ -+ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; -+} -+ -+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) -+{ -+ /* see the comment on lru_gen_struct */ -+ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && -+ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && -+ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; -+} -+ -+/****************************************************************************** -+ * refault feedback loop -+ ******************************************************************************/ -+ -+/* -+ * A feedback loop based on Proportional-Integral-Derivative (PID) controller. -+ * -+ * The P term is refaulted/(evicted+protected) from a tier in the generation -+ * currently being evicted; the I term is the exponential moving average of the -+ * P term over the generations previously evicted, using the smoothing factor -+ * 1/2; the D term isn't supported. -+ * -+ * The setpoint (SP) is always the first tier of one type; the process variable -+ * (PV) is either any tier of the other type or any other tier of the same -+ * type. -+ * -+ * The error is the difference between the SP and the PV; the correction is to -+ * turn off protection when SP>PV or turn on protection when SPlrugen; -+ int hist = lru_hist_from_seq(lrugen->min_seq[type]); -+ -+ pos->refaulted = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ pos->total = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ pos->total += lrugen->protected[hist][type][tier - 1]; -+ pos->gain = gain; -+} -+ -+static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) -+{ -+ int hist, tier; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; -+ unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; -+ -+ lockdep_assert_held(&lruvec->lru_lock); -+ -+ if (!carryover && !clear) -+ return; -+ -+ hist = lru_hist_from_seq(seq); -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ if (carryover) { -+ unsigned long sum; -+ -+ sum = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); -+ -+ sum = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ sum += lrugen->protected[hist][type][tier - 1]; -+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); -+ } -+ -+ if (clear) { -+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); -+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); -+ if (tier) -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); -+ } -+ } -+} -+ -+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) -+{ -+ /* -+ * Return true if the PV has a limited number of refaults or a lower -+ * refaulted/total than the SP. 
-+ */ -+ return pv->refaulted < MIN_LRU_BATCH || -+ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= -+ (sp->refaulted + 1) * pv->total * pv->gain; -+} -+ -+/****************************************************************************** -+ * the aging -+ ******************************************************************************/ -+ -+/* protect pages accessed multiple times through file descriptors */ -+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) -+{ -+ int type = folio_is_file_lru(folio); -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags); -+ -+ VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); -+ -+ do { -+ new_gen = (old_gen + 1) % MAX_NR_GENS; -+ -+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); -+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ /* for folio_end_writeback() */ -+ if (reclaiming) -+ new_flags |= BIT(PG_reclaim); -+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); -+ -+ lru_gen_update_size(lruvec, folio, old_gen, new_gen); -+ -+ return new_gen; -+} -+ -+static void inc_min_seq(struct lruvec *lruvec, int type) -+{ -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ reset_ctrl_pos(lruvec, type, true); -+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); -+} -+ -+static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) -+{ -+ int gen, type, zone; -+ bool success = false; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); -+ -+ /* find the oldest populated generation */ -+ for (type = !can_swap; type < ANON_AND_FILE; type++) { -+ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { -+ gen = lru_gen_from_seq(min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ if (!list_empty(&lrugen->lists[gen][type][zone])) -+ goto next; -+ } -+ -+ min_seq[type]++; -+ } -+next: -+ ; -+ } -+ -+ /* see the comment on lru_gen_struct */ -+ if (can_swap) { -+ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); -+ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); -+ } -+ -+ for (type = !can_swap; type < ANON_AND_FILE; type++) { -+ if (min_seq[type] == lrugen->min_seq[type]) -+ continue; -+ -+ reset_ctrl_pos(lruvec, type, true); -+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); -+ success = true; -+ } -+ -+ return success; -+} -+ -+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) -+{ -+ int prev, next; -+ int type, zone; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); -+ -+ if (max_seq != lrugen->max_seq) -+ goto unlock; -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) -+ continue; -+ -+ VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); -+ -+ inc_min_seq(lruvec, type); -+ } -+ -+ /* -+ * Update the active/inactive LRU sizes for compatibility. Both sides of -+ * the current max_seq need to be covered, since max_seq+1 can overlap -+ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do -+ * overlap, cold/hot inversion happens. 
-+ */ -+ prev = lru_gen_from_seq(lrugen->max_seq - 1); -+ next = lru_gen_from_seq(lrugen->max_seq + 1); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_INACTIVE_FILE; -+ long delta = lrugen->nr_pages[prev][type][zone] - -+ lrugen->nr_pages[next][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ __update_lru_size(lruvec, lru, zone, delta); -+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); -+ } -+ } -+ -+ for (type = 0; type < ANON_AND_FILE; type++) -+ reset_ctrl_pos(lruvec, type, false); -+ -+ /* make sure preceding modifications appear */ -+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -+unlock: -+ spin_unlock_irq(&lruvec->lru_lock); -+} -+ -+static unsigned long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq, -+ unsigned long *min_seq, bool can_swap, bool *need_aging) -+{ -+ int gen, type, zone; -+ unsigned long old = 0; -+ unsigned long young = 0; -+ unsigned long total = 0; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ for (type = !can_swap; type < ANON_AND_FILE; type++) { -+ unsigned long seq; -+ -+ for (seq = min_seq[type]; seq <= max_seq; seq++) { -+ unsigned long size = 0; -+ -+ gen = lru_gen_from_seq(seq); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); -+ -+ total += size; -+ if (seq == max_seq) -+ young += size; -+ if (seq + MIN_NR_GENS == max_seq) -+ old += size; -+ } -+ } -+ -+ /* -+ * The aging tries to be lazy to reduce the overhead. On the other hand, -+ * the eviction stalls when the number of generations reaches -+ * MIN_NR_GENS. So ideally, there should be MIN_NR_GENS+1 generations, -+ * hence the first two if's. -+ * -+ * Also it's ideal to spread pages out evenly, meaning 1/(MIN_NR_GENS+1) -+ * of the total number of pages for each generation. A reasonable range -+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The -+ * eviction cares about the lower bound of cold pages, whereas the aging -+ * cares about the upper bound of hot pages. -+ */ -+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) -+ *need_aging = true; -+ else if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) -+ *need_aging = false; -+ else if (young * MIN_NR_GENS > total) -+ *need_aging = true; -+ else if (old * (MIN_NR_GENS + 2) < total) -+ *need_aging = true; -+ else -+ *need_aging = false; -+ -+ return total; -+} -+ -+static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool need_aging; -+ unsigned long nr_to_scan; -+ int swappiness = get_swappiness(lruvec, sc); -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_WARN_ON_ONCE(sc->memcg_low_reclaim); -+ -+ mem_cgroup_calculate_protection(NULL, memcg); -+ -+ if (mem_cgroup_below_min(memcg)) -+ return; -+ -+ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging); -+ if (!nr_to_scan) -+ return; -+ -+ nr_to_scan >>= mem_cgroup_online(memcg) ? 
sc->priority : 0; -+ -+ if (nr_to_scan && need_aging) -+ inc_max_seq(lruvec, max_seq, swappiness); -+} -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ struct mem_cgroup *memcg; -+ -+ VM_WARN_ON_ONCE(!current_is_kswapd()); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ -+ age_lruvec(lruvec, sc); -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+} -+ -+/****************************************************************************** -+ * the eviction -+ ******************************************************************************/ -+ -+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) -+{ -+ bool success; -+ int gen = folio_lru_gen(folio); -+ int type = folio_is_file_lru(folio); -+ int zone = folio_zonenum(folio); -+ int delta = folio_nr_pages(folio); -+ int refs = folio_lru_refs(folio); -+ int tier = lru_tier_from_refs(refs); -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); -+ -+ /* unevictable */ -+ if (!folio_evictable(folio)) { -+ success = lru_gen_del_folio(lruvec, folio, true); -+ VM_WARN_ON_ONCE_FOLIO(!success, folio); -+ folio_set_unevictable(folio); -+ lruvec_add_folio(lruvec, folio); -+ __count_vm_events(UNEVICTABLE_PGCULLED, delta); -+ return true; -+ } -+ -+ /* dirty lazyfree */ -+ if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { -+ success = lru_gen_del_folio(lruvec, folio, true); -+ VM_WARN_ON_ONCE_FOLIO(!success, folio); -+ folio_set_swapbacked(folio); -+ lruvec_add_folio_tail(lruvec, folio); -+ return true; -+ } -+ -+ /* protected */ -+ if (tier > tier_idx) { -+ int hist = lru_hist_from_seq(lrugen->min_seq[type]); -+ -+ gen = folio_inc_gen(lruvec, folio, false); -+ list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); -+ -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], -+ lrugen->protected[hist][type][tier - 1] + delta); -+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); -+ return true; -+ } -+ -+ /* waiting for writeback */ -+ if (folio_test_locked(folio) || folio_test_writeback(folio) || -+ (type == LRU_GEN_FILE && folio_test_dirty(folio))) { -+ gen = folio_inc_gen(lruvec, folio, true); -+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]); -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) -+{ -+ bool success; -+ -+ /* unmapping inhibited */ -+ if (!sc->may_unmap && folio_mapped(folio)) -+ return false; -+ -+ /* swapping inhibited */ -+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ (folio_test_dirty(folio) || -+ (folio_test_anon(folio) && !folio_test_swapcache(folio)))) -+ return false; -+ -+ /* raced with release_pages() */ -+ if (!folio_try_get(folio)) -+ return false; -+ -+ /* raced with another isolation */ -+ if (!folio_test_clear_lru(folio)) { -+ folio_put(folio); -+ return false; -+ } -+ -+ /* see the comment on MAX_NR_TIERS */ -+ if (!folio_test_referenced(folio)) -+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); -+ -+ /* for shrink_page_list() */ -+ folio_clear_reclaim(folio); -+ folio_clear_referenced(folio); -+ -+ success = lru_gen_del_folio(lruvec, folio, true); -+ VM_WARN_ON_ONCE_FOLIO(!success, folio); -+ -+ return true; -+} -+ -+static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, -+ int type, int tier, struct 
list_head *list) -+{ -+ int gen, zone; -+ enum vm_event_item item; -+ int sorted = 0; -+ int scanned = 0; -+ int isolated = 0; -+ int remaining = MAX_LRU_BATCH; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ -+ VM_WARN_ON_ONCE(!list_empty(list)); -+ -+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) -+ return 0; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = sc->reclaim_idx; zone >= 0; zone--) { -+ LIST_HEAD(moved); -+ int skipped = 0; -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct folio *folio = lru_to_folio(head); -+ int delta = folio_nr_pages(folio); -+ -+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); -+ -+ scanned += delta; -+ -+ if (sort_folio(lruvec, folio, tier)) -+ sorted += delta; -+ else if (isolate_folio(lruvec, folio, sc)) { -+ list_add(&folio->lru, list); -+ isolated += delta; -+ } else { -+ list_move(&folio->lru, &moved); -+ skipped += delta; -+ } -+ -+ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) -+ break; -+ } -+ -+ if (skipped) { -+ list_splice(&moved, head); -+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); -+ } -+ -+ if (!remaining || isolated >= MIN_LRU_BATCH) -+ break; -+ } -+ -+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; -+ if (!cgroup_reclaim(sc)) { -+ __count_vm_events(item, isolated); -+ __count_vm_events(PGREFILL, sorted); -+ } -+ __count_memcg_events(memcg, item, isolated); -+ __count_memcg_events(memcg, PGREFILL, sorted); -+ __count_vm_events(PGSCAN_ANON + type, isolated); -+ -+ /* -+ * There might not be eligible pages due to reclaim_idx, may_unmap and -+ * may_writepage. Check the remaining to prevent livelock if it's not -+ * making progress. -+ */ -+ return isolated || !remaining ? scanned : 0; -+} -+ -+static int get_tier_idx(struct lruvec *lruvec, int type) -+{ -+ int tier; -+ struct ctrl_pos sp, pv; -+ -+ /* -+ * To leave a margin for fluctuations, use a larger gain factor (1:2). -+ * This value is chosen because any other tier would have at least twice -+ * as many refaults as the first tier. -+ */ -+ read_ctrl_pos(lruvec, type, 0, 1, &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, 2, &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ return tier - 1; -+} -+ -+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) -+{ -+ int type, tier; -+ struct ctrl_pos sp, pv; -+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; -+ -+ /* -+ * Compare the first tier of anon with that of file to determine which -+ * type to scan. Also need to compare other tiers of the selected type -+ * with the first tier of the other type to determine the last tier (of -+ * the selected type) to evict. 
-+ */ -+ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); -+ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); -+ type = positive_ctrl_err(&sp, &pv); -+ -+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ *tier_idx = tier - 1; -+ -+ return type; -+} -+ -+static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ int *type_scanned, struct list_head *list) -+{ -+ int i; -+ int type; -+ int scanned; -+ int tier = -1; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ /* -+ * Try to make the obvious choice first. When anon and file are both -+ * available from the same generation, interpret swappiness 1 as file -+ * first and 200 as anon first. -+ */ -+ if (!swappiness) -+ type = LRU_GEN_FILE; -+ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) -+ type = LRU_GEN_ANON; -+ else if (swappiness == 1) -+ type = LRU_GEN_FILE; -+ else if (swappiness == 200) -+ type = LRU_GEN_ANON; -+ else -+ type = get_type_to_scan(lruvec, swappiness, &tier); -+ -+ for (i = !swappiness; i < ANON_AND_FILE; i++) { -+ if (tier < 0) -+ tier = get_tier_idx(lruvec, type); -+ -+ scanned = scan_folios(lruvec, sc, type, tier, list); -+ if (scanned) -+ break; -+ -+ type = !type; -+ tier = -1; -+ } -+ -+ *type_scanned = type; -+ -+ return scanned; -+} -+ -+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ int type; -+ int scanned; -+ int reclaimed; -+ LIST_HEAD(list); -+ struct folio *folio; -+ enum vm_event_item item; -+ struct reclaim_stat stat; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); -+ -+ scanned += try_to_inc_min_seq(lruvec, swappiness); -+ -+ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) -+ scanned = 0; -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ if (list_empty(&list)) -+ return scanned; -+ -+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); -+ -+ list_for_each_entry(folio, &list, lru) { -+ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ -+ if (folio_test_workingset(folio)) -+ folio_set_referenced(folio); -+ -+ /* don't add rejected pages to the oldest generation */ -+ if (folio_test_reclaim(folio) && -+ (folio_test_dirty(folio) || folio_test_writeback(folio))) -+ folio_clear_active(folio); -+ else -+ folio_set_active(folio); -+ } -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ move_pages_to_lru(lruvec, &list); -+ -+ item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; -+ if (!cgroup_reclaim(sc)) -+ __count_vm_events(item, reclaimed); -+ __count_memcg_events(memcg, item, reclaimed); -+ __count_vm_events(PGSTEAL_ANON + type, reclaimed); -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ mem_cgroup_uncharge_list(&list); -+ free_unref_page_list(&list); -+ -+ sc->nr_reclaimed += reclaimed; -+ -+ return scanned; -+} -+ -+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, -+ bool can_swap, unsigned long reclaimed) -+{ -+ int priority; -+ bool need_aging; -+ unsigned long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg) || -+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) -+ return 0; -+ -+ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, &need_aging); -+ if (!nr_to_scan) -+ return 0; -+ -+ /* adjust priority if memcg is offline or the target is met */ -+ if (!mem_cgroup_online(memcg)) -+ priority = 0; -+ else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim) -+ priority = DEF_PRIORITY; -+ else -+ priority = sc->priority; -+ -+ nr_to_scan >>= priority; -+ if (!nr_to_scan) -+ return 0; -+ -+ if (!need_aging) -+ return nr_to_scan; -+ -+ /* skip the aging path at the default priority */ -+ if (priority == DEF_PRIORITY) -+ goto done; -+ -+ /* leave the work to lru_gen_age_node() */ -+ if (current_is_kswapd()) -+ return 0; -+ -+ inc_max_seq(lruvec, max_seq, can_swap); -+done: -+ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ unsigned long scanned = 0; -+ unsigned long reclaimed = sc->nr_reclaimed; -+ -+ lru_add_drain(); -+ -+ blk_start_plug(&plug); -+ -+ while (true) { -+ int delta; -+ int swappiness; -+ unsigned long nr_to_scan; -+ -+ if (sc->may_swap) -+ swappiness = get_swappiness(lruvec, sc); -+ else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) -+ swappiness = 1; -+ else -+ swappiness = 0; -+ -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed); -+ if (!nr_to_scan) -+ break; -+ -+ delta = evict_folios(lruvec, sc, swappiness); -+ if (!delta) -+ break; -+ -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+} -+ - /****************************************************************************** - * initialization - ******************************************************************************/ -@@ -3065,6 +3839,16 @@ static int __init init_lru_gen(void) - }; - late_initcall(init_lru_gen); - -+#else /* !CONFIG_LRU_GEN */ -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -3078,6 +3862,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - struct blk_plug plug; - bool scan_adjusted; - -+ if (lru_gen_enabled()) { -+ lru_gen_shrink_lruvec(lruvec, sc); -+ return; -+ } -+ - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ -@@ -3582,6 +4371,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) - struct lruvec *target_lruvec; - unsigned long refaults; - -+ if (lru_gen_enabled()) 
-+ return; -+ - target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; -@@ -3946,12 +4738,17 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, - } - #endif - --static void age_active_anon(struct pglist_data *pgdat, -+static void kswapd_age_node(struct pglist_data *pgdat, - struct scan_control *sc) - { - struct mem_cgroup *memcg; - struct lruvec *lruvec; - -+ if (lru_gen_enabled()) { -+ lru_gen_age_node(pgdat, sc); -+ return; -+ } -+ - if (!can_age_anon_pages(pgdat, sc)) - return; - -@@ -4271,12 +5068,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) - sc.may_swap = !nr_boost_reclaim; - - /* -- * Do some background aging of the anon list, to give -- * pages a chance to be referenced before reclaiming. All -- * pages are rotated regardless of classzone as this is -- * about consistent aging. -+ * Do some background aging, to give pages a chance to be -+ * referenced before reclaiming. All pages are rotated -+ * regardless of classzone as this is about consistent aging. - */ -- age_active_anon(pgdat, &sc); -+ kswapd_age_node(pgdat, &sc); - - /* - * If we're getting trouble reclaiming, start doing writepage -diff --git a/mm/workingset.c b/mm/workingset.c -index 592569a8974c..84a9e0ab04ad 100644 ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; - static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) - { -- eviction >>= bucket_order; - eviction &= EVICTION_MASK; - eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | pgdat->node_id; -@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - - *memcgidp = memcgid; - *pgdat = NODE_DATA(nid); -- *evictionp = entry << bucket_order; -+ *evictionp = entry; - *workingsetp = workingset; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static void *lru_gen_eviction(struct folio *folio) -+{ -+ int hist; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lru_gen_struct *lrugen; -+ int type = folio_is_file_lru(folio); -+ int delta = folio_nr_pages(folio); -+ int refs = folio_lru_refs(folio); -+ int tier = lru_tier_from_refs(refs); -+ struct mem_cgroup *memcg = folio_memcg(folio); -+ struct pglist_data *pgdat = folio_pgdat(folio); -+ -+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->lrugen; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); -+ -+ hist = lru_hist_from_seq(min_seq); -+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); -+ -+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); -+} -+ -+static void lru_gen_refault(struct folio *folio, void *shadow) -+{ -+ int hist, tier, refs; -+ int memcg_id; -+ bool workingset; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lru_gen_struct *lrugen; -+ struct mem_cgroup *memcg; -+ struct pglist_data *pgdat; -+ int type = folio_is_file_lru(folio); -+ int delta = folio_nr_pages(folio); -+ -+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); -+ -+ if (pgdat != folio_pgdat(folio)) -+ return; -+ -+ rcu_read_lock(); -+ -+ memcg = folio_memcg_rcu(folio); -+ if (memcg_id != mem_cgroup_id(memcg)) -+ goto unlock; -+ -+ 
lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->lrugen; -+ -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) -+ goto unlock; -+ -+ hist = lru_hist_from_seq(min_seq); -+ /* see the comment in folio_lru_refs() */ -+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; -+ tier = lru_tier_from_refs(refs); -+ -+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); -+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); -+ -+ /* -+ * Count the following two cases as stalls: -+ * 1. For pages accessed through page tables, hotter pages pushed out -+ * hot pages which refaulted immediately. -+ * 2. For pages accessed multiple times through file descriptors, -+ * numbers of accesses might have been out of the range. -+ */ -+ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { -+ folio_set_workingset(folio); -+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); -+ } -+unlock: -+ rcu_read_unlock(); -+} -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static void *lru_gen_eviction(struct folio *folio) -+{ -+ return NULL; -+} -+ -+static void lru_gen_refault(struct folio *folio, void *shadow) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - /** - * workingset_age_nonresident - age non-resident entries as LRU ages - * @lruvec: the lruvec that was aged -@@ -264,10 +360,14 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) - VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - -+ if (lru_gen_enabled()) -+ return lru_gen_eviction(folio); -+ - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->nonresident_age); -+ eviction >>= bucket_order; - workingset_age_nonresident(lruvec, folio_nr_pages(folio)); - return pack_shadow(memcgid, pgdat, eviction, - folio_test_workingset(folio)); -@@ -298,7 +398,13 @@ void workingset_refault(struct folio *folio, void *shadow) - int memcgid; - long nr; - -+ if (lru_gen_enabled()) { -+ lru_gen_refault(folio, shadow); -+ return; -+ } -+ - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); -+ eviction <<= bucket_order; - - rcu_read_lock(); - /* - -From patchwork Wed Jul 6 22:00:16 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908705 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 14A6EC433EF - for ; Wed, 6 Jul 2022 22:01:08 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id B6A108E0003; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id AA0BB8E0001; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 91B3F8E0003; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0015.hostedemail.com - [216.40.44.15]) - by kanga.kvack.org (Postfix) with ESMTP id 7CF7B8E0001 - for ; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) -Received: from smtpin10.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay01.hostedemail.com (Postfix) 
with ESMTP id 3A6B760CA7 - for ; Wed, 6 Jul 2022 22:01:04 +0000 (UTC) -X-FDA: 79658046048.10.E729B6A -Received: from mail-io1-f73.google.com (mail-io1-f73.google.com - [209.85.166.73]) - by imf12.hostedemail.com (Postfix) with ESMTP id 654DC40027 - for ; Wed, 6 Jul 2022 22:01:02 +0000 (UTC) -Received: by mail-io1-f73.google.com with SMTP id - k1-20020a5d8741000000b00678ad1103e7so2775981iol.21 - for ; Wed, 06 Jul 2022 15:01:02 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=XKxfVy7bUFxJEpcf3Ov7jSHxjMmYVb5gb0hRKtMPzZA=; - b=KBFBaieE6U899pZedfVW186dkzrS93jrjdIku8VfT3EELG4tmwSu4pbA8t8KgFrkX7 - PNLYIcjPRwCltpKZ41cpDa72lID2PMQjd0C5UzA1EP6Fozv39FS8efLoCNj0H5GROfCg - QMeGKWc2c6xuBh73e/hz1kG0ddQk8uDEqQzdd1hwg6GKOeAe0e98I4co7JiaxOzZQyVa - H3rcYT5ECNNWjJIqW6rJYkUeALUPQkQ6SiSCuxFVIHVt/LqtAYlBu4IfaEL80m1SvJmZ - XAzuOW4B/+BDlzSPXhxDXR3iWNFF0evXZaEn2Xyp6i9pgpXVhqsXUcbDrh/yv+aznbGp - vfZw== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=XKxfVy7bUFxJEpcf3Ov7jSHxjMmYVb5gb0hRKtMPzZA=; - b=wEDdfX1Ca1wRAghCjDtN/218Ar7YzHk1SswV9PWlO2rWdVGpEgzcJ4I72IScKmI1ak - byWww5GGZKkp2et7daVUaRaYUyNsN+JvzNTS4ZkA1+KUJp4sBdk6TL4F/+sKxhpfeXRW - 4O7rUvWmL/AcEhzhNNzOrk8NgMwLZJxEbmWumi3GZaaNwvSzvi/ZnQ4iy7QczcGNuoOD - vHTTjgYxcTl+FO55iQfNa8RXZ2EK68p8Q3s6TnE+enNd0MlFosYB8z+tz3T9tsSSAk7D - L5g2KMDag6shNFwrkU67N/AO/rmf0tvQvfgZZtmfM+fNoppSNpfEruaUg6sf+al7cZaq - Y9Jg== -X-Gm-Message-State: AJIora9J02U2PbT6ikgYVNH5lZ4bF/dPn/RAZ8KsVxnRgQbrFGcLQrwU - eoZuUGfIebQ11JrsGluVoFQjXZwzJak= -X-Google-Smtp-Source: - AGRyM1sRwdKObor4YUQFmQ9ta0XlmIxSt4ZUo1xITqK6sTS54jUHQ2ZrB8LtbtvMYYrpomN4w49bZKlSJRk= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a05:6e02:148c:b0:2dc:38ae:5c6a with SMTP id - n12-20020a056e02148c00b002dc38ae5c6amr2363805ilk.115.1657144861728; Wed, 06 - Jul 2022 15:01:01 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:16 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-8-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 07/14] mm: multi-gen LRU: exploit locality in rmap -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Barry Song , Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144862; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - 
content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=XKxfVy7bUFxJEpcf3Ov7jSHxjMmYVb5gb0hRKtMPzZA=; - b=78gzoYQUEpZ/3nhPL81S9IoTS+tamEn/8D7ioIFwlboSYOhcwIufnOyPh57lBQoFdANuof - SnLww4J7TveiCJa5kFHPwj8xzXM0ANKbJmf4o4cLIVitPhVH7z6V5EFfj457OWAKTjIo6b - NZ86RpRkjWLByNbszbZPPLUZQi27u4U= -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144862; a=rsa-sha256; - cv=none; - b=gbyvyPJZ1QVIBcx+YUE+JKq+Cj69MF+XU4E+AoEjDiVevGW0fLXZdcgIYKhIeTQ4VReTbP - TYy+UJJ7mp48jYOJ43EUlODLaxeez62GyJ6+OwE5GLOHlgIg1MIlrrlMmwrW3B3t4byGOx - 58gkmlSjFRcnFO6DMq3ACgJdURFm7Fo= -ARC-Authentication-Results: i=1; - imf12.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=KBFBaieE; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf12.hostedemail.com: domain of - 3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com - designates 209.85.166.73 as permitted sender) - smtp.mailfrom=3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com -X-Rspam-User: -Authentication-Results: imf12.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=KBFBaieE; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf12.hostedemail.com: domain of - 3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com - designates 209.85.166.73 as permitted sender) - smtp.mailfrom=3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com -X-Rspamd-Server: rspam06 -X-Rspamd-Queue-Id: 654DC40027 -X-Stat-Signature: upc9kmwzt4t6z55coguwsrw11efy9iir -X-HE-Tag: 1657144862-661235 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Searching the rmap for PTEs mapping each page on an LRU list (to test -and clear the accessed bit) can be expensive because pages from -different VMAs (PA space) are not cache friendly to the rmap (VA -space). For workloads mostly using mapped pages, searching the rmap -can incur the highest CPU cost in the reclaim path. - -This patch exploits spatial locality to reduce the trips into the -rmap. When shrink_page_list() walks the rmap and finds a young PTE, a -new function lru_gen_look_around() scans at most BITS_PER_LONG-1 -adjacent PTEs. On finding another young PTE, it clears the accessed -bit and updates the gen counter of the page mapped by this PTE to -(max_seq%MAX_NR_GENS)+1. 
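For scale, assuming 4 KiB base pages on a 64-bit kernel (neither is spelled out in this paragraph): a single rmap hit lets lru_gen_look_around() examine up to BITS_PER_LONG - 1 = 63 neighbouring PTEs, i.e. roughly 63 * 4 KiB, about 252 KiB of contiguous virtual address space, for the price of one page_vma_mapped_walk(). The "+1" in (max_seq%MAX_NR_GENS)+1 reflects the encoding used by folio_update_gen() further down in this patch: generation numbers are stored in the page flags offset by one, so that a value of zero can mean the folio is not on a multi-gen LRU list.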
- -Server benchmark results: - Single workload: - fio (buffered I/O): no change - - Single workload: - memcached (anon): +[3, 5]% - Ops/sec KB/sec - patch1-6: 1106168.46 43025.04 - patch1-7: 1147696.57 44640.29 - - Configurations: - no change - -Client benchmark results: - kswapd profiles: - patch1-6 - 39.03% lzo1x_1_do_compress (real work) - 18.47% page_vma_mapped_walk (overhead) - 6.74% _raw_spin_unlock_irq - 3.97% do_raw_spin_lock - 2.49% ptep_clear_flush - 2.48% anon_vma_interval_tree_iter_first - 1.92% folio_referenced_one - 1.88% __zram_bvec_write - 1.48% memmove - 1.31% vma_interval_tree_iter_next - - patch1-7 - 48.16% lzo1x_1_do_compress (real work) - 8.20% page_vma_mapped_walk (overhead) - 7.06% _raw_spin_unlock_irq - 2.92% ptep_clear_flush - 2.53% __zram_bvec_write - 2.11% do_raw_spin_lock - 2.02% memmove - 1.93% lru_gen_look_around - 1.56% free_unref_page_list - 1.40% memset - - Configurations: - no change - -Signed-off-by: Yu Zhao -Acked-by: Barry Song -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - include/linux/memcontrol.h | 31 +++++++ - include/linux/mm.h | 5 + - include/linux/mmzone.h | 6 ++ - mm/internal.h | 1 + - mm/memcontrol.c | 1 + - mm/rmap.c | 6 ++ - mm/swap.c | 4 +- - mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++ - 8 files changed, 236 insertions(+), 2 deletions(-) - -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 9ecead1042b9..9d0fea17f9ef 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) - * - LRU isolation - * - lock_page_memcg() - * - exclusive reference -+ * - mem_cgroup_trylock_pages() - * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. -@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) - * - LRU isolation - * - lock_page_memcg() - * - exclusive reference -+ * - mem_cgroup_trylock_pages() - * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. 
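A hedged sketch of how the new helper is meant to be paired with its unlock counterpart; this mirrors the caller added later in this patch (lru_gen_look_around()), with the batching details elided and the variable names used only as placeholders:

	/* Stabilize folio_memcg() for a batch of pages, then update them. */
	if (!mem_cgroup_trylock_pages(memcg))
		return;		/* a memcg move is in flight; give up or retry later */

	spin_lock_irq(&lruvec->lru_lock);
	/* ... walk the batch; skip any folio whose folio_memcg_rcu() != memcg ... */
	spin_unlock_irq(&lruvec->lru_lock);

	mem_cgroup_unlock_pages();	/* drops the RCU read lock taken by the trylock */
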
-@@ -950,6 +952,23 @@ void unlock_page_memcg(struct page *page); - - void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); - -+/* try to stablize folio_memcg() for all the pages in a memcg */ -+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -+{ -+ rcu_read_lock(); -+ -+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) -+ return true; -+ -+ rcu_read_unlock(); -+ return false; -+} -+ -+static inline void mem_cgroup_unlock_pages(void) -+{ -+ rcu_read_unlock(); -+} -+ - /* idx can be of type enum memcg_stat_item or node_stat_item */ - static inline void mod_memcg_state(struct mem_cgroup *memcg, - int idx, int val) -@@ -1401,6 +1420,18 @@ static inline void folio_memcg_unlock(struct folio *folio) - { - } - -+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -+{ -+ /* to match folio_memcg_rcu() */ -+ rcu_read_lock(); -+ return true; -+} -+ -+static inline void mem_cgroup_unlock_pages(void) -+{ -+ rcu_read_unlock(); -+} -+ - static inline void mem_cgroup_handle_over_high(void) - { - } -diff --git a/include/linux/mm.h b/include/linux/mm.h -index ed5393e5930d..981b2e447936 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -1523,6 +1523,11 @@ static inline unsigned long folio_pfn(struct folio *folio) - return page_to_pfn(&folio->page); - } - -+static inline struct folio *pfn_folio(unsigned long pfn) -+{ -+ return page_folio(pfn_to_page(pfn)); -+} -+ - static inline atomic_t *folio_pincount_ptr(struct folio *folio) - { - return &folio_page(folio, 1)->compound_pincount; -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 0d76222501ed..4fd7fc16eeb4 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -372,6 +372,7 @@ enum lruvec_flags { - #ifndef __GENERATING_BOUNDS_H - - struct lruvec; -+struct page_vma_mapped_walk; - - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) - #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -@@ -427,6 +428,7 @@ struct lru_gen_struct { - }; - - void lru_gen_init_lruvec(struct lruvec *lruvec); -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG - void lru_gen_init_memcg(struct mem_cgroup *memcg); -@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) - { - } - -+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+} -+ - #ifdef CONFIG_MEMCG - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { -diff --git a/mm/internal.h b/mm/internal.h -index c0f8fbe0445b..3d070582052e 100644 ---- a/mm/internal.h -+++ b/mm/internal.h -@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); - void folio_rotate_reclaimable(struct folio *folio); - bool __folio_end_writeback(struct folio *folio); - void deactivate_file_folio(struct folio *folio); -+void folio_activate(struct folio *folio); - - void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 7d58e8a73ece..743f8513f1c3 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -2777,6 +2777,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) - * - LRU isolation - * - lock_page_memcg() - * - exclusive reference -+ * - mem_cgroup_trylock_pages() - */ - folio->memcg_data = (unsigned long)memcg; - } -diff --git a/mm/rmap.c b/mm/rmap.c -index 5bcb334cd6f2..dce1a56b02f8 100644 ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -830,6 +830,12 @@ 
static bool folio_referenced_one(struct folio *folio, - } - - if (pvmw.pte) { -+ if (lru_gen_enabled() && pte_young(*pvmw.pte) && -+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { -+ lru_gen_look_around(&pvmw); -+ referenced++; -+ } -+ - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* -diff --git a/mm/swap.c b/mm/swap.c -index 67e7962fbacc..131fc76242a3 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -342,7 +342,7 @@ static bool need_activate_page_drain(int cpu) - return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; - } - --static void folio_activate(struct folio *folio) -+void folio_activate(struct folio *folio) - { - if (folio_test_lru(folio) && !folio_test_active(folio) && - !folio_test_unevictable(folio)) { -@@ -362,7 +362,7 @@ static inline void activate_page_drain(int cpu) - { - } - --static void folio_activate(struct folio *folio) -+void folio_activate(struct folio *folio) - { - struct lruvec *lruvec; - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index f768d61e7b85..ec786fc556a7 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1574,6 +1574,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, - if (!sc->may_unmap && folio_mapped(folio)) - goto keep_locked; - -+ /* folio_update_gen() tried to promote this page? */ -+ if (lru_gen_enabled() && !ignore_references && -+ folio_mapped(folio) && folio_test_referenced(folio)) -+ goto keep_locked; -+ - /* - * The number of dirty pages determines if a node is marked - * reclaim_congested. kswapd will stall and start writing -@@ -3161,6 +3166,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) - * the aging - ******************************************************************************/ - -+/* promote pages accessed through page tables */ -+static int folio_update_gen(struct folio *folio, int gen) -+{ -+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags); -+ -+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); -+ VM_WARN_ON_ONCE(!rcu_read_lock_held()); -+ -+ do { -+ /* lru_gen_del_folio() has isolated this page? */ -+ if (!(old_flags & LRU_GEN_MASK)) { -+ /* for shrink_page_list() */ -+ new_flags = old_flags | BIT(PG_referenced); -+ continue; -+ } -+ -+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); -+ -+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ - /* protect pages accessed multiple times through file descriptors */ - static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) - { -@@ -3172,6 +3200,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai - VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); - - do { -+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ /* folio_update_gen() has promoted this page? 
*/ -+ if (new_gen >= 0 && new_gen != old_gen) -+ return new_gen; -+ - new_gen = (old_gen + 1) % MAX_NR_GENS; - - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); -@@ -3186,6 +3219,43 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai - return new_gen; - } - -+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) -+{ -+ unsigned long pfn = pte_pfn(pte); -+ -+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); -+ -+ if (!pte_present(pte) || is_zero_pfn(pfn)) -+ return -1; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) -+ return -1; -+ -+ if (WARN_ON_ONCE(!pfn_valid(pfn))) -+ return -1; -+ -+ return pfn; -+} -+ -+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, -+ struct pglist_data *pgdat) -+{ -+ struct folio *folio; -+ -+ /* try to avoid unnecessary memory loads */ -+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) -+ return NULL; -+ -+ folio = pfn_folio(pfn); -+ if (folio_nid(folio) != pgdat->node_id) -+ return NULL; -+ -+ if (folio_memcg_rcu(folio) != memcg) -+ return NULL; -+ -+ return folio; -+} -+ - static void inc_min_seq(struct lruvec *lruvec, int type) - { - struct lru_gen_struct *lrugen = &lruvec->lrugen; -@@ -3387,6 +3457,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - } - -+/* -+ * This function exploits spatial locality when shrink_page_list() walks the -+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. -+ */ -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+ int i; -+ pte_t *pte; -+ unsigned long start; -+ unsigned long end; -+ unsigned long addr; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; -+ struct folio *folio = pfn_folio(pvmw->pfn); -+ struct mem_cgroup *memcg = folio_memcg(folio); -+ struct pglist_data *pgdat = folio_pgdat(folio); -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ DEFINE_MAX_SEQ(lruvec); -+ int old_gen, new_gen = lru_gen_from_seq(max_seq); -+ -+ lockdep_assert_held(pvmw->ptl); -+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); -+ -+ if (spin_is_contended(pvmw->ptl)) -+ return; -+ -+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; -+ -+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { -+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) -+ end = start + MIN_LRU_BATCH * PAGE_SIZE; -+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) -+ start = end - MIN_LRU_BATCH * PAGE_SIZE; -+ else { -+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; -+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; -+ } -+ } -+ -+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ -+ rcu_read_lock(); -+ arch_enter_lazy_mmu_mode(); -+ -+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ unsigned long pfn; -+ -+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr); -+ if (pfn == -1) -+ continue; -+ -+ if (!pte_young(pte[i])) -+ continue; -+ -+ folio = get_pfn_folio(pfn, memcg, pgdat); -+ if (!folio) -+ continue; -+ -+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) -+ continue; -+ -+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && -+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) && -+ !folio_test_swapcache(folio))) -+ folio_mark_dirty(folio); -+ -+ old_gen = folio_lru_gen(folio); -+ if (old_gen < 0) -+ 
folio_set_referenced(folio); -+ else if (old_gen != new_gen) -+ __set_bit(i, bitmap); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ rcu_read_unlock(); -+ -+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { -+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -+ folio = pfn_folio(pte_pfn(pte[i])); -+ folio_activate(folio); -+ } -+ return; -+ } -+ -+ /* folio_update_gen() requires stable folio_memcg() */ -+ if (!mem_cgroup_trylock_pages(memcg)) -+ return; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); -+ -+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { -+ folio = pfn_folio(pte_pfn(pte[i])); -+ if (folio_memcg_rcu(folio) != memcg) -+ continue; -+ -+ old_gen = folio_update_gen(folio, new_gen); -+ if (old_gen < 0 || old_gen == new_gen) -+ continue; -+ -+ lru_gen_update_size(lruvec, folio, old_gen, new_gen); -+ } -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ mem_cgroup_unlock_pages(); -+} -+ - /****************************************************************************** - * the eviction - ******************************************************************************/ -@@ -3423,6 +3601,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) - return true; - } - -+ /* promoted */ -+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]); -+ return true; -+ } -+ - /* protected */ - if (tier > tier_idx) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - -From patchwork Wed Jul 6 22:00:17 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908709 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 2F4F3C433EF - for ; Wed, 6 Jul 2022 22:01:21 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id B67CF8E0008; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id A53EA8E0001; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 7BC5B8E0008; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com - [216.40.44.11]) - by kanga.kvack.org (Postfix) with ESMTP id 613D28E0001 - for ; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) -Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay02.hostedemail.com (Postfix) with ESMTP id 3AABF33A6A - for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) -X-FDA: 79658046342.31.25FB448 -Received: from mail-yw1-f202.google.com (mail-yw1-f202.google.com - [209.85.128.202]) - by imf01.hostedemail.com (Postfix) with ESMTP id E1CB840019 - for ; Wed, 6 Jul 2022 22:01:04 +0000 (UTC) -Received: by mail-yw1-f202.google.com with SMTP id - 00721157ae682-31814f7654dso116292467b3.15 - for ; Wed, 06 Jul 2022 15:01:04 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=QRwOHdNvCJsdEWcZ8PiNBBmz8P6BxE21GfKaWd62Hcs=; - b=B0sFmQhPneIOAV1YVS6vP1oEsRe/BTwVflV6UdX0rzJbZE3r0MadNNURWxHDBukL1I - ELnHPWwed0WOAIrL8nftaw9ahABsSVQtJZPifYycg6l36RW7IRVZKE/FLzqQbao5lQVp - 
2lyTvVaA0fwTYrrOAkppMHFJS9NhtOwiPWkN8qczgGMF/wfTpEMLT1c3gwH7x0wTp0CA - bmGxHDwTUBtMQvnhn6ZHsn3tW2Mue+sW/jt4FZPTcsu1wgfJSmRfIgRB/FRZTem/MRn1 - s04RGx0yhTSGEtt8gc/smm1CW1G6xElKiEo1r8zVeztvFvFMntvooTqGlsQvsu1rVxNL - nxIA== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=QRwOHdNvCJsdEWcZ8PiNBBmz8P6BxE21GfKaWd62Hcs=; - b=c4bdMkLhxyvEnutnBA5XUiYftxDhTV6M0oPnBxTEWM2+ScjFG2RdUzrhfOZMxurWrf - sKZIm+7oIW+QIFcYwXv79hTW6tBrlW/YZfAQk5To3Rg3HYz9y6EONeJBeRq7VD3s7cDK - yCD5V7/rn1zPfpa7e5FCEQ3uaAAzJpmXH0yzMlJovObkLUf5/2H61vCu5Ss3s0nyUzu5 - PmBA7cbVlJg8w2iHFXSYVvkQw+nwkfZPYiZf6a7C4b5cBaaqSjFwp9R1Dj4Dmt/hyfqL - 9aSikv3Dqd00tRhmEqz7CFDN0nFe0RCoyF/1imT4h/wLfpY/PfqAblpzKLs1DVaRiIpK - 0dcg== -X-Gm-Message-State: AJIora/sVJDwRZqeywVvAnGpxiHdOH6QHZPRRxUF3BgVCxqUwhhWX/Sv - qRTmED85dcbhYCcgcB7NXYcxpzqLgic= -X-Google-Smtp-Source: - AGRyM1uf52B2LApundNg2J5h3sPxkMm3CEPkOlcVUbZUCRfPLLNdJaJY9XTnmessJ0elI3BxSTXeuJSaFn0= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a81:2f4c:0:b0:31c:2bee:dfa4 with SMTP id - v73-20020a812f4c000000b0031c2beedfa4mr47320138ywv.483.1657144863343; Wed, 06 - Jul 2022 15:01:03 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:17 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-9-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 08/14] mm: multi-gen LRU: support page table walks -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144870; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=QRwOHdNvCJsdEWcZ8PiNBBmz8P6BxE21GfKaWd62Hcs=; - b=yTdhrGd2Yn7SlvL67mHmk0coJxZY8xT17lW/ewc4fNMOsnnVS1sKnIvZPTnTcn0Fe+dccs - i2sOOxXkGXEhgV1hMozofaMLxhLPzFCWAEqHzOEcXyOK4AUM8ZYrXZOlIFqaID1et19+VY - 9DG+lIYPEo08J5Ku8PkMzTbLZN1d/1w= -ARC-Authentication-Results: i=1; - imf01.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=B0sFmQhP; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf01.hostedemail.com: domain of - 3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com - designates 209.85.128.202 as permitted sender) - smtp.mailfrom=3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com -ARC-Seal: 
i=1; s=arc-20220608; d=hostedemail.com; t=1657144870; a=rsa-sha256; - cv=none; - b=AqNzfeMgehbGAF0NaBoToCygtio3p/CKcEQ2XvVEAyt3GUO/NWzgGf3L1H/PYlFzCPzE7Z - USY6Zs44Owz7ybSkwmXNxexJwWitplxX8dRNKKzWXbZkJ3+tWRbprGyp/NLFp9NFcFhMkz - orvBVvz7eGVPFZ1+kb859dx9H/Ub2G4= -X-Rspam-User: -X-Rspamd-Server: rspam07 -Authentication-Results: imf01.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=B0sFmQhP; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf01.hostedemail.com: domain of - 3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com - designates 209.85.128.202 as permitted sender) - smtp.mailfrom=3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com -X-Stat-Signature: z89omp4mfbgn9jqrf7gixf63n1ypp6j5 -X-Rspamd-Queue-Id: E1CB840019 -X-HE-Tag: 1657144864-40541 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -To further exploit spatial locality, the aging prefers to walk page -tables to search for young PTEs and promote hot pages. A kill switch -will be added in the next patch to disable this behavior. When -disabled, the aging relies on the rmap only. - -NB: this behavior has nothing similar with the page table scanning in -the 2.4 kernel [1], which searches page tables for old PTEs, adds cold -pages to swapcache and unmaps them. - -To avoid confusion, the term "iteration" specifically means the -traversal of an entire mm_struct list; the term "walk" will be applied -to page tables and the rmap, as usual. - -An mm_struct list is maintained for each memcg, and an mm_struct -follows its owner task to the new memcg when this task is migrated. -Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls -walk_page_range() with each mm_struct on this list to promote hot -pages before it increments max_seq. - -When multiple page table walkers iterate the same list, each of them -gets a unique mm_struct; therefore they can run concurrently. Page -table walkers ignore any misplaced pages, e.g., if an mm_struct was -migrated, pages it left in the previous memcg will not be promoted -when its current memcg is under reclaim. Similarly, page table walkers -will not promote pages from nodes other than the one under reclaim. - -This patch uses the following optimizations when walking page tables: -1. It tracks the usage of mm_struct's between context switches so that - page table walkers can skip processes that have been sleeping since - the last iteration. -2. It uses generational Bloom filters to record populated branches so - that page table walkers can reduce their search space based on the - query results, e.g., to skip page tables containing mostly holes or - misplaced pages. -3. It takes advantage of the accessed bit in non-leaf PMD entries when - CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. -4. It does not zigzag between a PGD table and the same PMD table - spanning multiple VMAs. IOW, it finishes all the VMAs within the - range of the same PMD table before it returns to a PGD table. This - improves the cache performance for workloads that have large - numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5. 
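As a sanity check on the Bloom filters mentioned in item 2: the false-positive figures quoted in the code comment added further down (m = 1<<15 bits, k = 2 hash functions) follow from the standard estimate p ~ (1 - e^(-kn/m))^k. For n = 10,000 inserted items this gives (1 - e^(-20000/32768))^2 ~ 0.21, about 1/5; for n = 20,000 it gives (1 - e^(-40000/32768))^2 ~ 0.50, about 1/2.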
- -Server benchmark results: - Single workload: - fio (buffered I/O): no change - - Single workload: - memcached (anon): +[8, 10]% - Ops/sec KB/sec - patch1-7: 1147696.57 44640.29 - patch1-8: 1245274.91 48435.66 - - Configurations: - no change - -Client benchmark results: - kswapd profiles: - patch1-7 - 48.16% lzo1x_1_do_compress (real work) - 8.20% page_vma_mapped_walk (overhead) - 7.06% _raw_spin_unlock_irq - 2.92% ptep_clear_flush - 2.53% __zram_bvec_write - 2.11% do_raw_spin_lock - 2.02% memmove - 1.93% lru_gen_look_around - 1.56% free_unref_page_list - 1.40% memset - - patch1-8 - 49.44% lzo1x_1_do_compress (real work) - 6.19% page_vma_mapped_walk (overhead) - 5.97% _raw_spin_unlock_irq - 3.13% get_pfn_folio - 2.85% ptep_clear_flush - 2.42% __zram_bvec_write - 2.08% do_raw_spin_lock - 1.92% memmove - 1.44% alloc_zspage - 1.36% memset - - Configurations: - no change - -Thanks to the following developers for their efforts [3]. - kernel test robot - -[1] https://lwn.net/Articles/23732/ -[2] https://llvm.org/docs/ScudoHardenedAllocator.html -[3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/ - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - fs/exec.c | 2 + - include/linux/memcontrol.h | 5 + - include/linux/mm_types.h | 77 +++ - include/linux/mmzone.h | 56 +- - include/linux/swap.h | 4 + - kernel/exit.c | 1 + - kernel/fork.c | 9 + - kernel/sched/core.c | 1 + - mm/memcontrol.c | 25 + - mm/vmscan.c | 1000 +++++++++++++++++++++++++++++++++++- - 10 files changed, 1163 insertions(+), 17 deletions(-) - -diff --git a/fs/exec.c b/fs/exec.c -index 0989fb8472a1..b1fda634e01a 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -1015,6 +1015,7 @@ static int exec_mmap(struct mm_struct *mm) - active_mm = tsk->active_mm; - tsk->active_mm = mm; - tsk->mm = mm; -+ lru_gen_add_mm(mm); - /* - * This prevents preemption while active_mm is being loaded and - * it and mm are being updated, which could cause problems for -@@ -1030,6 +1031,7 @@ static int exec_mmap(struct mm_struct *mm) - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); - task_unlock(tsk); -+ lru_gen_use_mm(mm); - if (old_mm) { - mmap_read_unlock(old_mm); - BUG_ON(active_mm != old_mm); -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 9d0fea17f9ef..eca62345fdd5 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -350,6 +350,11 @@ struct mem_cgroup { - struct deferred_split deferred_split_queue; - #endif - -+#ifdef CONFIG_LRU_GEN -+ /* per-memcg mm_struct list */ -+ struct lru_gen_mm_list mm_list; -+#endif -+ - struct mem_cgroup_per_node *nodeinfo[]; - }; - -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index c29ab4c0cd5c..7db51151a28b 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -3,6 +3,7 @@ - #define _LINUX_MM_TYPES_H - - #include -+#include - - #include - #include -@@ -17,6 +18,7 @@ - #include - #include - #include -+#include - - #include - -@@ -667,6 +669,22 @@ struct mm_struct { - */ - unsigned long ksm_merging_pages; - #endif -+#ifdef CONFIG_LRU_GEN -+ struct { -+ /* this mm_struct is on lru_gen_mm_list */ -+ struct list_head list; -+ /* -+ * Set when switching to this mm_struct, as a hint of -+ * whether it has 
been used since the last time per-node -+ * page table walkers cleared the corresponding bits. -+ */ -+ unsigned long bitmap; -+#ifdef CONFIG_MEMCG -+ /* points to the memcg of "owner" above */ -+ struct mem_cgroup *memcg; -+#endif -+ } lru_gen; -+#endif /* CONFIG_LRU_GEN */ - } __randomize_layout; - - /* -@@ -693,6 +711,65 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) - return (struct cpumask *)&mm->cpu_bitmap; - } - -+#ifdef CONFIG_LRU_GEN -+ -+struct lru_gen_mm_list { -+ /* mm_struct list for page table walkers */ -+ struct list_head fifo; -+ /* protects the list above */ -+ spinlock_t lock; -+}; -+ -+void lru_gen_add_mm(struct mm_struct *mm); -+void lru_gen_del_mm(struct mm_struct *mm); -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm); -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+ INIT_LIST_HEAD(&mm->lru_gen.list); -+ mm->lru_gen.bitmap = 0; -+#ifdef CONFIG_MEMCG -+ mm->lru_gen.memcg = NULL; -+#endif -+} -+ -+static inline void lru_gen_use_mm(struct mm_struct *mm) -+{ -+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ -+ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); -+ -+ if (!(current->flags & PF_KTHREAD)) -+ WRITE_ONCE(mm->lru_gen.bitmap, -1); -+} -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_add_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_del_mm(struct mm_struct *mm) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+} -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_use_mm(struct mm_struct *mm) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct mmu_gather; - extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); - extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 4fd7fc16eeb4..0cf0856b484a 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -405,7 +405,7 @@ enum { - * min_seq behind. - * - * The number of pages in each generation is eventually consistent and therefore -- * can be transiently negative. -+ * can be transiently negative when reset_batch_size() is pending. 
- */ - struct lru_gen_struct { - /* the aging increments the youngest generation number */ -@@ -427,6 +427,53 @@ struct lru_gen_struct { - atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - }; - -+enum { -+ MM_LEAF_TOTAL, /* total leaf entries */ -+ MM_LEAF_OLD, /* old leaf entries */ -+ MM_LEAF_YOUNG, /* young leaf entries */ -+ MM_NONLEAF_TOTAL, /* total non-leaf entries */ -+ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ -+ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ -+ NR_MM_STATS -+}; -+ -+/* double-buffering Bloom filters */ -+#define NR_BLOOM_FILTERS 2 -+ -+struct lru_gen_mm_state { -+ /* set to max_seq after each iteration */ -+ unsigned long seq; -+ /* where the current iteration continues (inclusive) */ -+ struct list_head *head; -+ /* where the last iteration ended (exclusive) */ -+ struct list_head *tail; -+ /* to wait for the last page table walker to finish */ -+ struct wait_queue_head wait; -+ /* Bloom filters flip after each iteration */ -+ unsigned long *filters[NR_BLOOM_FILTERS]; -+ /* the mm stats for debugging */ -+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; -+ /* the number of concurrent page table walkers */ -+ int nr_walkers; -+}; -+ -+struct lru_gen_mm_walk { -+ /* the lruvec under reclaim */ -+ struct lruvec *lruvec; -+ /* unstable max_seq from lru_gen_struct */ -+ unsigned long max_seq; -+ /* the next address within an mm to scan */ -+ unsigned long next_addr; -+ /* to batch promoted pages */ -+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* to batch the mm stats */ -+ int mm_stats[NR_MM_STATS]; -+ /* total batched items */ -+ int batched; -+ bool can_swap; -+ bool force_scan; -+}; -+ - void lru_gen_init_lruvec(struct lruvec *lruvec); - void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - -@@ -477,6 +524,8 @@ struct lruvec { - #ifdef CONFIG_LRU_GEN - /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; -+ /* to concurrently iterate lru_gen_mm_list */ -+ struct lru_gen_mm_state mm_state; - #endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; -@@ -1070,6 +1119,11 @@ typedef struct pglist_data { - - unsigned long flags; - -+#ifdef CONFIG_LRU_GEN -+ /* kswap mm walk data */ -+ struct lru_gen_mm_walk mm_walk; -+#endif -+ - ZONE_PADDING(_pad2_) - - /* Per-node vmstats */ -diff --git a/include/linux/swap.h b/include/linux/swap.h -index 0c0fed1b348f..b66cbc7ea93c 100644 ---- a/include/linux/swap.h -+++ b/include/linux/swap.h -@@ -162,6 +162,10 @@ union swap_header { - */ - struct reclaim_state { - unsigned long reclaimed_slab; -+#ifdef CONFIG_LRU_GEN -+ /* per-thread mm walk data */ -+ struct lru_gen_mm_walk *mm_walk; -+#endif - }; - - #ifdef __KERNEL__ -diff --git a/kernel/exit.c b/kernel/exit.c -index f072959fcab7..f2d4d48ea790 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -466,6 +466,7 @@ void mm_update_next_owner(struct mm_struct *mm) - goto retry; - } - WRITE_ONCE(mm->owner, c); -+ lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); - } -diff --git a/kernel/fork.c b/kernel/fork.c -index 9d44f2d46c69..67b7666d7321 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -1152,6 +1152,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, - goto fail_nocontext; - - mm->user_ns = get_user_ns(user_ns); -+ lru_gen_init_mm(mm); - return mm; - - fail_nocontext: -@@ -1194,6 +1195,7 @@ static inline void __mmput(struct mm_struct *mm) - } - if (mm->binfmt) - module_put(mm->binfmt->module); -+ lru_gen_del_mm(mm); - 
mmdrop(mm); - } - -@@ -2676,6 +2678,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) - get_task_struct(p); - } - -+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { -+ /* lock the task to synchronize with memcg migration */ -+ task_lock(p); -+ lru_gen_add_mm(p->mm); -+ task_unlock(p); -+ } -+ - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index da0bf6fe9ecd..320d82697037 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -5130,6 +5130,7 @@ context_switch(struct rq *rq, struct task_struct *prev, - * finish_task_switch()'s mmdrop(). - */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ lru_gen_use_mm(next->mm); - - if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 743f8513f1c3..84f3707667bc 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -6133,6 +6133,30 @@ static void mem_cgroup_move_task(void) - } - #endif - -+#ifdef CONFIG_LRU_GEN -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+ struct task_struct *task; -+ struct cgroup_subsys_state *css; -+ -+ /* find the first leader if there is any */ -+ cgroup_taskset_for_each_leader(task, css, tset) -+ break; -+ -+ if (!task) -+ return; -+ -+ task_lock(task); -+ if (task->mm && task->mm->owner == task) -+ lru_gen_migrate_mm(task->mm); -+ task_unlock(task); -+} -+#else -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) - { - if (value == PAGE_COUNTER_MAX) -@@ -6536,6 +6560,7 @@ struct cgroup_subsys memory_cgrp_subsys = { - .css_reset = mem_cgroup_css_reset, - .css_rstat_flush = mem_cgroup_css_rstat_flush, - .can_attach = mem_cgroup_can_attach, -+ .attach = mem_cgroup_attach, - .cancel_attach = mem_cgroup_cancel_attach, - .post_attach = mem_cgroup_move_task, - .dfl_cftypes = memory_files, -diff --git a/mm/vmscan.c b/mm/vmscan.c -index ec786fc556a7..8e55a1ce1ae0 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -50,6 +50,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -3024,7 +3026,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, - for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ - for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) - --static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) -+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) - { - struct pglist_data *pgdat = NODE_DATA(nid); - -@@ -3069,6 +3071,372 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; - } - -+/****************************************************************************** -+ * mm_struct list -+ ******************************************************************************/ -+ -+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) -+{ -+ static struct lru_gen_mm_list mm_list = { -+ .fifo = LIST_HEAD_INIT(mm_list.fifo), -+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), -+ }; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ return &memcg->mm_list; -+#endif -+ VM_WARN_ON_ONCE(!mem_cgroup_disabled()); -+ -+ return &mm_list; -+} -+ -+void lru_gen_add_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ -+ 
VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); -+#ifdef CONFIG_MEMCG -+ VM_WARN_ON_ONCE(mm->lru_gen.memcg); -+ mm->lru_gen.memcg = memcg; -+#endif -+ spin_lock(&mm_list->lock); -+ -+ for_each_node_state(nid, N_MEMORY) { -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ if (!lruvec) -+ continue; -+ -+ /* the first addition since the last iteration */ -+ if (lruvec->mm_state.tail == &mm_list->fifo) -+ lruvec->mm_state.tail = &mm->lru_gen.list; -+ } -+ -+ list_add_tail(&mm->lru_gen.list, &mm_list->fifo); -+ -+ spin_unlock(&mm_list->lock); -+} -+ -+void lru_gen_del_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct lru_gen_mm_list *mm_list; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (list_empty(&mm->lru_gen.list)) -+ return; -+ -+#ifdef CONFIG_MEMCG -+ memcg = mm->lru_gen.memcg; -+#endif -+ mm_list = get_mm_list(memcg); -+ -+ spin_lock(&mm_list->lock); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ if (!lruvec) -+ continue; -+ -+ /* where the last iteration ended (exclusive) */ -+ if (lruvec->mm_state.tail == &mm->lru_gen.list) -+ lruvec->mm_state.tail = lruvec->mm_state.tail->next; -+ -+ /* where the current iteration continues (inclusive) */ -+ if (lruvec->mm_state.head != &mm->lru_gen.list) -+ continue; -+ -+ lruvec->mm_state.head = lruvec->mm_state.head->next; -+ /* the deletion ends the current iteration */ -+ if (lruvec->mm_state.head == &mm_list->fifo) -+ WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); -+ } -+ -+ list_del_init(&mm->lru_gen.list); -+ -+ spin_unlock(&mm_list->lock); -+ -+#ifdef CONFIG_MEMCG -+ mem_cgroup_put(mm->lru_gen.memcg); -+ mm->lru_gen.memcg = NULL; -+#endif -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+ struct mem_cgroup *memcg; -+ -+ lockdep_assert_held(&mm->owner->alloc_lock); -+ -+ /* for mm_update_next_owner() */ -+ if (mem_cgroup_disabled()) -+ return; -+ -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); -+ rcu_read_unlock(); -+ if (memcg == mm->lru_gen.memcg) -+ return; -+ -+ VM_WARN_ON_ONCE(!mm->lru_gen.memcg); -+ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); -+ -+ lru_gen_del_mm(mm); -+ lru_gen_add_mm(mm); -+} -+#endif -+ -+/* -+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when -+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of -+ * bits in a bitmap, k is the number of hash functions and n is the number of -+ * inserted items. -+ * -+ * Page table walkers use one of the two filters to reduce their search space. -+ * To get rid of non-leaf entries that no longer have enough leaf entries, the -+ * aging uses the double-buffering technique to flip to the other filter each -+ * time it produces a new generation. For non-leaf entries that have enough -+ * leaf entries, the aging carries them over to the next generation in -+ * walk_pmd_range(); the eviction also report them when walking the rmap -+ * in lru_gen_look_around(). -+ * -+ * For future optimizations: -+ * 1. It's not necessary to keep both filters all the time. The spare one can be -+ * freed after the RCU grace period and reallocated if needed again. -+ * 2. And when reallocating, it's worth scaling its size according to the number -+ * of inserted entries in the other filter, to reduce the memory overhead on -+ * small systems and false positives on large systems. -+ * 3. Jenkins' hash function is an alternative to Knuth's. 
-+ */ -+#define BLOOM_FILTER_SHIFT 15 -+ -+static inline int filter_gen_from_seq(unsigned long seq) -+{ -+ return seq % NR_BLOOM_FILTERS; -+} -+ -+static void get_item_key(void *item, int *key) -+{ -+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -+ -+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -+ -+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -+ key[1] = hash >> BLOOM_FILTER_SHIFT; -+} -+ -+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -+{ -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = lruvec->mm_state.filters[gen]; -+ if (filter) { -+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -+ return; -+ } -+ -+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), -+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -+} -+ -+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_state.filters[gen]); -+ if (!filter) -+ return; -+ -+ get_item_key(item, key); -+ -+ if (!test_bit(key[0], filter)) -+ set_bit(key[0], filter); -+ if (!test_bit(key[1], filter)) -+ set_bit(key[1], filter); -+} -+ -+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_state.filters[gen]); -+ if (!filter) -+ return true; -+ -+ get_item_key(item, key); -+ -+ return test_bit(key[0], filter) && test_bit(key[1], filter); -+} -+ -+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) -+{ -+ int i; -+ int hist; -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ if (walk) { -+ hist = lru_hist_from_seq(walk->max_seq); -+ -+ for (i = 0; i < NR_MM_STATS; i++) { -+ WRITE_ONCE(lruvec->mm_state.stats[hist][i], -+ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); -+ walk->mm_stats[i] = 0; -+ } -+ } -+ -+ if (NR_HIST_GENS > 1 && last) { -+ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); -+ -+ for (i = 0; i < NR_MM_STATS; i++) -+ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); -+ } -+} -+ -+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) -+{ -+ int type; -+ unsigned long size = 0; -+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); -+ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); -+ -+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) -+ return true; -+ -+ clear_bit(key, &mm->lru_gen.bitmap); -+ -+ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { -+ size += type ? get_mm_counter(mm, MM_FILEPAGES) : -+ get_mm_counter(mm, MM_ANONPAGES) + -+ get_mm_counter(mm, MM_SHMEMPAGES); -+ } -+ -+ if (size < MIN_LRU_BATCH) -+ return true; -+ -+ if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags)) -+ return true; -+ -+ return !mmget_not_zero(mm); -+} -+ -+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, -+ struct mm_struct **iter) -+{ -+ bool first = false; -+ bool last = true; -+ struct mm_struct *mm = NULL; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state; -+ -+ /* -+ * There are four interesting cases for this page table walker: -+ * 1. It tries to start a new iteration of mm_list with a stale max_seq; -+ * there is nothing left to do. 
-+ * 2. It's the first of the current generation, and it needs to reset -+ * the Bloom filter for the next generation. -+ * 3. It reaches the end of mm_list, and it needs to increment -+ * mm_state->seq; the iteration is done. -+ * 4. It's the last of the current generation, and it needs to reset the -+ * mm stats counters for the next generation. -+ */ -+ spin_lock(&mm_list->lock); -+ -+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); -+ VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); -+ VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); -+ -+ if (walk->max_seq <= mm_state->seq) { -+ if (!*iter) -+ last = false; -+ goto done; -+ } -+ -+ if (!mm_state->nr_walkers) { -+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); -+ -+ mm_state->head = mm_list->fifo.next; -+ first = true; -+ } -+ -+ while (!mm && mm_state->head != &mm_list->fifo) { -+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); -+ -+ mm_state->head = mm_state->head->next; -+ -+ /* force scan for those added after the last iteration */ -+ if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { -+ mm_state->tail = mm_state->head; -+ walk->force_scan = true; -+ } -+ -+ if (should_skip_mm(mm, walk)) -+ mm = NULL; -+ } -+ -+ if (mm_state->head == &mm_list->fifo) -+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); -+done: -+ if (*iter && !mm) -+ mm_state->nr_walkers--; -+ if (!*iter && mm) -+ mm_state->nr_walkers++; -+ -+ if (mm_state->nr_walkers) -+ last = false; -+ -+ if (*iter || last) -+ reset_mm_stats(lruvec, walk, last); -+ -+ spin_unlock(&mm_list->lock); -+ -+ if (mm && first) -+ reset_bloom_filter(lruvec, walk->max_seq + 1); -+ -+ if (*iter) -+ mmput_async(*iter); -+ -+ *iter = mm; -+ -+ return last; -+} -+ -+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) -+{ -+ bool success = false; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state; -+ -+ spin_lock(&mm_list->lock); -+ -+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); -+ -+ if (max_seq > mm_state->seq && !mm_state->nr_walkers) { -+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); -+ -+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); -+ reset_mm_stats(lruvec, NULL, true); -+ success = true; -+ } -+ -+ spin_unlock(&mm_list->lock); -+ -+ return success; -+} -+ - /****************************************************************************** - * refault feedback loop - ******************************************************************************/ -@@ -3219,6 +3587,118 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai - return new_gen; - } - -+static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, -+ int old_gen, int new_gen) -+{ -+ int type = folio_is_file_lru(folio); -+ int zone = folio_zonenum(folio); -+ int delta = folio_nr_pages(folio); -+ -+ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); -+ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); -+ -+ walk->batched++; -+ -+ walk->nr_pages[old_gen][type][zone] -= delta; -+ walk->nr_pages[new_gen][type][zone] += delta; -+} -+ -+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) -+{ -+ int gen, type, zone; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ walk->batched = 0; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ enum lru_list lru = type * LRU_INACTIVE_FILE; -+ int delta = walk->nr_pages[gen][type][zone]; -+ -+ if (!delta) -+ 
continue; -+ -+ walk->nr_pages[gen][type][zone] = 0; -+ WRITE_ONCE(lrugen->nr_pages[gen][type][zone], -+ lrugen->nr_pages[gen][type][zone] + delta); -+ -+ if (lru_gen_is_active(lruvec, gen)) -+ lru += LRU_ACTIVE; -+ __update_lru_size(lruvec, lru, zone, delta); -+ } -+} -+ -+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) -+{ -+ struct address_space *mapping; -+ struct vm_area_struct *vma = args->vma; -+ struct lru_gen_mm_walk *walk = args->private; -+ -+ if (!vma_is_accessible(vma)) -+ return true; -+ -+ if (is_vm_hugetlb_page(vma)) -+ return true; -+ -+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) -+ return true; -+ -+ if (vma == get_gate_vma(vma->vm_mm)) -+ return true; -+ -+ if (vma_is_anonymous(vma)) -+ return !walk->can_swap; -+ -+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) -+ return true; -+ -+ mapping = vma->vm_file->f_mapping; -+ if (mapping_unevictable(mapping)) -+ return true; -+ -+ if (shmem_mapping(mapping)) -+ return !walk->can_swap; -+ -+ /* to exclude special mappings like dax, etc. */ -+ return !mapping->a_ops->read_folio; -+} -+ -+/* -+ * Some userspace memory allocators map many single-page VMAs. Instead of -+ * returning back to the PGD table for each of such VMAs, finish an entire PMD -+ * table to reduce zigzags and improve cache performance. -+ */ -+static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, -+ unsigned long *vm_start, unsigned long *vm_end) -+{ -+ unsigned long start = round_up(*vm_end, size); -+ unsigned long end = (start | ~mask) + 1; -+ -+ VM_WARN_ON_ONCE(mask & size); -+ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); -+ -+ while (args->vma) { -+ if (start >= args->vma->vm_end) { -+ args->vma = args->vma->vm_next; -+ continue; -+ } -+ -+ if (end && end <= args->vma->vm_start) -+ return false; -+ -+ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { -+ args->vma = args->vma->vm_next; -+ continue; -+ } -+ -+ *vm_start = max(start, args->vma->vm_start); -+ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; -+ -+ return true; -+ } -+ -+ return false; -+} -+ - static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) - { - unsigned long pfn = pte_pfn(pte); -@@ -3237,8 +3717,28 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned - return pfn; - } - -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) -+{ -+ unsigned long pfn = pmd_pfn(pmd); -+ -+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); -+ -+ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) -+ return -1; -+ -+ if (WARN_ON_ONCE(pmd_devmap(pmd))) -+ return -1; -+ -+ if (WARN_ON_ONCE(!pfn_valid(pfn))) -+ return -1; -+ -+ return pfn; -+} -+#endif -+ - static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, -- struct pglist_data *pgdat) -+ struct pglist_data *pgdat, bool can_swap) - { - struct folio *folio; - -@@ -3253,9 +3753,371 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - if (folio_memcg_rcu(folio) != memcg) - return NULL; - -+ /* file VMAs can contain anon pages from COW */ -+ if (!folio_is_file_lru(folio) && !can_swap) -+ return NULL; -+ - return folio; - } - -+static bool suitable_to_scan(int total, int young) -+{ -+ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); -+ -+ /* 
suitable if the average number of young PTEs per cacheline is >=1 */ -+ return young * n >= total; -+} -+ -+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, -+ struct mm_walk *args) -+{ -+ int i; -+ pte_t *pte; -+ spinlock_t *ptl; -+ unsigned long addr; -+ int total = 0; -+ int young = 0; -+ struct lru_gen_mm_walk *walk = args->private; -+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); -+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); -+ -+ VM_WARN_ON_ONCE(pmd_leaf(*pmd)); -+ -+ ptl = pte_lockptr(args->mm, pmd); -+ if (!spin_trylock(ptl)) -+ return false; -+ -+ arch_enter_lazy_mmu_mode(); -+ -+ pte = pte_offset_map(pmd, start & PMD_MASK); -+restart: -+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ unsigned long pfn; -+ struct folio *folio; -+ -+ total++; -+ walk->mm_stats[MM_LEAF_TOTAL]++; -+ -+ pfn = get_pte_pfn(pte[i], args->vma, addr); -+ if (pfn == -1) -+ continue; -+ -+ if (!pte_young(pte[i])) { -+ walk->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); -+ if (!folio) -+ continue; -+ -+ if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) -+ continue; -+ -+ young++; -+ walk->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && -+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) && -+ !folio_test_swapcache(folio))) -+ folio_mark_dirty(folio); -+ -+ old_gen = folio_update_gen(folio, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(walk, folio, old_gen, new_gen); -+ } -+ -+ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) -+ goto restart; -+ -+ pte_unmap(pte); -+ -+ arch_leave_lazy_mmu_mode(); -+ spin_unlock(ptl); -+ -+ return suitable_to_scan(total, young); -+} -+ -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start) -+{ -+ int i; -+ pmd_t *pmd; -+ spinlock_t *ptl; -+ struct lru_gen_mm_walk *walk = args->private; -+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); -+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); -+ -+ VM_WARN_ON_ONCE(pud_leaf(*pud)); -+ -+ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ -+ if (*start == -1) { -+ *start = next; -+ return; -+ } -+ -+ i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); -+ if (i && i <= MIN_LRU_BATCH) { -+ __set_bit(i - 1, bitmap); -+ return; -+ } -+ -+ pmd = pmd_offset(pud, *start); -+ -+ ptl = pmd_lockptr(args->mm, pmd); -+ if (!spin_trylock(ptl)) -+ goto done; -+ -+ arch_enter_lazy_mmu_mode(); -+ -+ do { -+ unsigned long pfn; -+ struct folio *folio; -+ unsigned long addr = i ? 
(*start & PMD_MASK) + i * PMD_SIZE : *start; -+ -+ pfn = get_pmd_pfn(pmd[i], vma, addr); -+ if (pfn == -1) -+ goto next; -+ -+ if (!pmd_trans_huge(pmd[i])) { -+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) -+ pmdp_test_and_clear_young(vma, addr, pmd + i); -+ goto next; -+ } -+ -+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); -+ if (!folio) -+ goto next; -+ -+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) -+ goto next; -+ -+ walk->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && -+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) && -+ !folio_test_swapcache(folio))) -+ folio_mark_dirty(folio); -+ -+ old_gen = folio_update_gen(folio, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(walk, folio, old_gen, new_gen); -+next: -+ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; -+ } while (i <= MIN_LRU_BATCH); -+ -+ arch_leave_lazy_mmu_mode(); -+ spin_unlock(ptl); -+done: -+ *start = -1; -+ bitmap_zero(bitmap, MIN_LRU_BATCH); -+} -+#else -+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, -+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start) -+{ -+} -+#endif -+ -+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, -+ struct mm_walk *args) -+{ -+ int i; -+ pmd_t *pmd; -+ unsigned long next; -+ unsigned long addr; -+ struct vm_area_struct *vma; -+ unsigned long pos = -1; -+ struct lru_gen_mm_walk *walk = args->private; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; -+ -+ VM_WARN_ON_ONCE(pud_leaf(*pud)); -+ -+ /* -+ * Finish an entire PMD in two passes: the first only reaches to PTE -+ * tables to avoid taking the PMD lock; the second, if necessary, takes -+ * the PMD lock to clear the accessed bit in PMD entries. 
-+ */ -+ pmd = pmd_offset(pud, start & PUD_MASK); -+restart: -+ /* walk_pte_range() may call get_next_vma() */ -+ vma = args->vma; -+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { -+ pmd_t val = pmd_read_atomic(pmd + i); -+ -+ /* for pmd_read_atomic() */ -+ barrier(); -+ -+ next = pmd_addr_end(addr, end); -+ -+ if (!pmd_present(val) || is_huge_zero_pmd(val)) { -+ walk->mm_stats[MM_LEAF_TOTAL]++; -+ continue; -+ } -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ if (pmd_trans_huge(val)) { -+ unsigned long pfn = pmd_pfn(val); -+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); -+ -+ walk->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (!pmd_young(val)) { -+ walk->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ /* try to avoid unnecessary memory loads */ -+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) -+ continue; -+ -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ continue; -+ } -+#endif -+ walk->mm_stats[MM_NONLEAF_TOTAL]++; -+ -+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG -+ if (!pmd_young(val)) -+ continue; -+ -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+#endif -+ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) -+ continue; -+ -+ walk->mm_stats[MM_NONLEAF_FOUND]++; -+ -+ if (!walk_pte_range(&val, addr, next, args)) -+ continue; -+ -+ walk->mm_stats[MM_NONLEAF_ADDED]++; -+ -+ /* carry over to the next generation */ -+ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); -+ } -+ -+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); -+ -+ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) -+ goto restart; -+} -+ -+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, -+ struct mm_walk *args) -+{ -+ int i; -+ pud_t *pud; -+ unsigned long addr; -+ unsigned long next; -+ struct lru_gen_mm_walk *walk = args->private; -+ -+ VM_WARN_ON_ONCE(p4d_leaf(*p4d)); -+ -+ pud = pud_offset(p4d, start & P4D_MASK); -+restart: -+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { -+ pud_t val = READ_ONCE(pud[i]); -+ -+ next = pud_addr_end(addr, end); -+ -+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) -+ continue; -+ -+ walk_pmd_range(&val, addr, next, args); -+ -+ if (walk->batched >= MAX_LRU_BATCH) { -+ end = (addr | ~PUD_MASK) + 1; -+ goto done; -+ } -+ } -+ -+ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) -+ goto restart; -+ -+ end = round_up(end, P4D_SIZE); -+done: -+ if (!end || !args->vma) -+ return 1; -+ -+ walk->next_addr = max(end, args->vma->vm_start); -+ -+ return -EAGAIN; -+} -+ -+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) -+{ -+ static const struct mm_walk_ops mm_walk_ops = { -+ .test_walk = should_skip_vma, -+ .p4d_entry = walk_pud_range, -+ }; -+ -+ int err; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ -+ walk->next_addr = FIRST_USER_ADDRESS; -+ -+ do { -+ err = -EBUSY; -+ -+ /* folio_update_gen() requires stable folio_memcg() */ -+ if (!mem_cgroup_trylock_pages(memcg)) -+ break; -+ -+ /* the caller might be holding the lock for write */ -+ if (mmap_read_trylock(mm)) { -+ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); -+ -+ mmap_read_unlock(mm); -+ } -+ -+ mem_cgroup_unlock_pages(); -+ -+ if (walk->batched) { -+ spin_lock_irq(&lruvec->lru_lock); -+ reset_batch_size(lruvec, walk); -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+ -+ cond_resched(); -+ } while (err == -EAGAIN); 
-+} -+ -+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) -+{ -+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; -+ -+ if (pgdat && current_is_kswapd()) { -+ VM_WARN_ON_ONCE(walk); -+ -+ walk = &pgdat->mm_walk; -+ } else if (!pgdat && !walk) { -+ VM_WARN_ON_ONCE(current_is_kswapd()); -+ -+ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); -+ } -+ -+ current->reclaim_state->mm_walk = walk; -+ -+ return walk; -+} -+ -+static void clear_mm_walk(void) -+{ -+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; -+ -+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); -+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); -+ -+ current->reclaim_state->mm_walk = NULL; -+ -+ if (!current_is_kswapd()) -+ kfree(walk); -+} -+ - static void inc_min_seq(struct lruvec *lruvec, int type) - { - struct lru_gen_struct *lrugen = &lruvec->lrugen; -@@ -3307,7 +4169,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - return success; - } - --static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) -+static void inc_max_seq(struct lruvec *lruvec, bool can_swap) - { - int prev, next; - int type, zone; -@@ -3317,9 +4179,6 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s - - VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); - -- if (max_seq != lrugen->max_seq) -- goto unlock; -- - for (type = 0; type < ANON_AND_FILE; type++) { - if (get_nr_gens(lruvec, type) != MAX_NR_GENS) - continue; -@@ -3357,10 +4216,76 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s - - /* make sure preceding modifications appear */ - smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); --unlock: -+ - spin_unlock_irq(&lruvec->lru_lock); - } - -+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, -+ struct scan_control *sc, bool can_swap) -+{ -+ bool success; -+ struct lru_gen_mm_walk *walk; -+ struct mm_struct *mm = NULL; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); -+ -+ /* see the comment in iterate_mm_list() */ -+ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { -+ success = false; -+ goto done; -+ } -+ -+ /* -+ * If the hardware doesn't automatically set the accessed bit, fallback -+ * to lru_gen_look_around(), which only clears the accessed bit in a -+ * handful of PTEs. Spreading the work out over a period of time usually -+ * is less efficient, but it avoids bursty page faults. 
-+ */ -+ if (!arch_has_hw_pte_young()) { -+ success = iterate_mm_list_nowalk(lruvec, max_seq); -+ goto done; -+ } -+ -+ walk = set_mm_walk(NULL); -+ if (!walk) { -+ success = iterate_mm_list_nowalk(lruvec, max_seq); -+ goto done; -+ } -+ -+ walk->lruvec = lruvec; -+ walk->max_seq = max_seq; -+ walk->can_swap = can_swap; -+ walk->force_scan = false; -+ -+ do { -+ success = iterate_mm_list(lruvec, walk, &mm); -+ if (mm) -+ walk_mm(lruvec, mm, walk); -+ -+ cond_resched(); -+ } while (mm); -+done: -+ if (!success) { -+ if (!current_is_kswapd() && !sc->priority) -+ wait_event_killable(lruvec->mm_state.wait, -+ max_seq < READ_ONCE(lrugen->max_seq)); -+ -+ return max_seq < READ_ONCE(lrugen->max_seq); -+ } -+ -+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); -+ -+ inc_max_seq(lruvec, can_swap); -+ /* either this sees any waiters or they will see updated max_seq */ -+ if (wq_has_sleeper(&lruvec->mm_state.wait)) -+ wake_up_all(&lruvec->mm_state.wait); -+ -+ wakeup_flusher_threads(WB_REASON_VMSCAN); -+ -+ return true; -+} -+ - static unsigned long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq, - unsigned long *min_seq, bool can_swap, bool *need_aging) - { -@@ -3438,7 +4363,7 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) - nr_to_scan >>= mem_cgroup_online(memcg) ? sc->priority : 0; - - if (nr_to_scan && need_aging) -- inc_max_seq(lruvec, max_seq, swappiness); -+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); - } - - static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -@@ -3447,6 +4372,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - - VM_WARN_ON_ONCE(!current_is_kswapd()); - -+ set_mm_walk(pgdat); -+ - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -@@ -3455,11 +4382,16 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - - cond_resched(); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ clear_mm_walk(); - } - - /* - * This function exploits spatial locality when shrink_page_list() walks the -- * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. -+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If -+ * the scan was done cacheline efficiently, it adds the PMD entry pointing to -+ * the PTE table to the Bloom filter. This forms a feedback loop between the -+ * eviction and the aging. - */ - void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - { -@@ -3468,6 +4400,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - unsigned long start; - unsigned long end; - unsigned long addr; -+ struct lru_gen_mm_walk *walk; -+ int young = 0; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; - struct folio *folio = pfn_folio(pvmw->pfn); - struct mem_cgroup *memcg = folio_memcg(folio); -@@ -3497,6 +4431,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - } - - pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ walk = current->reclaim_state ? 
current->reclaim_state->mm_walk : NULL; - - rcu_read_lock(); - arch_enter_lazy_mmu_mode(); -@@ -3511,13 +4446,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - if (!pte_young(pte[i])) - continue; - -- folio = get_pfn_folio(pfn, memcg, pgdat); -+ folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); - if (!folio) - continue; - - if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) - continue; - -+ young++; -+ - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) -@@ -3533,7 +4470,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - arch_leave_lazy_mmu_mode(); - rcu_read_unlock(); - -- if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { -+ /* feedback from rmap walkers to page table walkers */ -+ if (suitable_to_scan(i, young)) -+ update_bloom_filter(lruvec, max_seq, pvmw->pmd); -+ -+ if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - folio_activate(folio); -@@ -3545,8 +4486,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - if (!mem_cgroup_trylock_pages(memcg)) - return; - -- spin_lock_irq(&lruvec->lru_lock); -- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); -+ if (!walk) { -+ spin_lock_irq(&lruvec->lru_lock); -+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); -+ } - - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); -@@ -3557,10 +4500,14 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) - if (old_gen < 0 || old_gen == new_gen) - continue; - -- lru_gen_update_size(lruvec, folio, old_gen, new_gen); -+ if (walk) -+ update_batch_size(walk, folio, old_gen, new_gen); -+ else -+ lru_gen_update_size(lruvec, folio, old_gen, new_gen); - } - -- spin_unlock_irq(&lruvec->lru_lock); -+ if (!walk) -+ spin_unlock_irq(&lruvec->lru_lock); - - mem_cgroup_unlock_pages(); - } -@@ -3843,6 +4790,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap - struct folio *folio; - enum vm_event_item item; - struct reclaim_stat stat; -+ struct lru_gen_mm_walk *walk; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - -@@ -3879,6 +4827,10 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap - - move_pages_to_lru(lruvec, &list); - -+ walk = current->reclaim_state->mm_walk; -+ if (walk && walk->batched) -+ reset_batch_size(lruvec, walk); -+ - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; - if (!cgroup_reclaim(sc)) - __count_vm_events(item, reclaimed); -@@ -3936,7 +4888,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * - if (current_is_kswapd()) - return 0; - -- inc_max_seq(lruvec, max_seq, can_swap); -+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) -+ return nr_to_scan; - done: - return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; - } -@@ -3951,6 +4904,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - - blk_start_plug(&plug); - -+ set_mm_walk(lruvec_pgdat(lruvec)); -+ - while (true) { - int delta; - int swappiness; -@@ -3978,6 +4933,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - cond_resched(); - } - -+ clear_mm_walk(); -+ - blk_finish_plug(&plug); - } - -@@ -3994,15 +4951,21 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) - - for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+ -+ lruvec->mm_state.seq = MIN_NR_GENS; -+ init_waitqueue_head(&lruvec->mm_state.wait); - } - - #ifdef CONFIG_MEMCG - void lru_gen_init_memcg(struct mem_cgroup *memcg) - { -+ INIT_LIST_HEAD(&memcg->mm_list.fifo); -+ spin_lock_init(&memcg->mm_list.lock); - } - - void lru_gen_exit_memcg(struct mem_cgroup *memcg) - { -+ int i; - int nid; - - for_each_node(nid) { -@@ -4010,6 +4973,11 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) - - VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, - sizeof(lruvec->lrugen.nr_pages))); -+ -+ for (i = 0; i < NR_BLOOM_FILTERS; i++) { -+ bitmap_free(lruvec->mm_state.filters[i]); -+ lruvec->mm_state.filters[i] = NULL; -+ } - } - } - #endif - -From patchwork Wed Jul 6 22:00:18 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908706 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 4EB4BCCA480 - for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id BDBC98E0005; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id B8B188E0001; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id A05548E0005; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com - [216.40.44.11]) - by kanga.kvack.org (Postfix) with ESMTP id 8B02E8E0001 - for ; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) -Received: from smtpin07.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay08.hostedemail.com (Postfix) with ESMTP id 5322D218C7 - for ; Wed, 6 Jul 2022 22:01:06 +0000 (UTC) -X-FDA: 79658046132.07.9DE38CF -Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com - [209.85.128.201]) - by imf28.hostedemail.com (Postfix) with ESMTP id 97908C0054 - for ; Wed, 6 Jul 2022 22:01:05 +0000 (UTC) -Received: by mail-yw1-f201.google.com with SMTP id - 00721157ae682-31c8c7138ebso70710887b3.17 - for ; Wed, 06 Jul 2022 15:01:05 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=ZS7i+zWbsuFYZiRlenI/F/Y7PzZj3Cv3ABmuogIV+d0=; - b=Is9nnwDLdoF8cmdhQhl8FEZEIPLZOTCQNPziPrZ3WCv4Hkh+8SM7Qirn2/JzlJe5Qt - IMzoKhGVVu62zPGO2f8uqvwVO7ZBpwGEu3Y0nx+xsR+UR6rSMs9BgDYfSl6hxumhEzXQ - AVU29P45SCq1drQE+AuDu2NsKyQ+R9NLi2XNN7GjQzGIS59mnKnciabxZ70kUwocqXEh - TsuagDSQmmH5SjPkOzOUNm6Sk8f3JEhf7X8a1bPpbg+ozA3KspzkTBjkMrHomLe9ffcm - BFgwNEyH9XBgnj0m4gnfT2SYRWWY1k3MsXJMQ+zIJmqc6vDRB4WpYW/qGMJadOFCZfMM - nXgA== -X-Google-DKIM-Signature: v=1; 
a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=ZS7i+zWbsuFYZiRlenI/F/Y7PzZj3Cv3ABmuogIV+d0=; - b=RCS2awk0QPdb9ZH7wNOyKeKXpab1x67IIAx6j5zq56jXzEwz7GXRfLumdFrGKXCGb4 - Ni9Rp5dJZYzq8bxw94GX64zJKMTSoJ4V+UAQj4zSBLEczJlyZKy/yWOit9ZSOEb0iMvR - 4+5mHFOU3YS4X1wdcqI8B4TLEOn1Me9mvfoveMAhSf8VHEhQ3neJrM0mY0hrTiGCSK5A - J4de3u3SGUe8nuXf8ZpGc38zd9x6oFOeBCUdjka6ao55yXPRLAwyLVnsLaRAKC6md3fT - VHJAQ+yfE2vhF9ga9RIwItKKGhAOJVHWoDcC3pQb9GfR0/p6eeP/23lc7iaFdQYywDcG - ckOQ== -X-Gm-Message-State: AJIora9+K81MvAVQpqZj+MVKlE2AQLbnYpnFjbMqoGoc4IJHROymV58Z - hBHBT8xaW7KW2eD0IR1+YvgbPP1Vazg= -X-Google-Smtp-Source: - AGRyM1uwFcCEi/xhgZ0h4sIMbRvSkHm3hRssR6SxZ63hO5m3+xAhe4vRctYY9iQ6nmc/njn1u9BXzW/MiYM= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:abb3:0:b0:66e:2f9a:4201 with SMTP id - v48-20020a25abb3000000b0066e2f9a4201mr26479914ybi.125.1657144864829; Wed, 06 - Jul 2022 15:01:04 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:18 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-10-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 09/14] mm: multi-gen LRU: optimize multiple memcgs -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144865; a=rsa-sha256; - cv=none; - b=q3V8GcW5a7gLdEkDRvzjbN1oCqdl/PcNPOBK/4yn5O6DZ96fhshx4mHlVqwifd6VM/h6DQ - bokhlmu8Wk2Z61Pnli47ITBMWiwbyG5GqreBCrRln1NfcGXS0mPhEW7lQeDU+ca5XNHA6R - 1crW4jbDQ+Ez8oz8X4F5X5OxrI6ddoA= -ARC-Authentication-Results: i=1; - imf28.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=Is9nnwDL; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf28.hostedemail.com: domain of - 3IAbGYgYKCGEXTYG9NFNNFKD.BNLKHMTW-LLJU9BJ.NQF@flex--yuzhao.bounces.google.com - designates 209.85.128.201 as permitted sender) - smtp.mailfrom=3IAbGYgYKCGEXTYG9NFNNFKD.BNLKHMTW-LLJU9BJ.NQF@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144865; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=ZS7i+zWbsuFYZiRlenI/F/Y7PzZj3Cv3ABmuogIV+d0=; - b=fen/7hqEjb4DZZ+j8OhMLxCTQaBQg1nYoyqyzaLOG2yllRdoXgC8upN+NQjx2/OsQCSFdY - tIpgzM/hy9Vb35EZoTXPI0b6U/1kGDSWHJyLMU3CkJKzSF9lMbdGm9UdAtXJ/1dnB/CxlX - 
R3DdBW8MzocpYgtBQDvKhNaIKlqhdRg= -X-Rspamd-Server: rspam08 -X-Rspamd-Queue-Id: 97908C0054 -X-Rspam-User: -Authentication-Results: imf28.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=Is9nnwDL; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf28.hostedemail.com: domain of - 3IAbGYgYKCGEXTYG9NFNNFKD.BNLKHMTW-LLJU9BJ.NQF@flex--yuzhao.bounces.google.com - designates 209.85.128.201 as permitted sender) - smtp.mailfrom=3IAbGYgYKCGEXTYG9NFNNFKD.BNLKHMTW-LLJU9BJ.NQF@flex--yuzhao.bounces.google.com -X-Stat-Signature: xt8apxhnez18ydabrirx1u5kimzk5obt -X-HE-Tag: 1657144865-519413 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -When multiple memcgs are available, it is possible to make better -choices based on generations and tiers and therefore improve the -overall performance under global memory pressure. This patch adds a -rudimentary optimization to select memcgs that can drop single-use -unmapped clean pages first. Doing so reduces the chance of going into -the aging path or swapping. These two decisions can be costly. - -A typical example that benefits from this optimization is a server -running mixed types of workloads, e.g., heavy anon workload in one -memcg and heavy buffered I/O workload in the other. - -Though this optimization can be applied to both kswapd and direct -reclaim, it is only added to kswapd to keep the patchset manageable. -Later improvements will cover the direct reclaim path. - -Server benchmark results: - Mixed workloads: - fio (buffered I/O): +[19, 21]% - IOPS BW - patch1-8: 1880k 7343MiB/s - patch1-9: 2252k 8796MiB/s - - memcached (anon): +[119, 123]% - Ops/sec KB/sec - patch1-8: 862768.65 33514.68 - patch1-9: 1911022.12 74234.54 - - Mixed workloads: - fio (buffered I/O): +[75, 77]% - IOPS BW - 5.19-rc1: 1279k 4996MiB/s - patch1-9: 2252k 8796MiB/s - - memcached (anon): +[13, 15]% - Ops/sec KB/sec - 5.19-rc1: 1673524.04 65008.87 - patch1-9: 1911022.12 74234.54 - - Configurations: - (changes since patch 6) - - cat mixed.sh - modprobe brd rd_nr=2 rd_size=56623104 - - swapoff -a - mkswap /dev/ram0 - swapon /dev/ram0 - - mkfs.ext4 /dev/ram1 - mount -t ext4 /dev/ram1 /mnt - - memtier_benchmark -S /var/run/memcached/memcached.sock \ - -P memcache_binary -n allkeys --key-minimum=1 \ - --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \ - --ratio 1:0 --pipeline 8 -d 2000 - - fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \ - --buffered=1 --ioengine=io_uring --iodepth=128 \ - --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ - --rw=randread --random_distribution=random --norandommap \ - --time_based --ramp_time=10m --runtime=90m --group_reporting & - pid=$! 
- - sleep 200 - - memtier_benchmark -S /var/run/memcached/memcached.sock \ - -P memcache_binary -n allkeys --key-minimum=1 \ - --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \ - --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed - - kill -INT $pid - wait - -Client benchmark results: - no change (CONFIG_MEMCG=n) - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - mm/vmscan.c | 55 ++++++++++++++++++++++++++++++++++++++++++++--------- - 1 file changed, 46 insertions(+), 9 deletions(-) - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 8e55a1ce1ae0..f469a2740835 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -129,6 +129,13 @@ struct scan_control { - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - -+#ifdef CONFIG_LRU_GEN -+ /* help make better choices when multiple memcgs are available */ -+ unsigned int memcgs_need_aging:1; -+ unsigned int memcgs_need_swapping:1; -+ unsigned int memcgs_avoid_swapping:1; -+#endif -+ - /* Allocation order */ - s8 order; - -@@ -4372,6 +4379,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - - VM_WARN_ON_ONCE(!current_is_kswapd()); - -+ /* -+ * To reduce the chance of going into the aging path or swapping, which -+ * can be costly, optimistically skip them unless their corresponding -+ * flags were cleared in the eviction path. This improves the overall -+ * performance when multiple memcgs are available. -+ */ -+ if (!sc->memcgs_need_aging) { -+ sc->memcgs_need_aging = true; -+ sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping; -+ sc->memcgs_need_swapping = true; -+ return; -+ } -+ -+ sc->memcgs_need_swapping = true; -+ sc->memcgs_avoid_swapping = true; -+ - set_mm_walk(pgdat); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); -@@ -4781,7 +4804,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - return scanned; - } - --static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ bool *need_swapping) - { - int type; - int scanned; -@@ -4844,14 +4868,16 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap - - sc->nr_reclaimed += reclaimed; - -+ if (type == LRU_GEN_ANON && need_swapping) -+ *need_swapping = true; -+ - return scanned; - } - - static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, -- bool can_swap, unsigned long reclaimed) -+ bool can_swap, unsigned long reclaimed, bool *need_aging) - { - int priority; -- bool need_aging; - unsigned long nr_to_scan; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - DEFINE_MAX_SEQ(lruvec); -@@ -4861,7 +4887,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * - (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) - return 0; - -- nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, &need_aging); -+ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, need_aging); - if (!nr_to_scan) - return 0; - -@@ -4877,7 +4903,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * - if (!nr_to_scan) - return 0; - -- 
if (!need_aging) -+ if (!*need_aging) - return nr_to_scan; - - /* skip the aging path at the default priority */ -@@ -4897,6 +4923,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * - static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - struct blk_plug plug; -+ bool need_aging = false; -+ bool need_swapping = false; - unsigned long scanned = 0; - unsigned long reclaimed = sc->nr_reclaimed; - -@@ -4918,21 +4946,30 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - else - swappiness = 0; - -- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed); -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging); - if (!nr_to_scan) -- break; -+ goto done; - -- delta = evict_folios(lruvec, sc, swappiness); -+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping); - if (!delta) -- break; -+ goto done; - - scanned += delta; - if (scanned >= nr_to_scan) - break; - -+ if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping) -+ break; -+ - cond_resched(); - } - -+ /* see the comment in lru_gen_age_node() */ -+ if (!need_aging) -+ sc->memcgs_need_aging = false; -+ if (!need_swapping) -+ sc->memcgs_need_swapping = false; -+done: - clear_mm_walk(); - - blk_finish_plug(&plug); - -From patchwork Wed Jul 6 22:00:19 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908707 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 8DE21CCA480 - for ; Wed, 6 Jul 2022 22:01:14 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id D162E8E0006; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id C9D778E0001; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id B177F8E0006; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0014.hostedemail.com - [216.40.44.14]) - by kanga.kvack.org (Postfix) with ESMTP id 9D5508E0001 - for ; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) -Received: from smtpin29.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay12.hostedemail.com (Postfix) with ESMTP id 5B55512053A - for ; Wed, 6 Jul 2022 22:01:07 +0000 (UTC) -X-FDA: 79658046174.29.1D659FF -Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com - [209.85.128.201]) - by imf23.hostedemail.com (Postfix) with ESMTP id F2017140064 - for ; Wed, 6 Jul 2022 22:01:06 +0000 (UTC) -Received: by mail-yw1-f201.google.com with SMTP id - 00721157ae682-31c9a49a1a8so63946947b3.9 - for ; Wed, 06 Jul 2022 15:01:06 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=t3JqYbFJT9lP6E96sRUzmCzQEu0iJg+mfU6dciROW6I=; - b=K/nKIb14JmIaSQ25G+voEr3Xu6sFBToolWxLX2DrPdbxAa6BpfoEW4/5621Rzsff4D - 1k3G9tp+5ESbNVZCZfqietdtMt6OTAchdy14TXI4WTiTZLglVlIfr80zpxGfIGcphLBv - c2R6icWOjZ0upEVkivTfwH9rKBl233YFlYCWfHzoiU07eBFA2yPOzHZx49n6UFl3tbHt - eSai05q6oFPAPMqEwWKLLg5e2ewTiqoowbahH4nTTyw69dIDZhmip41HFaA0/Sczzyq3 - 
JDic9dSJ+BDTRQ6TaWU0nw7eqP8mi+/sxNdfATpIluPgr0W9A0QZ1JCn1D9q09woZwV/ - PFjA== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=t3JqYbFJT9lP6E96sRUzmCzQEu0iJg+mfU6dciROW6I=; - b=HM7LGBCrS1eJ/vRo3XCktj01RjkBHhQmWt02aEfYLwa8PL1HdwG+c4Me4gn54xxguO - czAtvRKRQXHGFYRw3EumNTE4ZGOfg2XqVtN9EjwqQhTlBSwX+fziamWFBeroVGwQW5G1 - dGw5hoaY2I+TFoZJ29KsIagqOSfrJETzQGULi/sVBWKaWeb/S8HFZY/EKyoYDxZqIBmP - sF9WEmAcW0+fvVxqWYl3uVJzRtjHRL6YsrIahPgXVedZvFAkXUNU4kkV7vNYV3lz5mBQ - xaBVG/fe1KTKIxNs72Rv8R1FEPhZcIGFotO3DUMW0MRlYpm9F1IVf32khtQ+h2Ym9As/ - PzIA== -X-Gm-Message-State: AJIora954b3OSYa+S8ljATClx2rklKm1t2+1N36MlNK4kzbf9PLLetyh - 9bLCk9rYASJ4G36LmS+oOZUzhHu3MzI= -X-Google-Smtp-Source: - AGRyM1uCmWNAcyv7l4c+bwlvsNWjdcmS50NXK/ousi79Gs9bHWyAObimB3RXzG41nJY/wFbH1TL7Js/68Zk= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:e74e:0:b0:66e:32d4:1f0 with SMTP id - e75-20020a25e74e000000b0066e32d401f0mr24265460ybh.421.1657144866511; Wed, 06 - Jul 2022 15:01:06 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:19 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-11-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 10/14] mm: multi-gen LRU: kill switch -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144867; a=rsa-sha256; - cv=none; - b=srv0M1GwC9igO3Ssc6UQavsL8SeMR4TafyrVnDRr+qa3IdXnaeYuD7mVXVj2DTKftiNgsf - Z0438E6xGFdYlVffElthsW5+/j/7H+6w/+Rrj2/CtQGpnJriXc77Tp9lWDSGzJMNo/18Ur - 5XMrBTHRNBBRZNKQECXPfqx8mY2KlTw= -ARC-Authentication-Results: i=1; - imf23.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b="K/nKIb14"; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf23.hostedemail.com: domain of - 3IgbGYgYKCGMZVaIBPHPPHMF.DPNMJOVY-NNLWBDL.PSH@flex--yuzhao.bounces.google.com - designates 209.85.128.201 as permitted sender) - smtp.mailfrom=3IgbGYgYKCGMZVaIBPHPPHMF.DPNMJOVY-NNLWBDL.PSH@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144867; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=t3JqYbFJT9lP6E96sRUzmCzQEu0iJg+mfU6dciROW6I=; - b=v9VdQ/ak+0604gCltqLudvPrAy3WcrJhWxCXksIxicPZWyjAnzABJHeJwcXRUr74ilTy45 - 
1o9D+n98WPgODBBDXuQgOxdZ/m1FekCnCpnWnR72lB+33NlF8zDMR0jbie23mZMDqsMO/w - cYTpCFhyTn0ribSQdUv7TlFoFBXyRTw= -Authentication-Results: imf23.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b="K/nKIb14"; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf23.hostedemail.com: domain of - 3IgbGYgYKCGMZVaIBPHPPHMF.DPNMJOVY-NNLWBDL.PSH@flex--yuzhao.bounces.google.com - designates 209.85.128.201 as permitted sender) - smtp.mailfrom=3IgbGYgYKCGMZVaIBPHPPHMF.DPNMJOVY-NNLWBDL.PSH@flex--yuzhao.bounces.google.com -X-Stat-Signature: u9yuk5ppb8f6meekzwox11y8u8f8zsip -X-Rspamd-Queue-Id: F2017140064 -X-Rspamd-Server: rspam05 -X-Rspam-User: -X-HE-Tag: 1657144866-771308 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that -can be disabled include: - 0x0001: the multi-gen LRU core - 0x0002: walking page table, when arch_has_hw_pte_young() returns - true - 0x0004: clearing the accessed bit in non-leaf PMD entries, when - CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y - [yYnN]: apply to all the components above -E.g., - echo y >/sys/kernel/mm/lru_gen/enabled - cat /sys/kernel/mm/lru_gen/enabled - 0x0007 - echo 5 >/sys/kernel/mm/lru_gen/enabled - cat /sys/kernel/mm/lru_gen/enabled - 0x0005 - -NB: the page table walks happen on the scale of seconds under heavy -memory pressure, in which case the mmap_lock contention is a lesser -concern, compared with the LRU lock contention and the I/O congestion. -So far the only well-known case of the mmap_lock contention happens on -Android, due to Scudo [1] which allocates several thousand VMAs for -merely a few hundred MBs. The SPF and the Maple Tree also have -provided their own assessments [2][3]. However, if walking page tables -does worsen the mmap_lock contention, the kill switch can be used to -disable it. In this case the multi-gen LRU will suffer a minor -performance degradation, as shown previously. - -Clearing the accessed bit in non-leaf PMD entries can also be -disabled, since this behavior was not tested on x86 varieties other -than Intel and AMD. 
- -[1] https://source.android.com/devices/tech/debug/scudo -[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ -[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - include/linux/cgroup.h | 15 ++- - include/linux/mm_inline.h | 15 ++- - include/linux/mmzone.h | 9 ++ - kernel/cgroup/cgroup-internal.h | 1 - - mm/Kconfig | 6 + - mm/vmscan.c | 231 +++++++++++++++++++++++++++++++- - 6 files changed, 268 insertions(+), 9 deletions(-) - -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index 0d1ada8968d7..1bc0cabf993f 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) - css_put(&cgrp->self); - } - -+extern struct mutex cgroup_mutex; -+ -+static inline void cgroup_lock(void) -+{ -+ mutex_lock(&cgroup_mutex); -+} -+ -+static inline void cgroup_unlock(void) -+{ -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for -@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) - * as locks used during the cgroup_subsys::attach() methods. - */ - #ifdef CONFIG_PROVE_RCU --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - #define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ -@@ -708,6 +719,8 @@ struct cgroup; - static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } - static inline void css_get(struct cgroup_subsys_state *css) {} - static inline void css_put(struct cgroup_subsys_state *css) {} -+static inline void cgroup_lock(void) {} -+static inline void cgroup_unlock(void) {} - static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) { return 0; } - static inline int cgroupstats_build(struct cgroupstats *stats, -diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h -index f2b2296a42f9..4949eda9a9a2 100644 ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) - - #ifdef CONFIG_LRU_GEN - -+#ifdef CONFIG_LRU_GEN_ENABLED - static inline bool lru_gen_enabled(void) - { -- return true; -+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); -+ -+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); - } -+#else -+static inline bool lru_gen_enabled(void) -+{ -+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); -+ -+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); -+} -+#endif - - static inline bool lru_gen_in_fault(void) - { -@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, - - VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); - -- if (folio_test_unevictable(folio)) -+ if (folio_test_unevictable(folio) || !lrugen->enabled) - return false; - /* - * There are three common cases for this page: -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 0cf0856b484a..840b7ca8b91f 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -384,6 +384,13 @@ enum { - LRU_GEN_FILE, - 
}; - -+enum { -+ LRU_GEN_CORE, -+ LRU_GEN_MM_WALK, -+ LRU_GEN_NONLEAF_YOUNG, -+ NR_LRU_GEN_CAPS -+}; -+ - #define MIN_LRU_BATCH BITS_PER_LONG - #define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) - -@@ -425,6 +432,8 @@ struct lru_gen_struct { - /* can be modified without holding the LRU lock */ - atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ /* whether the multi-gen LRU is enabled */ -+ bool enabled; - }; - - enum { -diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h -index 5da09c74228d..c966e55cab29 100644 ---- a/kernel/cgroup/cgroup-internal.h -+++ b/kernel/cgroup/cgroup-internal.h -@@ -164,7 +164,6 @@ struct cgroup_mgctx { - #define DEFINE_CGROUP_MGCTX(name) \ - struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) - --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - extern struct cgroup_subsys *cgroup_subsys[]; - extern struct list_head cgroup_roots; -diff --git a/mm/Kconfig b/mm/Kconfig -index a93478acf341..0c2ef0af0036 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -1139,6 +1139,12 @@ config LRU_GEN - help - A high performance LRU implementation to overcommit memory. - -+config LRU_GEN_ENABLED -+ bool "Enable by default" -+ depends on LRU_GEN -+ help -+ This option enables the multi-gen LRU by default. -+ - config LRU_GEN_STATS - bool "Full stats for debugging" - depends on LRU_GEN -diff --git a/mm/vmscan.c b/mm/vmscan.c -index f469a2740835..4c8b475429ed 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -52,6 +52,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -3013,6 +3014,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, - - #ifdef CONFIG_LRU_GEN - -+#ifdef CONFIG_LRU_GEN_ENABLED -+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); -+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) -+#else -+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); -+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) -+#endif -+ - /****************************************************************************** - * shorthand helpers - ******************************************************************************/ -@@ -3890,7 +3899,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area - goto next; - - if (!pmd_trans_huge(pmd[i])) { -- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) -+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && -+ get_cap(LRU_GEN_NONLEAF_YOUNG)) - pmdp_test_and_clear_young(vma, addr, pmd + i); - goto next; - } -@@ -3988,10 +3998,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, - walk->mm_stats[MM_NONLEAF_TOTAL]++; - - #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG -- if (!pmd_young(val)) -- continue; -+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { -+ if (!pmd_young(val)) -+ continue; - -- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); -+ } - #endif - if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) - continue; -@@ -4249,7 +4261,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - * handful of PTEs. Spreading the work out over a period of time usually - * is less efficient, but it avoids bursty page faults. 
- */ -- if (!arch_has_hw_pte_young()) { -+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { - success = iterate_mm_list_nowalk(lruvec, max_seq); - goto done; - } -@@ -4975,6 +4987,211 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc - blk_finish_plug(&plug); - } - -+/****************************************************************************** -+ * state change -+ ******************************************************************************/ -+ -+static bool __maybe_unused state_is_valid(struct lruvec *lruvec) -+{ -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ if (lrugen->enabled) { -+ enum lru_list lru; -+ -+ for_each_evictable_lru(lru) { -+ if (!list_empty(&lruvec->lists[lru])) -+ return false; -+ } -+ } else { -+ int gen, type, zone; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ if (!list_empty(&lrugen->lists[gen][type][zone])) -+ return false; -+ -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON_ONCE(lrugen->nr_pages[gen][type][zone]); -+ } -+ } -+ -+ return true; -+} -+ -+static bool fill_evictable(struct lruvec *lruvec) -+{ -+ enum lru_list lru; -+ int remaining = MAX_LRU_BATCH; -+ -+ for_each_evictable_lru(lru) { -+ int type = is_file_lru(lru); -+ bool active = is_active_lru(lru); -+ struct list_head *head = &lruvec->lists[lru]; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct folio *folio = lru_to_folio(head); -+ -+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); -+ -+ lruvec_del_folio(lruvec, folio); -+ success = lru_gen_add_folio(lruvec, folio, false); -+ VM_WARN_ON_ONCE(!success); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+static bool drain_evictable(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ int remaining = MAX_LRU_BATCH; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct folio *folio = lru_to_folio(head); -+ -+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); -+ -+ success = lru_gen_del_folio(lruvec, folio, false); -+ VM_WARN_ON_ONCE(!success); -+ lruvec_add_folio(lruvec, folio); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+static void lru_gen_change_state(bool enabled) -+{ -+ static DEFINE_MUTEX(state_mutex); -+ -+ struct mem_cgroup *memcg; -+ -+ cgroup_lock(); -+ cpus_read_lock(); -+ get_online_mems(); -+ mutex_lock(&state_mutex); -+ -+ if (enabled == lru_gen_enabled()) -+ goto unlock; -+ -+ if (enabled) -+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); -+ else -+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(memcg, nid); -+ -+ if (!lruvec) -+ continue; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); -+ VM_WARN_ON_ONCE(!state_is_valid(lruvec)); -+ -+ lruvec->lrugen.enabled = enabled; -+ -+ while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+unlock: -+ mutex_unlock(&state_mutex); -+ put_online_mems(); -+ cpus_read_unlock(); -+ cgroup_unlock(); -+} -+ -+/****************************************************************************** -+ * sysfs interface -+ ******************************************************************************/ -+ -+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ unsigned int caps = 0; -+ -+ if (get_cap(LRU_GEN_CORE)) -+ caps |= BIT(LRU_GEN_CORE); -+ -+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) -+ caps |= BIT(LRU_GEN_MM_WALK); -+ -+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) -+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG); -+ -+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); -+} -+ -+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ int i; -+ unsigned int caps; -+ -+ if (tolower(*buf) == 'n') -+ caps = 0; -+ else if (tolower(*buf) == 'y') -+ caps = -1; -+ else if (kstrtouint(buf, 0, &caps)) -+ return -EINVAL; -+ -+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) { -+ bool enabled = caps & BIT(i); -+ -+ if (i == LRU_GEN_CORE) -+ lru_gen_change_state(enabled); -+ else if (enabled) -+ static_branch_enable(&lru_gen_caps[i]); -+ else -+ static_branch_disable(&lru_gen_caps[i]); -+ } -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_enabled_attr = __ATTR( -+ enabled, 0644, show_enabled, store_enabled -+); -+ -+static struct attribute *lru_gen_attrs[] = { -+ &lru_gen_enabled_attr.attr, -+ NULL -+}; -+ -+static struct attribute_group lru_gen_attr_group = { -+ .name = "lru_gen", -+ .attrs = lru_gen_attrs, -+}; -+ - /****************************************************************************** - * initialization - ******************************************************************************/ -@@ -4985,6 +5202,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) - struct lru_gen_struct *lrugen = &lruvec->lrugen; - - lrugen->max_seq = MIN_NR_GENS + 1; -+ lrugen->enabled = lru_gen_enabled(); - - for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -@@ -5024,6 +5242,9 @@ static int __init init_lru_gen(void) - BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); - -+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) -+ pr_err("lru_gen: failed to create sysfs group\n"); -+ - return 0; - }; - late_initcall(init_lru_gen); - -From patchwork Wed Jul 6 22:00:20 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908708 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id E259FCCA47C - for ; Wed, 6 Jul 2022 22:01:17 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 1A2F98E0007; Wed, 6 Jul 2022 18:01:09 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 153298E0001; Wed, 6 Jul 2022 18:01:09 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, 
from userid 63042) - id EE7558E0007; Wed, 6 Jul 2022 18:01:08 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0016.hostedemail.com - [216.40.44.16]) - by kanga.kvack.org (Postfix) with ESMTP id DAE728E0001 - for ; Wed, 6 Jul 2022 18:01:08 -0400 (EDT) -Received: from smtpin28.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay12.hostedemail.com (Postfix) with ESMTP id BCB43120606 - for ; Wed, 6 Jul 2022 22:01:08 +0000 (UTC) -X-FDA: 79658046216.28.7964C66 -Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf22.hostedemail.com (Postfix) with ESMTP id 5376DC0059 - for ; Wed, 6 Jul 2022 22:01:08 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - k18-20020a25fe12000000b0066e21b72767so9506499ybe.5 - for ; Wed, 06 Jul 2022 15:01:08 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=m3EW4cfAlntTqnxn3SvhsZvF1ytN+sfDtB6iRdzihvY=; - b=NNZxOJisLedvEph13coGoCeVo89XYF3cKhoLr0Qj+8EQSroRh25w+qZuSGaKvrNfmO - djUv79dYHeRCliQ2lBYEsuuPJN6lgSZ6cKW987LKYkUaRIiHw552kndr1VR1raRgUvCU - 568te5aggKYg95okJZ0cLsdFaiOBB18/hCGgU+4bQM73SosPCL/NpSqGWL8mW9AiVFs+ - hT7ErHYOnMn+bCDzuk8GAu9J4/5Gq8c/6z9M6D6X+HmVK0MeVpaKpZ0jPz/vsi747v3J - zvNibUS9XJKNBhR7/Fg26FpINdlMkWHvvcikRiTD5O+czcMeNF2XfnGAvAgAPgyPnYK8 - b6mw== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=m3EW4cfAlntTqnxn3SvhsZvF1ytN+sfDtB6iRdzihvY=; - b=zXNzXRZtXF7HqnKj+YqGY7LRRVq6lHVBV5jsPt3MSSmDDGJn6CoeLbEp4cPtjTwO6B - PdPka308tTjCbbT5NueJUGYkQFn+dt6QZPZS/jb4O/Jp4FJYfjG0o4VrhF0wFgETWdJW - SlKTi07ik53nd5tJXcgBVdPvMoYv2WqllknYQA/iDN2/SNhuFxoKXFeHv/5ulZkc5nBp - SmXgJE0BppobJXNXNvFGVF0nGLh8MGF2CDBRi/+lvRIg3ypzFxQ/hVUXa8U2PNCdICqT - s8aNXeAFHJX2x34DN+/C6pMD4gEu9krSDK9BNN79fEhVT7obeGbxtKJ4DZnLSgUa0Xm+ - emKQ== -X-Gm-Message-State: AJIora+LcODNzO0M8diNM7f7lT7CGqqvE1q+GRK1vGVAVA10uhTltSsu - Casv9y2GuL6ljruRdbC60eu1gck5MIA= -X-Google-Smtp-Source: - AGRyM1uVFLai60fRsrxUz+UveX+2HvTnchQxr73gyI+bA9ud92MMOTkT47lvZz9+aNC2VPhD8jfbEwKxJDM= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:1c56:0:b0:66e:2d23:d65d with SMTP id - c83-20020a251c56000000b0066e2d23d65dmr26931039ybc.253.1657144867700; Wed, 06 - Jul 2022 15:01:07 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:20 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-12-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 11/14] mm: multi-gen LRU: thrashing prevention -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , 
- Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144868; a=rsa-sha256; - cv=none; - b=8QjwJzQPm7r/G+Ug8d4Bn/JrZtirxW14NE/TPM5Yuz8TtgqfXHSgDZ0NZs+0NMnmPdFebK - BewOSgj/R+9PisPRBLUEepAkTTAjyW6prOGRhTAKigLh6I3aJOU2/+iXQO+eUvhPXHNnMf - KRnEXerAaMeP1dBwH0VFivF74hvg2OQ= -ARC-Authentication-Results: i=1; - imf22.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=NNZxOJis; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf22.hostedemail.com: domain of - 3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144868; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=m3EW4cfAlntTqnxn3SvhsZvF1ytN+sfDtB6iRdzihvY=; - b=DTQGqCEN2saKpCn2Rlj0DwxYUYns5aLH6ctyLw23CxaYk5FVEKFifd/4msPagn2x3OyYoJ - IHUvwyXUjQkcTa1cZQoQjZtTkZ5tAB3HGWKknBtj00SV590QYCz1tvu/9DdrTQBAJJQVkL - NDOvAf+Q5C0pIHmFotDqxriphq5nQvg= -X-Stat-Signature: ob4t1mrtn1zzw1fgdc6fyo685ij649e9 -X-Rspam-User: -X-Rspamd-Server: rspam12 -X-Rspamd-Queue-Id: 5376DC0059 -Authentication-Results: imf22.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=NNZxOJis; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf22.hostedemail.com: domain of - 3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com -X-HE-Tag: 1657144868-301835 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as -requested by many desktop users [1]. - -When set to value N, it prevents the working set of N milliseconds -from getting evicted. The OOM killer is triggered if this working set -cannot be kept in memory. Based on the average human detectable lag -(~100ms), N=1000 usually eliminates intolerable lags due to thrashing. -Larger values like N=3000 make lags less noticeable at the risk of -premature OOM kills. - -Compared with the size-based approach [2], this time-based approach -has the following advantages: -1. It is easier to configure because it is agnostic to applications - and memory sizes. -2. It is more reliable because it is directly wired to the OOM killer. 
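A minimal userspace sketch, not taken from the patch itself, of how the knob described above can be driven: it writes a millisecond value to /sys/kernel/mm/lru_gen/min_ttl_ms and assumes a kernel built with CONFIG_LRU_GEN=y where that file exists; the 1000 ms value is only an example:

    /* Sketch (not from the patch): set the thrashing-prevention window
     * to 1000 ms via /sys/kernel/mm/lru_gen/min_ttl_ms. Requires root;
     * the 1000 ms value is an arbitrary example. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            const char *path = "/sys/kernel/mm/lru_gen/min_ttl_ms";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return EXIT_FAILURE;
            }
            /* store_min_ttl() parses an unsigned int in milliseconds. */
            fprintf(f, "%u\n", 1000u);
            return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
    }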
- -[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/ -[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/ - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - include/linux/mmzone.h | 2 ++ - mm/vmscan.c | 71 +++++++++++++++++++++++++++++++++++++++--- - 2 files changed, 69 insertions(+), 4 deletions(-) - -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 840b7ca8b91f..472bd5335517 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -419,6 +419,8 @@ struct lru_gen_struct { - unsigned long max_seq; - /* the eviction increments the oldest generation numbers */ - unsigned long min_seq[ANON_AND_FILE]; -+ /* the birth time of each generation in jiffies */ -+ unsigned long timestamps[MAX_NR_GENS]; - /* the multi-gen LRU lists, lazily sorted on eviction */ - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the multi-gen LRU sizes, eventually consistent */ -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 4c8b475429ed..1f2892a0dc41 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -4233,6 +4233,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) - for (type = 0; type < ANON_AND_FILE; type++) - reset_ctrl_pos(lruvec, type, false); - -+ WRITE_ONCE(lrugen->timestamps[next], jiffies); - /* make sure preceding modifications appear */ - smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); - -@@ -4359,7 +4360,7 @@ static unsigned long get_nr_evictable(struct lruvec *lruvec, unsigned long max_s - return total; - } - --static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) - { - bool need_aging; - unsigned long nr_to_scan; -@@ -4373,21 +4374,40 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) - mem_cgroup_calculate_protection(NULL, memcg); - - if (mem_cgroup_below_min(memcg)) -- return; -+ return false; - - nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging); - if (!nr_to_scan) -- return; -+ return false; - - nr_to_scan >>= mem_cgroup_online(memcg) ? 
sc->priority : 0; - -+ if (min_ttl) { -+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); -+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); -+ -+ if (time_is_after_jiffies(birth + min_ttl)) -+ return false; -+ -+ /* the size is likely too small to be helpful */ -+ if (!nr_to_scan && sc->priority != DEF_PRIORITY) -+ return false; -+ } -+ - if (nr_to_scan && need_aging) - try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); -+ -+ return true; - } - -+/* to protect the working set of the last N jiffies */ -+static unsigned long lru_gen_min_ttl __read_mostly; -+ - static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - { - struct mem_cgroup *memcg; -+ bool success = false; -+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); - - VM_WARN_ON_ONCE(!current_is_kswapd()); - -@@ -4413,12 +4433,28 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - do { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - -- age_lruvec(lruvec, sc); -+ if (age_lruvec(lruvec, sc, min_ttl)) -+ success = true; - - cond_resched(); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); - - clear_mm_walk(); -+ -+ /* -+ * The main goal is to OOM kill if every generation from all memcgs is -+ * younger than min_ttl. However, another theoretical possibility is all -+ * memcgs are either below min or empty. -+ */ -+ if (!success && !sc->order && mutex_trylock(&oom_lock)) { -+ struct oom_control oc = { -+ .gfp_mask = sc->gfp_mask, -+ }; -+ -+ out_of_memory(&oc); -+ -+ mutex_unlock(&oom_lock); -+ } - } - - /* -@@ -5135,6 +5171,28 @@ static void lru_gen_change_state(bool enabled) - * sysfs interface - ******************************************************************************/ - -+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); -+} -+ -+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ unsigned int msecs; -+ -+ if (kstrtouint(buf, 0, &msecs)) -+ return -EINVAL; -+ -+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( -+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl -+); -+ - static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) - { - unsigned int caps = 0; -@@ -5183,6 +5241,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR( - ); - - static struct attribute *lru_gen_attrs[] = { -+ &lru_gen_min_ttl_attr.attr, - &lru_gen_enabled_attr.attr, - NULL - }; -@@ -5198,12 +5257,16 @@ static struct attribute_group lru_gen_attr_group = { - - void lru_gen_init_lruvec(struct lruvec *lruvec) - { -+ int i; - int gen, type, zone; - struct lru_gen_struct *lrugen = &lruvec->lrugen; - - lrugen->max_seq = MIN_NR_GENS + 1; - lrugen->enabled = lru_gen_enabled(); - -+ for (i = 0; i <= MIN_NR_GENS + 1; i++) -+ lrugen->timestamps[i] = jiffies; -+ - for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); - - -From patchwork Wed Jul 6 22:00:21 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908710 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by 
smtp.lore.kernel.org (Postfix) with ESMTP id 69F71CCA47C - for ; Wed, 6 Jul 2022 22:01:25 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 57B768E0009; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 4D6338E0001; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 3007F8E0009; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0012.hostedemail.com - [216.40.44.12]) - by kanga.kvack.org (Postfix) with ESMTP id 1256B8E0001 - for ; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) -Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay09.hostedemail.com (Postfix) with ESMTP id AB0F535EC6 - for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) -X-FDA: 79658046342.31.A60FB64 -Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf26.hostedemail.com (Postfix) with ESMTP id 4A6EA140028 - for ; Wed, 6 Jul 2022 22:01:10 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - p7-20020a25d807000000b0066e36989a90so7892676ybg.8 - for ; Wed, 06 Jul 2022 15:01:10 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=nff1jLrA4AEpo88lpO2ZCXRvuzs0CKl/TI+ofmEg1y8=; - b=CPQvXMErOqHr1LM+OMqtT0F59XyB+HiQxBX+EbwoUSnPn/FOpbR4dV1NCCwYakR+KD - gThfZIfqp3Y1SzCO2443reP2Soe3KDHNgAEXCZ5YNoeE7AXlAuA2fgD7YeAXZovjmVIh - 7mERrjTMT6/EWjW531e5FNoxfhaMBEMBEgwjAOQ3Km57LeRgBcWr2IgRe48XaW69M16C - KWj2PGLEmurhGwwHU4NVVPpbjL3o7cE3vD/yehuUCz476hIOcC2Nqpn4krz36H5vP68u - MNeJkhynrE7FhYi7+GgffibtX96Vf3x/16YGAxyUCnSyvvk6OhNUeqKo/LQmoS3LAyl4 - LFpw== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=nff1jLrA4AEpo88lpO2ZCXRvuzs0CKl/TI+ofmEg1y8=; - b=VL/nujGONvdvil9k1He7hpkq5LhpScFPlvGxIB7gfBV4qX15+ZTbNG009jHkmfEswJ - LM40W6DV3mGXjx6Gy2MTjobH0jL4c9qrU1ia5WRKzWkXlxaCkDE82vwuaz7rycBaAiPt - JhRi9ADSMoA9G43MZZei2oSwmUoW9WcH4Umy1YImLdHAjkYdJQ+Ss3Q4uYfGGw3866qm - nfc0pXT5KiNC2DMr+Cla/Llx1WlFNi7QIf3AmdpJ9gZTxCC28ikjniRVZN6b7bTrvjnO - iEyt9jKYEk9vW/yTUEzM8L41D+e+Z60AT6T0qi0KACO4Tp3xz77ui1i2Q85btfHs4Uah - 1qwQ== -X-Gm-Message-State: AJIora+YmPGCc9b5W8VNnqsviDKSYwcLGbNwLNCyRBey3F3rMvUMSj7O - PlfF3OKJjs3zxsBvgcOgTGWclCLXuc4= -X-Google-Smtp-Source: - AGRyM1shBkUATwCAbsz8cAeEoY3s7WAj+Jhs0L0rMlWdOOLCX8yRP4QO9OI90Aiszy92GtEPUW7W76UGd7w= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a0d:c486:0:b0:31c:3b63:91fe with SMTP id - g128-20020a0dc486000000b0031c3b6391femr43427605ywd.7.1657144869573; Wed, 06 - Jul 2022 15:01:09 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:21 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-13-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 12/14] mm: multi-gen LRU: debugfs interface -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , 
Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Qi Zheng , Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144870; a=rsa-sha256; - cv=none; - b=o2/nINgmuZxzIEJU+aSCudJDHKhL7ULIt8sF3JC62cV/HuuBLU0B/xVeMeA2f0cfJN2wtO - kh2UubWWhgsh8V4Cx5XQQfC0fnIjU7kesFrHJslbwx0sV7BFvFu/mCMwBdA2zb0NjFYQ+H - 9ov/Z61nItyp9dvXEOPZKlu3qRCy8D4= -ARC-Authentication-Results: i=1; - imf26.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=CPQvXMEr; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf26.hostedemail.com: domain of - 3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144870; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=nff1jLrA4AEpo88lpO2ZCXRvuzs0CKl/TI+ofmEg1y8=; - b=RI+I7W6K9d5xsUHY54+KeCzGoeOqxuVYKkoikwvrtSbya6NSfQOh7+EFtaBhpVNiDwQMte - 1gOSPtlHmqa//TuxixmT7E3h+4+bbMuck8gjgOl+LEQXqAO7KWKyE6sirgzmwX5HwXk8e5 - zWZIZi2rLOPaapJlUtXn2+31FvtGh1c= -Authentication-Results: imf26.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=CPQvXMEr; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf26.hostedemail.com: domain of - 3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com -X-Stat-Signature: oqpxscpz6ano7mm34xg1zaoyrcimtdxo -X-Rspamd-Queue-Id: 4A6EA140028 -X-Rspamd-Server: rspam05 -X-Rspam-User: -X-HE-Tag: 1657144870-268992 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Add /sys/kernel/debug/lru_gen for working set estimation and proactive -reclaim. These techniques are commonly used to optimize job scheduling -(bin packing) in data centers [1][2]. - -Compared with the page table-based approach and the PFN-based -approach, this lruvec-based approach has the following advantages: -1. It offers better choices because it is aware of memcgs, NUMA nodes, - shared mappings and unmapped page cache. -2. It is more scalable because it is O(nr_hot_pages), whereas the - PFN-based approach is O(nr_total_pages). - -Add /sys/kernel/debug/lru_gen_full for debugging. 
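A minimal userspace sketch, not taken from the patch itself, that issues one aging command in the documented format "+ memcg_id node_id max_gen_nr [can_swap [force_scan]]"; the memcg id, node id and generation number below are placeholders that would normally be read back from the same file first:

    /* Sketch (not from the patch): write one aging command to the
     * debugfs file added by this patch. The ids and the generation
     * number are placeholders; real values come from first reading
     * /sys/kernel/debug/lru_gen. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* "+ memcg_id node_id max_gen_nr [can_swap [force_scan]]" */
            const char cmd[] = "+ 0 0 4 1 1\n";
            int fd = open("/sys/kernel/debug/lru_gen", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, cmd, strlen(cmd)) < 0)
                    perror("write");
            close(fd);
            return 0;
    }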
- -[1] https://dl.acm.org/doi/10.1145/3297858.3304053 -[2] https://dl.acm.org/doi/10.1145/3503222.3507731 - -Signed-off-by: Yu Zhao -Reviewed-by: Qi Zheng -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain ---- - include/linux/nodemask.h | 1 + - mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++- - 2 files changed, 402 insertions(+), 10 deletions(-) - -diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index 0f233b76c9ce..292ec0ce0d63 100644 ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -485,6 +485,7 @@ static inline int num_node_state(enum node_states state) - #define first_online_node 0 - #define first_memory_node 0 - #define next_online_node(nid) (MAX_NUMNODES) -+#define next_memory_node(nid) (MAX_NUMNODES) - #define nr_node_ids 1U - #define nr_online_nodes 1U - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 1f2892a0dc41..fbcd298adca7 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -53,6 +53,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -4137,12 +4138,40 @@ static void clear_mm_walk(void) - kfree(walk); - } - --static void inc_min_seq(struct lruvec *lruvec, int type) -+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) - { -+ int zone; -+ int remaining = MAX_LRU_BATCH; - struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - -+ if (type == LRU_GEN_ANON && !can_swap) -+ goto done; -+ -+ /* prevent cold/hot inversion if force_scan is true */ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ struct list_head *head = &lrugen->lists[old_gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct folio *folio = lru_to_folio(head); -+ -+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); -+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); -+ -+ new_gen = folio_inc_gen(lruvec, folio, false); -+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+done: - reset_ctrl_pos(lruvec, type, true); - WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); -+ -+ return true; - } - - static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) -@@ -4188,7 +4217,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) - return success; - } - --static void inc_max_seq(struct lruvec *lruvec, bool can_swap) -+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) - { - int prev, next; - int type, zone; -@@ -4202,9 +4231,13 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) - if (get_nr_gens(lruvec, type) != MAX_NR_GENS) - continue; - -- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); -+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - -- inc_min_seq(lruvec, type); -+ while (!inc_min_seq(lruvec, type, can_swap)) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } - } - - /* -@@ -4241,7 +4274,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) - } - - static bool try_to_inc_max_seq(struct lruvec *lruvec, 
unsigned long max_seq, -- struct scan_control *sc, bool can_swap) -+ struct scan_control *sc, bool can_swap, bool force_scan) - { - bool success; - struct lru_gen_mm_walk *walk; -@@ -4262,7 +4295,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - * handful of PTEs. Spreading the work out over a period of time usually - * is less efficient, but it avoids bursty page faults. - */ -- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { -+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { - success = iterate_mm_list_nowalk(lruvec, max_seq); - goto done; - } -@@ -4276,7 +4309,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - walk->lruvec = lruvec; - walk->max_seq = max_seq; - walk->can_swap = can_swap; -- walk->force_scan = false; -+ walk->force_scan = force_scan; - - do { - success = iterate_mm_list(lruvec, walk, &mm); -@@ -4296,7 +4329,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - - VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); - -- inc_max_seq(lruvec, can_swap); -+ inc_max_seq(lruvec, can_swap, force_scan); - /* either this sees any waiters or they will see updated max_seq */ - if (wq_has_sleeper(&lruvec->mm_state.wait)) - wake_up_all(&lruvec->mm_state.wait); -@@ -4395,7 +4428,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned - } - - if (nr_to_scan && need_aging) -- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); -+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); - - return true; - } -@@ -4962,7 +4995,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * - if (current_is_kswapd()) - return 0; - -- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) -+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) - return nr_to_scan; - done: - return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; -@@ -5251,6 +5284,361 @@ static struct attribute_group lru_gen_attr_group = { - .attrs = lru_gen_attrs, - }; - -+/****************************************************************************** -+ * debugfs interface -+ ******************************************************************************/ -+ -+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ struct mem_cgroup *memcg; -+ loff_t nr_to_skip = *pos; -+ -+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); -+ if (!m->private) -+ return ERR_PTR(-ENOMEM); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node_state(nid, N_MEMORY) { -+ if (!nr_to_skip--) -+ return get_lruvec(memcg, nid); -+ } -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ return NULL; -+} -+ -+static void lru_gen_seq_stop(struct seq_file *m, void *v) -+{ -+ if (!IS_ERR_OR_NULL(v)) -+ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); -+ -+ kvfree(m->private); -+ m->private = NULL; -+} -+ -+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ int nid = lruvec_pgdat(v)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(v); -+ -+ ++*pos; -+ -+ nid = next_memory_node(nid); -+ if (nid == MAX_NUMNODES) { -+ memcg = mem_cgroup_iter(NULL, memcg, NULL); -+ if (!memcg) -+ return NULL; -+ -+ nid = first_memory_node; -+ } -+ -+ return get_lruvec(memcg, nid); -+} -+ -+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, -+ unsigned long max_seq, unsigned long *min_seq, -+ unsigned long seq) -+{ -+ int i; -+ int type, tier; -+ int hist = lru_hist_from_seq(seq); -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ seq_printf(m, " %10d", tier); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ const char *s = " "; -+ unsigned long n[3] = {}; -+ -+ if (seq == max_seq) { -+ s = "RT "; -+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); -+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); -+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { -+ s = "rep"; -+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); -+ } -+ -+ for (i = 0; i < 3; i++) -+ seq_printf(m, " %10lu%c", n[i], s[i]); -+ } -+ seq_putc(m, '\n'); -+ } -+ -+ seq_puts(m, " "); -+ for (i = 0; i < NR_MM_STATS; i++) { -+ const char *s = " "; -+ unsigned long n = 0; -+ -+ if (seq == max_seq && NR_HIST_GENS == 1) { -+ s = "LOYNFA"; -+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]); -+ } else if (seq != max_seq && NR_HIST_GENS > 1) { -+ s = "loynfa"; -+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]); -+ } -+ -+ seq_printf(m, " %10lu%c", n, s[i]); -+ } -+ seq_putc(m, '\n'); -+} -+ -+static int lru_gen_seq_show(struct seq_file *m, void *v) -+{ -+ unsigned long seq; -+ bool full = !debugfs_real_fops(m->file)->write; -+ struct lruvec *lruvec = v; -+ struct lru_gen_struct *lrugen = &lruvec->lrugen; -+ int nid = lruvec_pgdat(lruvec)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (nid == first_memory_node) { -+ const char *path = memcg ? 
m->private : ""; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); -+#endif -+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); -+ } -+ -+ seq_printf(m, " node %5d\n", nid); -+ -+ if (!full) -+ seq = min_seq[LRU_GEN_ANON]; -+ else if (max_seq >= MAX_NR_GENS) -+ seq = max_seq - MAX_NR_GENS + 1; -+ else -+ seq = 0; -+ -+ for (; seq <= max_seq; seq++) { -+ int type, zone; -+ int gen = lru_gen_from_seq(seq); -+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); -+ -+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ unsigned long size = 0; -+ char mark = full && seq < min_seq[type] ? 'x' : ' '; -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); -+ -+ seq_printf(m, " %10lu%c", size, mark); -+ } -+ -+ seq_putc(m, '\n'); -+ -+ if (full) -+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); -+ } -+ -+ return 0; -+} -+ -+static const struct seq_operations lru_gen_seq_ops = { -+ .start = lru_gen_seq_start, -+ .stop = lru_gen_seq_stop, -+ .next = lru_gen_seq_next, -+ .show = lru_gen_seq_show, -+}; -+ -+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, -+ bool can_swap, bool force_scan) -+{ -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (seq < max_seq) -+ return 0; -+ -+ if (seq > max_seq) -+ return -EINVAL; -+ -+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) -+ return -ERANGE; -+ -+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); -+ -+ return 0; -+} -+ -+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, -+ int swappiness, unsigned long nr_to_reclaim) -+{ -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq + MIN_NR_GENS > max_seq) -+ return -EINVAL; -+ -+ sc->nr_reclaimed = 0; -+ -+ while (!signal_pending(current)) { -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (seq < min_seq[!swappiness]) -+ return 0; -+ -+ if (sc->nr_reclaimed >= nr_to_reclaim) -+ return 0; -+ -+ if (!evict_folios(lruvec, sc, swappiness, NULL)) -+ return 0; -+ -+ cond_resched(); -+ } -+ -+ return -EINTR; -+} -+ -+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, -+ struct scan_control *sc, int swappiness, unsigned long opt) -+{ -+ struct lruvec *lruvec; -+ int err = -EINVAL; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) -+ return -EINVAL; -+ -+ if (!mem_cgroup_disabled()) { -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_id(memcg_id); -+#ifdef CONFIG_MEMCG -+ if (memcg && !css_tryget(&memcg->css)) -+ memcg = NULL; -+#endif -+ rcu_read_unlock(); -+ -+ if (!memcg) -+ return -EINVAL; -+ } -+ -+ if (memcg_id != mem_cgroup_id(memcg)) -+ goto done; -+ -+ lruvec = get_lruvec(memcg, nid); -+ -+ if (swappiness < 0) -+ swappiness = get_swappiness(lruvec, sc); -+ else if (swappiness > 200) -+ goto done; -+ -+ switch (cmd) { -+ case '+': -+ err = run_aging(lruvec, seq, sc, swappiness, opt); -+ break; -+ case '-': -+ err = run_eviction(lruvec, seq, sc, swappiness, opt); -+ break; -+ } -+done: -+ mem_cgroup_put(memcg); -+ -+ return err; -+} -+ -+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, -+ size_t len, loff_t *pos) -+{ -+ void *buf; -+ char *cur, *next; -+ unsigned int flags; -+ struct blk_plug plug; -+ int err = -EINVAL; -+ struct scan_control sc = { -+ .may_writepage = true, -+ .may_unmap = true, -+ 
.may_swap = true, -+ .reclaim_idx = MAX_NR_ZONES - 1, -+ .gfp_mask = GFP_KERNEL, -+ }; -+ -+ buf = kvmalloc(len + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ -+ if (copy_from_user(buf, src, len)) { -+ kvfree(buf); -+ return -EFAULT; -+ } -+ -+ set_task_reclaim_state(current, &sc.reclaim_state); -+ flags = memalloc_noreclaim_save(); -+ blk_start_plug(&plug); -+ if (!set_mm_walk(NULL)) { -+ err = -ENOMEM; -+ goto done; -+ } -+ -+ next = buf; -+ next[len] = '\0'; -+ -+ while ((cur = strsep(&next, ",;\n"))) { -+ int n; -+ int end; -+ char cmd; -+ unsigned int memcg_id; -+ unsigned int nid; -+ unsigned long seq; -+ unsigned int swappiness = -1; -+ unsigned long opt = -1; -+ -+ cur = skip_spaces(cur); -+ if (!*cur) -+ continue; -+ -+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, -+ &seq, &end, &swappiness, &end, &opt, &end); -+ if (n < 4 || cur[end]) { -+ err = -EINVAL; -+ break; -+ } -+ -+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); -+ if (err) -+ break; -+ } -+done: -+ clear_mm_walk(); -+ blk_finish_plug(&plug); -+ memalloc_noreclaim_restore(flags); -+ set_task_reclaim_state(current, NULL); -+ -+ kvfree(buf); -+ -+ return err ? : len; -+} -+ -+static int lru_gen_seq_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &lru_gen_seq_ops); -+} -+ -+static const struct file_operations lru_gen_rw_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .write = lru_gen_seq_write, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static const struct file_operations lru_gen_ro_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ - /****************************************************************************** - * initialization - ******************************************************************************/ -@@ -5308,6 +5696,9 @@ static int __init init_lru_gen(void) - if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) - pr_err("lru_gen: failed to create sysfs group\n"); - -+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); -+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); -+ - return 0; - }; - late_initcall(init_lru_gen); - -From patchwork Wed Jul 6 22:00:22 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908711 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 6414FC43334 - for ; Wed, 6 Jul 2022 22:01:29 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 39E278E000A; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 34DB98E0001; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 1A1728E000A; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0017.hostedemail.com - [216.40.44.17]) - by kanga.kvack.org (Postfix) with ESMTP id 047FA8E0001 - for ; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) -Received: from smtpin07.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay06.hostedemail.com (Postfix) with ESMTP id CA9C334906 - for ; Wed, 6 Jul 2022 22:01:12 +0000 (UTC) -X-FDA: 79658046384.07.4AE281A 
-Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf19.hostedemail.com (Postfix) with ESMTP id E1D811A004A - for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - j11-20020a05690212cb00b006454988d225so12639320ybu.10 - for ; Wed, 06 Jul 2022 15:01:11 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=gpspQZzCiCtLDF1mE2Rbzp3OUWg7vlq70C4xLE3ya+E=; - b=KmRh3W6zCTnYhuu2uLwH/71AGZzl5TVUrtsNnUP5zXTmGsYrVbcqdtCu+MA/r0Ndp0 - Swx6K5/Y1yzZuona+ojX9pyfPH0vSgmsnPUuGuK8IgKoxke8pbVIOMVO1oHB4MFfbJr9 - MZQ2DHsaZhnv+oABy231/ZNYVnut1uI8HXMoZE64GkKDaX0oTm6VD5IWp6Pjb9e4CCS2 - 4l6LRlV0GkUZbtfNu7oRMgYKOcOBXuCtbtOCopiW839uMoofW0liroJ2wElyPDiAsF2j - ZEKcyiLmzwxANf1QRl8D0H0t207nTseUwQuoJ0fGq2geu1GyW7/GzRuxYm66v/+UUfVJ - Ti/g== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=gpspQZzCiCtLDF1mE2Rbzp3OUWg7vlq70C4xLE3ya+E=; - b=Ct8NkvISAcd2F1onRi9j8wNBQ3yVS4sMkwQThZBmSai4nt0pCzUW6MSInM6la2RQ+6 - Iyk/Q6V/4/M1AEzJ1CIyUOjtskptWB7g9JCLcYDV67l3e3cym3CfKO6faANsjcNo61aE - cGyF+8I3UwoMP2XkhiX8e+sh+JyAVS+7v6ah2jAK3rMcN9Qy3pRUpTzse16anYIvPXmH - D/n6XDiuVtka4xdvtVrXH1Ovj7jTQyu5zNSeDpYUHIIuY5HyyWlwP2GqOXO5+3ztetSe - lqHq/pwTeg5OaKzyo1/S4u5j63+cCDsRbst48LWqqY7iSJl7Jqjh9IcuciM5gwWyKVQq - exXg== -X-Gm-Message-State: AJIora8Djp7T6fvZwj7nFJ1nTHsOTMleXrE/THizuhZy3oXIgXemxG6T - WRLpIC0iL2d+my0UEmLvbYJe1kwX4xc= -X-Google-Smtp-Source: - AGRyM1tcS0bhkovBqaAMGcBTFG0LXet+IyIY3UhCyBJaxouYWPrdgSATtWZUnD1044Cxo6jW3UsFPLIHGBk= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:73d1:0:b0:66e:aee4:feb3 with SMTP id - o200-20020a2573d1000000b0066eaee4feb3mr1925521ybc.452.1657144871215; Wed, 06 - Jul 2022 15:01:11 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:22 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-14-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 13/14] mm: multi-gen LRU: admin guide -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144872; a=rsa-sha256; - cv=none; - b=JsWQytQvs1ZknqPoqD3qo1TJldLBGiKSTga/ejO8CyQYViqdXml7nvJD7fQyRxXf/sYCeO - o91ZwxjqHFV+Qk45x3ZWpVnbVz5s7Ub1LlWxdnj2ACxVxDi2i4I70KlZDYV1V7+0DLXiwM - Cf5UnSo0xArYOHGQTNNAa/beRpM+U2U= -ARC-Authentication-Results: i=1; - imf19.hostedemail.com; - dkim=pass 
header.d=google.com header.s=20210112 header.b=KmRh3W6z; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf19.hostedemail.com: domain of - 3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144872; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - in-reply-to:in-reply-to:references:references:dkim-signature; - bh=gpspQZzCiCtLDF1mE2Rbzp3OUWg7vlq70C4xLE3ya+E=; - b=7SORwSc3XuCDIhY4Nnt3155Fml8B5PM7q+cxyDoyzRH9f30a8JT7kTOzO43GZtqu1vi0gx - ZvOQWsmLsXdrJ4He9F7TCEfWwHvTKJw2xq5RY+ztHvYdkw0u4ntOGqKfhRRpSNYMieV4f5 - IIb7Tz/BoxO7bW/vk+Bjj4szoFKZSXU= -X-Rspamd-Server: rspam04 -X-Rspam-User: -Authentication-Results: imf19.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=KmRh3W6z; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf19.hostedemail.com: domain of - 3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com -X-Stat-Signature: a3k84bgjbfr9z8g5wse9kf3mp6fodfft -X-Rspamd-Queue-Id: E1D811A004A -X-HE-Tag: 1657144871-897432 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Add an admin guide. - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain -Reviewed-by: Bagas Sanjaya ---- - Documentation/admin-guide/mm/index.rst | 1 + - Documentation/admin-guide/mm/multigen_lru.rst | 156 ++++++++++++++++++ - mm/Kconfig | 3 +- - mm/vmscan.c | 4 + - 4 files changed, 163 insertions(+), 1 deletion(-) - create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst - -diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst -index c21b5823f126..2cf5bae62036 100644 ---- a/Documentation/admin-guide/mm/index.rst -+++ b/Documentation/admin-guide/mm/index.rst -@@ -32,6 +32,7 @@ the Linux memory management. - idle_page_tracking - ksm - memory-hotplug -+ multigen_lru - nommu-mmap - numa_memory_policy - numaperf -diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst -new file mode 100644 -index 000000000000..6355f2b5019d ---- /dev/null -+++ b/Documentation/admin-guide/mm/multigen_lru.rst -@@ -0,0 +1,156 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+============= -+Multi-Gen LRU -+============= -+The multi-gen LRU is an alternative LRU implementation that optimizes -+page reclaim and improves performance under memory pressure. Page -+reclaim decides the kernel's caching policy and ability to overcommit -+memory. It directly impacts the kswapd CPU usage and RAM efficiency. -+ -+Quick start -+=========== -+Build the kernel with the following configurations. 
-+ -+* ``CONFIG_LRU_GEN=y`` -+* ``CONFIG_LRU_GEN_ENABLED=y`` -+ -+All set! -+ -+Runtime options -+=============== -+``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the -+following subsections. -+ -+Kill switch -+----------- -+``enabled`` accepts different values to enable or disable the -+following components. Its default value depends on -+``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled -+unless some of them have unforeseen side effects. Writing to -+``enabled`` has no effect when a component is not supported by the -+hardware, and valid values will be accepted even when the main switch -+is off. -+ -+====== =============================================================== -+Values Components -+====== =============================================================== -+0x0001 The main switch for the multi-gen LRU. -+0x0002 Clearing the accessed bit in leaf page table entries in large -+ batches, when MMU sets it (e.g., on x86). This behavior can -+ theoretically worsen lock contention (mmap_lock). If it is -+ disabled, the multi-gen LRU will suffer a minor performance -+ degradation for workloads that contiguously map hot pages, -+ whose accessed bits can be otherwise cleared by fewer larger -+ batches. -+0x0004 Clearing the accessed bit in non-leaf page table entries as -+ well, when MMU sets it (e.g., on x86). This behavior was not -+ verified on x86 varieties other than Intel and AMD. If it is -+ disabled, the multi-gen LRU will suffer a negligible -+ performance degradation. -+[yYnN] Apply to all the components above. -+====== =============================================================== -+ -+E.g., -+:: -+ -+ echo y >/sys/kernel/mm/lru_gen/enabled -+ cat /sys/kernel/mm/lru_gen/enabled -+ 0x0007 -+ echo 5 >/sys/kernel/mm/lru_gen/enabled -+ cat /sys/kernel/mm/lru_gen/enabled -+ 0x0005 -+ -+Thrashing prevention -+-------------------- -+Personal computers are more sensitive to thrashing because it can -+cause janks (lags when rendering UI) and negatively impact user -+experience. The multi-gen LRU offers thrashing prevention to the -+majority of laptop and desktop users who do not have ``oomd``. -+ -+Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of -+``N`` milliseconds from getting evicted. The OOM killer is triggered -+if this working set cannot be kept in memory. In other words, this -+option works as an adjustable pressure relief valve, and when open, it -+terminates applications that are hopefully not being used. -+ -+Based on the average human detectable lag (~100ms), ``N=1000`` usually -+eliminates intolerable janks due to thrashing. Larger values like -+``N=3000`` make janks less noticeable at the risk of premature OOM -+kills. -+ -+The default value ``0`` means disabled. -+ -+Experimental features -+===================== -+``/sys/kernel/debug/lru_gen`` accepts commands described in the -+following subsections. Multiple command lines are supported, so does -+concatenation with delimiters ``,`` and ``;``. -+ -+``/sys/kernel/debug/lru_gen_full`` provides additional stats for -+debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from -+evicted generations in this file. -+ -+Working set estimation -+---------------------- -+Working set estimation measures how much memory an application needs -+in a given time interval, and it is usually done with little impact on -+the performance of the application. E.g., data centers want to -+optimize job scheduling (bin packing) to improve memory utilizations. 
-+When a new job comes in, the job scheduler needs to find out whether -+each server it manages can allocate a certain amount of memory for -+this new job before it can pick a candidate. To do so, the job -+scheduler needs to estimate the working sets of the existing jobs. -+ -+When it is read, ``lru_gen`` returns a histogram of numbers of pages -+accessed over different time intervals for each memcg and node. -+``MAX_NR_GENS`` decides the number of bins for each histogram. The -+histograms are noncumulative. -+:: -+ -+ memcg memcg_id memcg_path -+ node node_id -+ min_gen_nr age_in_ms nr_anon_pages nr_file_pages -+ ... -+ max_gen_nr age_in_ms nr_anon_pages nr_file_pages -+ -+Each bin contains an estimated number of pages that have been accessed -+within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages -+and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of -+the former is the largest and that of the latter is the smallest. -+ -+Users can write ``+ memcg_id node_id max_gen_nr -+[can_swap [force_scan]]`` to ``lru_gen`` to create a new generation -+``max_gen_nr+1``. ``can_swap`` defaults to the swap setting and, if it -+is set to ``1``, it forces the scan of anon pages when swap is off, -+and vice versa. ``force_scan`` defaults to ``1`` and, if it is set to -+``0``, it employs heuristics to reduce the overhead, which is likely -+to reduce the coverage as well. -+ -+A typical use case is that a job scheduler writes to ``lru_gen`` at a -+certain time interval to create new generations, and it ranks the -+servers it manages based on the sizes of their cold pages defined by -+this time interval. -+ -+Proactive reclaim -+----------------- -+Proactive reclaim induces page reclaim when there is no memory -+pressure. It usually targets cold pages only. E.g., when a new job -+comes in, the job scheduler wants to proactively reclaim cold pages on -+the server it selected to improve the chance of successfully landing -+this new job. -+ -+Users can write ``- memcg_id node_id min_gen_nr [swappiness -+[nr_to_reclaim]]`` to ``lru_gen`` to evict generations less than or -+equal to ``min_gen_nr``. Note that ``min_gen_nr`` should be less than -+``max_gen_nr-1`` as ``max_gen_nr`` and ``max_gen_nr-1`` are not fully -+aged and therefore cannot be evicted. ``swappiness`` overrides the -+default value in ``/proc/sys/vm/swappiness``. ``nr_to_reclaim`` limits -+the number of pages to evict. -+ -+A typical use case is that a job scheduler writes to ``lru_gen`` -+before it tries to land a new job on a server. If it fails to -+materialize enough cold pages because of the overestimation, it -+retries on the next server according to the ranking result obtained -+from the working set estimation step. This less forceful approach -+limits the impacts on the existing jobs. -diff --git a/mm/Kconfig b/mm/Kconfig -index 0c2ef0af0036..a0f7b6e66410 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -1137,7 +1137,8 @@ config LRU_GEN - # make sure folio->flags has enough spare bits - depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP - help -- A high performance LRU implementation to overcommit memory. -+ A high performance LRU implementation to overcommit memory. See -+ Documentation/admin-guide/mm/multigen_lru.rst for details. 
- - config LRU_GEN_ENABLED - bool "Enable by default" -diff --git a/mm/vmscan.c b/mm/vmscan.c -index fbcd298adca7..7096ff7836db 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -5209,6 +5209,7 @@ static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, c - return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); - } - -+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ - static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) - { -@@ -5242,6 +5243,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c - return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); - } - -+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ - static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) - { -@@ -5389,6 +5391,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, - seq_putc(m, '\n'); - } - -+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ - static int lru_gen_seq_show(struct seq_file *m, void *v) - { - unsigned long seq; -@@ -5547,6 +5550,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, - return err; - } - -+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ - static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, - size_t len, loff_t *pos) - { - -From patchwork Wed Jul 6 22:00:23 2022 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 8bit -X-Patchwork-Submitter: Yu Zhao -X-Patchwork-Id: 12908712 -Return-Path: -X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on - aws-us-west-2-korg-lkml-1.web.codeaurora.org -Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) - by smtp.lore.kernel.org (Postfix) with ESMTP id 6E4E6C43334 - for ; Wed, 6 Jul 2022 22:01:33 +0000 (UTC) -Received: by kanga.kvack.org (Postfix) - id 59C1C8E000B; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) -Received: by kanga.kvack.org (Postfix, from userid 40) - id 5235B8E0001; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) -X-Delivered-To: int-list-linux-mm@kvack.org -Received: by kanga.kvack.org (Postfix, from userid 63042) - id 350398E000B; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) -X-Delivered-To: linux-mm@kvack.org -Received: from relay.hostedemail.com (smtprelay0012.hostedemail.com - [216.40.44.12]) - by kanga.kvack.org (Postfix) with ESMTP id 225F58E0001 - for ; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) -Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) - by unirelay08.hostedemail.com (Postfix) with ESMTP id EE41D2169C - for ; Wed, 6 Jul 2022 22:01:13 +0000 (UTC) -X-FDA: 79658046426.31.47294E4 -Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com - [209.85.219.201]) - by imf01.hostedemail.com (Postfix) with ESMTP id 5513940016 - for ; Wed, 6 Jul 2022 22:01:13 +0000 (UTC) -Received: by mail-yb1-f201.google.com with SMTP id - a8-20020a25a188000000b0066839c45fe8so12515135ybi.17 - for ; Wed, 06 Jul 2022 15:01:13 -0700 (PDT) -DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=google.com; s=20210112; - h=date:in-reply-to:message-id:mime-version:references:subject:from:to - :cc:content-transfer-encoding; - bh=Y7M5+uMCyjK2Tw3gtvlFnf3s0uMKtiqOOKU+iupOzGc=; - b=RaJYVCw6kQFWZr57Fj6Z+M7CjIu+Fy2mkXaD9icGpAKOAxyz1uufDA95qkMfXqksCy - CttyIsR4+X5trkDvd0W5HTI3/XFLKoLEsiRSAv23qebNkIOkH8cPlNd2JsU/+DVzJUpM - 
TGOZ6teMB/sFPIH8IZKMODnpg+VxKIyScGqlsqOiDoxcPPCMP8e0zolM240kI1HmhYsj - WxZdSDL+OZnX2V8pTDz516/mmCsEM23W0x65TiLdKDGOIFAAkNP/EIcvQWWj8SBUz/dL - a0IGdBEhZobBNts8S/4QPXOFk1zc9TBNhY+OPo4y5YJG3duUWWVQ+373DmVdZPluRI23 - DgVQ== -X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; - d=1e100.net; s=20210112; - h=x-gm-message-state:date:in-reply-to:message-id:mime-version - :references:subject:from:to:cc:content-transfer-encoding; - bh=Y7M5+uMCyjK2Tw3gtvlFnf3s0uMKtiqOOKU+iupOzGc=; - b=t8KFMI+odj2H0PYsSR514rWxJS26xzV7AKq6CAfD49T5kfUjPz8wfdAfySr/kBsGFX - Ijo9N8v2aDQNSOqwxiy6N0WnrzD6bgFtRWeblglDP9rnKXQmG38PpjQrFbGbWRu0JwOP - V4GhBBsVBqsEbP6lV54mE8LL+AX6orjSmsdYgGuR7py2ze/69AI+KXkU4wuGGk7f02J6 - NOorMZZljVWHawNiYzwJ/nSCIEDP1RdLxj/QR1X2gsT6fGY0XqrePFMti1n8UBr5vGzF - qDM3r6uoPM0Dl89KQfjhANf8jyajCPr0wd7Ldc2REEmnDU12jZhd2cV3sTQEKMbtFvsH - RiDA== -X-Gm-Message-State: AJIora9JyuHh+WKBn43isO3BKSkb8MvQEqp82Y/5Bs0mEkxpSgPDJzSM - HtWXZ+iDc2EVNjhmgizIp3qSZYJgkRM= -X-Google-Smtp-Source: - AGRyM1tPyG6w7lg37p0dKVbMplDSUgwZboH2lG42opEnpdXZgbjOhtWD7cZCMHKO+sLemtrKnNTphNyTinE= -X-Received: from yuzhao.bld.corp.google.com - ([2620:15c:183:200:b89c:e10a:466e:cf7d]) - (user=yuzhao job=sendgmr) by 2002:a25:b806:0:b0:663:d35d:8b8a with SMTP id - v6-20020a25b806000000b00663d35d8b8amr45647399ybj.69.1657144872662; Wed, 06 - Jul 2022 15:01:12 -0700 (PDT) -Date: Wed, 6 Jul 2022 16:00:23 -0600 -In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> -Message-Id: <20220706220022.968789-15-yuzhao@google.com> -Mime-Version: 1.0 -References: <20220706220022.968789-1-yuzhao@google.com> -X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog -Subject: [PATCH v13 14/14] mm: multi-gen LRU: design doc -From: Yu Zhao -To: Andrew Morton -Cc: Andi Kleen , - Aneesh Kumar , - Catalin Marinas , - Dave Hansen , Hillf Danton , - Jens Axboe , Johannes Weiner , - Jonathan Corbet , - Linus Torvalds , - Matthew Wilcox , Mel Gorman , - Michael Larabel , - Michal Hocko , Mike Rapoport , - Peter Zijlstra , Tejun Heo , - Vlastimil Babka , Will Deacon , - linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, - linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, - page-reclaim@google.com, Yu Zhao , - Brian Geffon , - Jan Alexander Steffens , - Oleksandr Natalenko , - Steven Barrett , - Suleiman Souhlal , Daniel Byrne , - Donald Carr , - " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , - Konstantin Kharlamov , - Shuang Zhai , Sofia Trinh , - Vaibhav Jain -ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144873; a=rsa-sha256; - cv=none; - b=UrrRpXp7KWnXHmjT/QxuJ33LiGsO02xp/Gl5IKp2przZQE/MN2oPkN0qvS6FM/HpuayBLm - zd3wW1kYV7c+CYfLpUIs4G8pg9A6gNyLzycabKZPgoBu+fqMU04tsshxN75CQVnnpFeUVh - ZD4xhdIcppi7j9nVM9IcKC/45QGbnp4= -ARC-Authentication-Results: i=1; - imf01.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=RaJYVCw6; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf01.hostedemail.com: domain of - 3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com -ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; - d=hostedemail.com; - s=arc-20220608; t=1657144873; - h=from:from:sender:reply-to:subject:subject:date:date: - message-id:message-id:to:to:cc:cc:mime-version:mime-version: - content-type:content-type: - content-transfer-encoding:content-transfer-encoding: - 
in-reply-to:in-reply-to:references:references:dkim-signature; - bh=Y7M5+uMCyjK2Tw3gtvlFnf3s0uMKtiqOOKU+iupOzGc=; - b=CC8ORwOmRVo1ysrsxcLM/w/OQsNgtHVDsWXjTolVPaVGtsBAmORZs9mo/t9qQJXlTbpE6W - MK4e1j+KxvgzJ4hEk7FEh4udfXbo/i2Zs4SIAS1fMWoE8oSUqdpISvSeaeM8m9OTpSMv9b - y/YSdGTLFiLWNyHM+yI8Q6QaQPpR8FA= -X-Rspamd-Server: rspam04 -X-Rspam-User: -Authentication-Results: imf01.hostedemail.com; - dkim=pass header.d=google.com header.s=20210112 header.b=RaJYVCw6; - dmarc=pass (policy=reject) header.from=google.com; - spf=pass (imf01.hostedemail.com: domain of - 3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com - designates 209.85.219.201 as permitted sender) - smtp.mailfrom=3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com -X-Stat-Signature: gkifem6ym4fgtjcteqxerconsisp8cqt -X-Rspamd-Queue-Id: 5513940016 -X-HE-Tag: 1657144873-85540 -X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 -Sender: owner-linux-mm@kvack.org -Precedence: bulk -X-Loop: owner-majordomo@kvack.org -List-ID: - -Add a design doc. - -Signed-off-by: Yu Zhao -Acked-by: Brian Geffon -Acked-by: Jan Alexander Steffens (heftig) -Acked-by: Oleksandr Natalenko -Acked-by: Steven Barrett -Acked-by: Suleiman Souhlal -Tested-by: Daniel Byrne -Tested-by: Donald Carr -Tested-by: Holger Hoffstätte -Tested-by: Konstantin Kharlamov -Tested-by: Shuang Zhai -Tested-by: Sofia Trinh -Tested-by: Vaibhav Jain -Reviewed-by: Bagas Sanjaya ---- - Documentation/vm/index.rst | 1 + - Documentation/vm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++ - 2 files changed, 160 insertions(+) - create mode 100644 Documentation/vm/multigen_lru.rst - -diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst -index 575ccd40e30c..4aa12b8be278 100644 ---- a/Documentation/vm/index.rst -+++ b/Documentation/vm/index.rst -@@ -51,6 +51,7 @@ above structured documentation, or deleted if it has served its purpose. - ksm - memory-model - mmu_notifier -+ multigen_lru - numa - overcommit-accounting - page_migration -diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst -new file mode 100644 -index 000000000000..d7062c6a8946 ---- /dev/null -+++ b/Documentation/vm/multigen_lru.rst -@@ -0,0 +1,159 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+============= -+Multi-Gen LRU -+============= -+The multi-gen LRU is an alternative LRU implementation that optimizes -+page reclaim and improves performance under memory pressure. Page -+reclaim decides the kernel's caching policy and ability to overcommit -+memory. It directly impacts the kswapd CPU usage and RAM efficiency. -+ -+Design overview -+=============== -+Objectives -+---------- -+The design objectives are: -+ -+* Good representation of access recency -+* Try to profit from spatial locality -+* Fast paths to make obvious choices -+* Simple self-correcting heuristics -+ -+The representation of access recency is at the core of all LRU -+implementations. In the multi-gen LRU, each generation represents a -+group of pages with similar access recency. Generations establish a -+(time-based) common frame of reference and therefore help make better -+choices, e.g., between different memcgs on a computer or different -+computers in a data center (for job scheduling). -+ -+Exploiting spatial locality improves efficiency when gathering the -+accessed bit. A rmap walk targets a single page and does not try to -+profit from discovering a young PTE. 
A page table walk can sweep all -+the young PTEs in an address space, but the address space can be too -+sparse to make a profit. The key is to optimize both methods and use -+them in combination. -+ -+Fast paths reduce code complexity and runtime overhead. Unmapped pages -+do not require TLB flushes; clean pages do not require writeback. -+These facts are only helpful when other conditions, e.g., access -+recency, are similar. With generations as a common frame of reference, -+additional factors stand out. But obvious choices might not be good -+choices; thus self-correction is necessary. -+ -+The benefits of simple self-correcting heuristics are self-evident. -+Again, with generations as a common frame of reference, this becomes -+attainable. Specifically, pages in the same generation can be -+categorized based on additional factors, and a feedback loop can -+statistically compare the refault percentages across those categories -+and infer which of them are better choices. -+ -+Assumptions -+----------- -+The protection of hot pages and the selection of cold pages are based -+on page access channels and patterns. There are two access channels: -+ -+* Accesses through page tables -+* Accesses through file descriptors -+ -+The protection of the former channel is by design stronger because: -+ -+1. The uncertainty in determining the access patterns of the former -+ channel is higher due to the approximation of the accessed bit. -+2. The cost of evicting the former channel is higher due to the TLB -+ flushes required and the likelihood of encountering the dirty bit. -+3. The penalty of underprotecting the former channel is higher because -+ applications usually do not prepare themselves for major page -+ faults like they do for blocked I/O. E.g., GUI applications -+ commonly use dedicated I/O threads to avoid blocking rendering -+ threads. -+ -+There are also two access patterns: -+ -+* Accesses exhibiting temporal locality -+* Accesses not exhibiting temporal locality -+ -+For the reasons listed above, the former channel is assumed to follow -+the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is -+present, and the latter channel is assumed to follow the latter -+pattern unless outlying refaults have been observed. -+ -+Workflow overview -+================= -+Evictable pages are divided into multiple generations for each -+``lruvec``. The youngest generation number is stored in -+``lrugen->max_seq`` for both anon and file types as they are aged on -+an equal footing. The oldest generation numbers are stored in -+``lrugen->min_seq[]`` separately for anon and file types as clean file -+pages can be evicted regardless of swap constraints. These three -+variables are monotonically increasing. -+ -+Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` -+bits in order to fit into the gen counter in ``folio->flags``. Each -+truncated generation number is an index to ``lrugen->lists[]``. The -+sliding window technique is used to track at least ``MIN_NR_GENS`` and -+at most ``MAX_NR_GENS`` generations. The gen counter stores a value -+within ``[1, MAX_NR_GENS]`` while a page is on one of -+``lrugen->lists[]``; otherwise it stores zero. -+ -+Each generation is divided into multiple tiers. A page accessed ``N`` -+times through file descriptors is in tier ``order_base_2(N)``. Unlike -+generations, tiers do not have dedicated ``lrugen->lists[]``. 
In -+contrast to moving across generations, which requires the LRU lock, -+moving across tiers only involves atomic operations on -+``folio->flags`` and therefore has a negligible cost. A feedback loop -+modeled after the PID controller monitors refaults over all the tiers -+from anon and file types and decides which tiers from which types to -+evict or protect. -+ -+There are two conceptually independent procedures: the aging and the -+eviction. They form a closed-loop system, i.e., the page reclaim. -+ -+Aging -+----- -+The aging produces young generations. Given an ``lruvec``, it -+increments ``max_seq`` when ``max_seq-min_seq+1`` approaches -+``MIN_NR_GENS``. The aging promotes hot pages to the youngest -+generation when it finds them accessed through page tables; the -+demotion of cold pages happens consequently when it increments -+``max_seq``. The aging uses page table walks and rmap walks to find -+young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list`` -+and calls ``walk_page_range()`` with each ``mm_struct`` on this list -+to scan PTEs, and after each iteration, it increments ``max_seq``. For -+the latter, when the eviction walks the rmap and finds a young PTE, -+the aging scans the adjacent PTEs. For both, on finding a young PTE, -+the aging clears the accessed bit and updates the gen counter of the -+page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. -+ -+Eviction -+-------- -+The eviction consumes old generations. Given an ``lruvec``, it -+increments ``min_seq`` when ``lrugen->lists[]`` indexed by -+``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to -+evict from, it first compares ``min_seq[]`` to select the older type. -+If both types are equally old, it selects the one whose first tier has -+a lower refault percentage. The first tier contains single-use -+unmapped clean pages, which are the best bet. The eviction sorts a -+page according to its gen counter if the aging has found this page -+accessed through page tables and updated its gen counter. It also -+moves a page to the next generation, i.e., ``min_seq+1``, if this page -+was accessed multiple times through file descriptors and the feedback -+loop has detected outlying refaults from the tier this page is in. To -+this end, the feedback loop uses the first tier as the baseline, for -+the reason stated earlier. -+ -+Summary -+------- -+The multi-gen LRU can be disassembled into the following parts: -+ -+* Generations -+* Rmap walks -+* Page table walks -+* Bloom filters -+* PID controller -+ -+The aging and the eviction form a producer-consumer model; -+specifically, the latter drives the former by the sliding window over -+generations. Within the aging, rmap walks drive page table walks by -+inserting hot densely populated page tables to the Bloom filters. -+Within the eviction, the PID controller uses refaults as the feedback -+to select types to evict and tiers to protect. 
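As a quick illustration of the bookkeeping the design doc above describes, here is a
minimal userspace sketch. It is NOT the kernel's mm/ implementation: the constants,
the toy_lruvec struct and the helper names are made up for illustration. It only
models how a generation sequence number is truncated into a list index, how an access
count maps to a tier via order_base_2(), and how the aging and the eviction move the
[min_seq, max_seq] window.

/*
 * Toy model of the multi-gen LRU bookkeeping described above.
 * NOT kernel code: constants, struct and helper names are illustrative.
 */
#include <stdio.h>

#define MIN_NR_GENS  2U
#define MAX_NR_GENS  4U
#define MAX_NR_TIERS 4U

struct toy_lruvec {
	unsigned long max_seq;	/* youngest generation number */
	unsigned long min_seq;	/* oldest generation number (one type only) */
};

/* Truncated generation number, i.e. the index into lrugen->lists[]. */
static unsigned long gen_index(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

/* order_base_2(n): smallest order with 2^order >= n; 0 for n <= 1. */
static unsigned int order_base_2_toy(unsigned int n)
{
	unsigned int order = 0;

	while ((1U << order) < n)
		order++;
	return order;
}

/* A page accessed n times through file descriptors sits in tier order_base_2(n). */
static unsigned int tier_of(unsigned int accesses)
{
	unsigned int tier = order_base_2_toy(accesses);

	return tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;
}

/* Aging: produce a younger generation when the window approaches MIN_NR_GENS. */
static void age(struct toy_lruvec *v)
{
	if (v->max_seq - v->min_seq + 1 <= MIN_NR_GENS)
		v->max_seq++;
}

/* Eviction: retire the oldest generation once its list has been drained
 * (and more than MIN_NR_GENS generations remain). */
static void evict(struct toy_lruvec *v, int oldest_list_empty)
{
	if (oldest_list_empty && v->max_seq - v->min_seq + 1 > MIN_NR_GENS)
		v->min_seq++;
}

int main(void)
{
	struct toy_lruvec v = { .max_seq = 5, .min_seq = 3 };

	printf("list index of max_seq=%lu: %lu\n", v.max_seq, gen_index(v.max_seq));
	printf("tier after 1 access: %u, after 4 accesses: %u\n", tier_of(1), tier_of(4));

	evict(&v, 1);	/* the eviction consumes the oldest generation */
	age(&v);	/* the aging produces a new youngest generation */
	printf("generation window: [%lu, %lu]\n", v.min_seq, v.max_seq);
	return 0;
}

The real implementation additionally packs the gen counter into folio->flags and keeps
per-type, per-zone lists plus the refault feedback loop, all of which this sketch leaves out.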
diff --git a/sys-kernel/pinephone-pro-sources/files/config-ppp b/sys-kernel/pinephone-sources/files/config-ppp similarity index 100% rename from sys-kernel/pinephone-pro-sources/files/config-ppp rename to sys-kernel/pinephone-sources/files/config-ppp diff --git a/sys-kernel/pinephone-pro-sources/files/config-ppp-old b/sys-kernel/pinephone-sources/files/config-ppp-old similarity index 100% rename from sys-kernel/pinephone-pro-sources/files/config-ppp-old rename to sys-kernel/pinephone-sources/files/config-ppp-old diff --git a/sys-kernel/pinephone-pro-sources/files/dracut-ppp.conf b/sys-kernel/pinephone-sources/files/dracut-ppp.conf similarity index 100% rename from sys-kernel/pinephone-pro-sources/files/dracut-ppp.conf rename to sys-kernel/pinephone-sources/files/dracut-ppp.conf diff --git a/sys-kernel/pinephone-sources/files/pp-keyboard.patch b/sys-kernel/pinephone-sources/files/pp-keyboard.patch deleted file mode 100644 index a8e818e..0000000 --- a/sys-kernel/pinephone-sources/files/pp-keyboard.patch +++ /dev/null @@ -1,176 +0,0 @@ -From d1d849cae12db71aa81ceedaedc1b17a34790367 Mon Sep 17 00:00:00 2001 -From: Samuel Holland -Date: Sat, 19 Jun 2021 18:36:05 -0500 -Subject: [PATCH] Input: kb151 - Add a driver for the KB151 keyboard - -This keyboard is found in the official Pine64 PinePhone keyboard case. -It is connected over I2C and runs a libre firmware. - -Signed-off-by: Samuel Holland ---- - .../dts/allwinner/sun50i-a64-pinephone.dtsi | 64 +++++ - drivers/input/keyboard/Kconfig | 10 + - drivers/input/keyboard/Makefile | 1 + - drivers/input/keyboard/kb151.c | 246 ++++++++++++++++++ - 4 files changed, 321 insertions(+) - create mode 100644 drivers/input/keyboard/kb151.c - -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -index 4ede9fe66020c..0bdc6eceec609 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -@@ -551,6 +551,70 @@ - /* Connected to pogo pins (external spring based pinheader for user addons) */ - &i2c2 { - status = "okay"; -+ -+ keyboard@15 { -+ compatible = "pine64,kb151"; -+ reg = <0x15>; -+ interrupt-parent = <&r_pio>; -+ interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ -+ keypad,num-rows = <6>; -+ keypad,num-columns = <12>; -+ linux,keymap = ; -+ wakeup-source; -+ }; - }; - - &i2s2 { -diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig -index 40a070a2e7f5b..0259e9133f469 100644 ---- a/drivers/input/keyboard/Kconfig -+++ b/drivers/input/keyboard/Kconfig -@@ -353,6 +353,16 @@ config KEYBOARD_HP7XX - To compile this driver as a module, choose M here: the - module will be called jornada720_kbd. - -+config KEYBOARD_KB151 -+ tristate "Pine64 KB151 Keyboard" -+ depends on I2C -+ select CRC8 -+ select INPUT_MATRIXKMAP -+ help -+ Say Y here to enable support for the KB151 keyboard used in the -+ Pine64 PinePhone keyboard case. 
This driver supports the FLOSS -+ firmware available at https://megous.com/git/pinephone-keyboard/ -+ - config KEYBOARD_LM8323 - tristate "LM8323 keypad chip" - depends on I2C -From 2423aac2d6f5db55da99e11fd799ee66fe6f54c6 Mon Sep 17 00:00:00 2001 -From: Samuel Holland -Date: Mon, 9 Aug 2021 19:30:18 -0500 -Subject: [PATCH] Input: kb151 - Add support for the FN layer - -Signed-off-by: Samuel Holland ---- - .../dts/allwinner/sun50i-a64-pinephone.dtsi | 34 +++++++++++++++++-- - drivers/input/keyboard/kb151.c | 33 ++++++++++-------- - 2 files changed, 51 insertions(+), 16 deletions(-) - -diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -index 0bdc6eceec609..68f5730cf164c 100644 ---- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi -@@ -557,7 +557,7 @@ - reg = <0x15>; - interrupt-parent = <&r_pio>; - interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ -- keypad,num-rows = <6>; -+ keypad,num-rows = <12>; - keypad,num-columns = <12>; - linux,keymap = ; -+ MATRIX_KEY(5, 5, KEY_RIGHTALT) -+ -+ /* FN layer */ -+ MATRIX_KEY(6, 1, KEY_BACKSLASH) -+ MATRIX_KEY(6, 2, KEY_BACKSLASH) -+ MATRIX_KEY(6, 3, KEY_DOLLAR) -+ MATRIX_KEY(6, 4, KEY_EURO) -+ MATRIX_KEY(6, 5, KEY_GRAVE) -+ MATRIX_KEY(6, 6, KEY_GRAVE) -+ MATRIX_KEY(6, 7, KEY_MINUS) -+ MATRIX_KEY(6, 8, KEY_EQUAL) -+ MATRIX_KEY(6, 9, KEY_MINUS) -+ MATRIX_KEY(6, 10, KEY_EQUAL) -+ MATRIX_KEY(6, 11, KEY_DELETE) -+ -+ MATRIX_KEY(8, 0, KEY_SYSRQ) -+ MATRIX_KEY(8, 10, KEY_INSERT) -+ -+ MATRIX_KEY(9, 0, KEY_LEFTSHIFT) -+ MATRIX_KEY(9, 8, KEY_HOME) -+ MATRIX_KEY(9, 9, KEY_UP) -+ MATRIX_KEY(9, 10, KEY_END) -+ -+ MATRIX_KEY(10, 1, KEY_LEFTCTRL) -+ MATRIX_KEY(10, 6, KEY_LEFT) -+ MATRIX_KEY(10, 8, KEY_RIGHT) -+ MATRIX_KEY(10, 9, KEY_DOWN) -+ -+ MATRIX_KEY(11, 2, KEY_FN) -+ MATRIX_KEY(11, 3, KEY_LEFTALT) -+ MATRIX_KEY(11, 5, KEY_RIGHTALT)>; - wakeup-source; - }; - }; diff --git a/sys-kernel/pinephone-sources/pinephone-sources-5.19.0.ebuild b/sys-kernel/pinephone-sources/pinephone-sources-5.19.12.ebuild similarity index 71% rename from sys-kernel/pinephone-sources/pinephone-sources-5.19.0.ebuild rename to sys-kernel/pinephone-sources/pinephone-sources-5.19.12.ebuild index 650a9b2..010ff7a 100644 --- a/sys-kernel/pinephone-sources/pinephone-sources-5.19.0.ebuild +++ b/sys-kernel/pinephone-sources/pinephone-sources-5.19.12.ebuild @@ -17,7 +17,7 @@ DEPEND="${RDEPEND} DESCRIPTION="Full sources for the Linux kernel, with megi's patch for pinephone and gentoo patchset" -MEGI_TAG="orange-pi-5.19-20220802-0940" +MEGI_TAG="orange-pi-5.19-20220909-1622" SRC_URI="https://github.com/megous/linux/archive/${MEGI_TAG}.tar.gz" PATCHES=( @@ -35,18 +35,20 @@ PATCHES=( ${FILESDIR}/5021_BMQ-and-PDS-gentoo-defaults.patch #PinePhone Patches - ${FILESDIR}/0101-arm64-dts-pinephone-drop-modem-power-node.patch ${FILESDIR}/0102-arm64-dts-pinephone-pro-remove-modem-node.patch + ${FILESDIR}/0103-arm64-dts-rk3399-pinephone-pro-add-modem-RI-pin.patch ${FILESDIR}/0103-ccu-sun50i-a64-reparent-clocks-to-lower-speed-oscillator.patch + ${FILESDIR}/0104-PPP-Add-reset-resume-to-usb_wwan.patch ${FILESDIR}/0104-quirk-kernel-org-bug-210681-firmware_rome_error.patch + ${FILESDIR}/0104-Revert-usb-quirks-Add-USB_QUIRK_RESET-for-Quectel-EG25G.patch + ${FILESDIR}/0104-rk818_charger-use-type-battery-again.patch ${FILESDIR}/0105-leds-gpio-make-max_brightness-configurable.patch - ${FILESDIR}/0106-panic-led.patch + ${FILESDIR}/0106-panic-led.patch + 
${FILESDIR}/0106-sound-rockchip-i2s-Dont-disable-mclk-on-suspend.patch
+	${FILESDIR}/0201-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch
+	${FILESDIR}/0202-revert-fbcon-remove-no-op-fbcon_set_origin.patch
+	${FILESDIR}/0203-revert-fbcon-remove-soft-scrollback-code.patch
 
-	# keyboard
-	${FILESDIR}/pp-keyboard.patch
-
-	# LRU
-	${FILESDIR}/Multi-Gen-LRU-Framework.patch
 )
 
 S="${WORKDIR}/linux-${MEGI_TAG}"
@@ -61,18 +63,17 @@ src_prepare() {
 }
 
 pkg_postinst() {
-	kernel-2_pkg_postinst
 	kernel-2_pkg_postinst
 	einfo "To build and install the kernel use the following commands:"
 	einfo "# make Image modules"
 	einfo "# make DTC_FLAGS="-@" dtbs"
 	einfo "# cp arch/arm64/boot/Image /boot"
-	einfo "# make INSTALL_MOD_PATH=/usr modules_install"
+	einfo "# make INSTALL_MOD_PATH=/ modules_install"
 	einfo "# make INSTALL_DTBS_PATH=/boot/dtbs dtbs_install"
 	einfo "You will need to create an initramfs afterwards."
 	einfo "If you use dracut you can run:"
-	einfo "# dracut -m \"rootfs-block base\" --host-only --kver 5.19.0-gentoo-arm64"
-	einfo "Change 5.19.0-gentoo-arm64 to your kernel version installed in /lib/modules"
+	einfo "# dracut -m \"rootfs-block base\" --host-only --kver 5.19.12-pinephone-gentoo-arm64"
+	einfo "Change 5.19.12-pinephone-gentoo-arm64 to your kernel version installed in /lib/modules"
 }
 
 pkg_postrm() {