From 424b902587b69eea82eb5096587300fbecb3019f Mon Sep 17 00:00:00 2001 From: Gerben Jan Dijkman Date: Wed, 8 Mar 2023 11:27:01 +0100 Subject: [PATCH] Version bump --- sys-kernel/pinephone-sources/Manifest | 2 +- .../files/1003_linux-6.1.4.patch | 8792 -------------- ...ink-security-restrictions-by-default.patch | 17 + ...020_BMQ-and-PDS-io-scheduler-v6.1-r0.patch | 10076 ---------------- ...hed-alt-missing-rq-lock-irq-function.patch | 30 - ....ebuild => pinephone-sources-6.2.2.ebuild} | 6 +- 6 files changed, 20 insertions(+), 18903 deletions(-) delete mode 100644 sys-kernel/pinephone-sources/files/1003_linux-6.1.4.patch create mode 100644 sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch delete mode 100644 sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v6.1-r0.patch delete mode 100644 sys-kernel/pinephone-sources/files/5021_sched-alt-missing-rq-lock-irq-function.patch rename sys-kernel/pinephone-sources/{pinephone-sources-6.1.4.ebuild => pinephone-sources-6.2.2.ebuild} (92%) diff --git a/sys-kernel/pinephone-sources/Manifest b/sys-kernel/pinephone-sources/Manifest index fc54382..8ee36f3 100644 --- a/sys-kernel/pinephone-sources/Manifest +++ b/sys-kernel/pinephone-sources/Manifest @@ -1 +1 @@ -DIST orange-pi-6.1-20230104-1712.tar.gz 223562139 BLAKE2B 635525429db3599691dba21ac86f40492f0cb2d5060bbfcc32cd789ac2379593008b09de0dc40a189481e4e033404546aad6bbd9b32214a09c390f738d4410cf SHA512 4795a261e1016c9d5d4c11cfee43d8cd020b2c2cfed50b87840dba28b8aaee236e533c6eef582ccbbdef65fc36dd7381189c2b0bc0da0922f1282ade8252d75f +DIST orange-pi-6.2-20230307-1859.tar.gz 225749266 BLAKE2B 6afa4a274e04aaedbd31d8185a6c373c9379dfee3414b8a0006307caa30a22fab4616b655cc47a8c14e149ed7d2adf56251a569a7bb762d6f50ac2a26bdb18c9 SHA512 b1be97c723aa2781aa83d0464bfe5f8364a96e3c2f519986730f84984f93ab567c4c4a9d02a422881da1f1e59c40550562f4d1e28818b384acdb0824f2ee049e diff --git a/sys-kernel/pinephone-sources/files/1003_linux-6.1.4.patch b/sys-kernel/pinephone-sources/files/1003_linux-6.1.4.patch deleted file mode 100644 index 894610d..0000000 --- a/sys-kernel/pinephone-sources/files/1003_linux-6.1.4.patch +++ /dev/null @@ -1,8792 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 42af9ca0127e5..6b838869554b1 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2300,7 +2300,13 @@ - Provide an override to the IOAPIC-ID<->DEVICE-ID - mapping provided in the IVRS ACPI table. - By default, PCI segment is 0, and can be omitted. -- For example: -+ -+ For example, to map IOAPIC-ID decimal 10 to -+ PCI segment 0x1 and PCI device 00:14.0, -+ write the parameter as: -+ ivrs_ioapic=10@0001:00:14.0 -+ -+ Deprecated formats: - * To map IOAPIC-ID decimal 10 to PCI device 00:14.0 - write the parameter as: - ivrs_ioapic[10]=00:14.0 -@@ -2312,7 +2318,13 @@ - Provide an override to the HPET-ID<->DEVICE-ID - mapping provided in the IVRS ACPI table. - By default, PCI segment is 0, and can be omitted. -- For example: -+ -+ For example, to map HPET-ID decimal 10 to -+ PCI segment 0x1 and PCI device 00:14.0, -+ write the parameter as: -+ ivrs_hpet=10@0001:00:14.0 -+ -+ Deprecated formats: - * To map HPET-ID decimal 0 to PCI device 00:14.0 - write the parameter as: - ivrs_hpet[0]=00:14.0 -@@ -2323,15 +2335,20 @@ - ivrs_acpihid [HW,X86-64] - Provide an override to the ACPI-HID:UID<->DEVICE-ID - mapping provided in the IVRS ACPI table. -+ By default, PCI segment is 0, and can be omitted. - - For example, to map UART-HID:UID AMD0020:0 to - PCI segment 0x1 and PCI device ID 00:14.5, - write the parameter as: -- ivrs_acpihid[0001:00:14.5]=AMD0020:0 -+ ivrs_acpihid=AMD0020:0@0001:00:14.5 - -- By default, PCI segment is 0, and can be omitted. -- For example, PCI device 00:14.5 write the parameter as: -+ Deprecated formats: -+ * To map UART-HID:UID AMD0020:0 to PCI segment is 0, -+ PCI device ID 00:14.5, write the parameter as: - ivrs_acpihid[00:14.5]=AMD0020:0 -+ * To map UART-HID:UID AMD0020:0 to PCI segment 0x1 and -+ PCI device ID 00:14.5, write the parameter as: -+ ivrs_acpihid[0001:00:14.5]=AMD0020:0 - - js= [HW,JOY] Analog joystick - See Documentation/input/joydev/joystick.rst. -diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst -index eb358a00be279..1d16787a00e95 100644 ---- a/Documentation/filesystems/mount_api.rst -+++ b/Documentation/filesystems/mount_api.rst -@@ -814,6 +814,7 @@ process the parameters it is given. - int fs_lookup_param(struct fs_context *fc, - struct fs_parameter *value, - bool want_bdev, -+ unsigned int flags, - struct path *_path); - - This takes a parameter that carries a string or filename type and attempts -diff --git a/Makefile b/Makefile -index a69d14983a489..56afd1509c74f 100644 ---- a/Makefile -+++ b/Makefile -@@ -1,7 +1,7 @@ - # SPDX-License-Identifier: GPL-2.0 - VERSION = 6 - PATCHLEVEL = 1 --SUBLEVEL = 3 -+SUBLEVEL = 4 - EXTRAVERSION = - NAME = Hurr durr I'ma ninja sloth - -diff --git a/arch/arm/nwfpe/Makefile b/arch/arm/nwfpe/Makefile -index 303400fa2cdf7..2aec85ab1e8b9 100644 ---- a/arch/arm/nwfpe/Makefile -+++ b/arch/arm/nwfpe/Makefile -@@ -11,3 +11,9 @@ nwfpe-y += fpa11.o fpa11_cpdo.o fpa11_cpdt.o \ - entry.o - - nwfpe-$(CONFIG_FPE_NWFPE_XP) += extended_cpdo.o -+ -+# Try really hard to avoid generating calls to __aeabi_uldivmod() from -+# float64_rem() due to loop elision. -+ifdef CONFIG_CC_IS_CLANG -+CFLAGS_softfloat.o += -mllvm -replexitval=never -+endif -diff --git a/arch/arm64/boot/dts/mediatek/mt8195-demo.dts b/arch/arm64/boot/dts/mediatek/mt8195-demo.dts -index 4fbd99eb496a2..dec85d2548384 100644 ---- a/arch/arm64/boot/dts/mediatek/mt8195-demo.dts -+++ b/arch/arm64/boot/dts/mediatek/mt8195-demo.dts -@@ -56,10 +56,10 @@ - #size-cells = <2>; - ranges; - -- /* 192 KiB reserved for ARM Trusted Firmware (BL31) */ -+ /* 2 MiB reserved for ARM Trusted Firmware (BL31) */ - bl31_secmon_reserved: secmon@54600000 { - no-map; -- reg = <0 0x54600000 0x0 0x30000>; -+ reg = <0 0x54600000 0x0 0x200000>; - }; - - /* 12 MiB reserved for OP-TEE (BL32) -diff --git a/arch/arm64/boot/dts/qcom/sc8280xp.dtsi b/arch/arm64/boot/dts/qcom/sc8280xp.dtsi -index 212d63d5cbf28..9f2a136d5cbc5 100644 ---- a/arch/arm64/boot/dts/qcom/sc8280xp.dtsi -+++ b/arch/arm64/boot/dts/qcom/sc8280xp.dtsi -@@ -855,12 +855,13 @@ - required-opps = <&rpmhpd_opp_nom>; - - iommus = <&apps_smmu 0xe0 0x0>; -+ dma-coherent; - - clocks = <&gcc GCC_UFS_PHY_AXI_CLK>, - <&gcc GCC_AGGRE_UFS_PHY_AXI_CLK>, - <&gcc GCC_UFS_PHY_AHB_CLK>, - <&gcc GCC_UFS_PHY_UNIPRO_CORE_CLK>, -- <&rpmhcc RPMH_CXO_CLK>, -+ <&gcc GCC_UFS_REF_CLKREF_CLK>, - <&gcc GCC_UFS_PHY_TX_SYMBOL_0_CLK>, - <&gcc GCC_UFS_PHY_RX_SYMBOL_0_CLK>, - <&gcc GCC_UFS_PHY_RX_SYMBOL_1_CLK>; -@@ -891,7 +892,7 @@ - ranges; - clock-names = "ref", - "ref_aux"; -- clocks = <&gcc GCC_UFS_REF_CLKREF_CLK>, -+ clocks = <&gcc GCC_UFS_CARD_CLKREF_CLK>, - <&gcc GCC_UFS_PHY_PHY_AUX_CLK>; - - resets = <&ufs_mem_hc 0>; -@@ -923,12 +924,13 @@ - power-domains = <&gcc UFS_CARD_GDSC>; - - iommus = <&apps_smmu 0x4a0 0x0>; -+ dma-coherent; - - clocks = <&gcc GCC_UFS_CARD_AXI_CLK>, - <&gcc GCC_AGGRE_UFS_CARD_AXI_CLK>, - <&gcc GCC_UFS_CARD_AHB_CLK>, - <&gcc GCC_UFS_CARD_UNIPRO_CORE_CLK>, -- <&rpmhcc RPMH_CXO_CLK>, -+ <&gcc GCC_UFS_REF_CLKREF_CLK>, - <&gcc GCC_UFS_CARD_TX_SYMBOL_0_CLK>, - <&gcc GCC_UFS_CARD_RX_SYMBOL_0_CLK>, - <&gcc GCC_UFS_CARD_RX_SYMBOL_1_CLK>; -@@ -959,7 +961,7 @@ - ranges; - clock-names = "ref", - "ref_aux"; -- clocks = <&gcc GCC_UFS_REF_CLKREF_CLK>, -+ clocks = <&gcc GCC_UFS_1_CARD_CLKREF_CLK>, - <&gcc GCC_UFS_CARD_PHY_AUX_CLK>; - - resets = <&ufs_card_hc 0>; -diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts -index 132417e2d11e5..a3e15dedd60cb 100644 ---- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts -+++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts -@@ -1123,7 +1123,10 @@ - - /* PINCTRL - additions to nodes defined in sdm845.dtsi */ - &qup_spi2_default { -- drive-strength = <16>; -+ pinconf { -+ pins = "gpio27", "gpio28", "gpio29", "gpio30"; -+ drive-strength = <16>; -+ }; - }; - - &qup_uart3_default{ -diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts -index be59a8ba9c1fe..74f43da51fa50 100644 ---- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts -+++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts -@@ -487,8 +487,10 @@ - }; - - &qup_i2c12_default { -- drive-strength = <2>; -- bias-disable; -+ pinmux { -+ drive-strength = <2>; -+ bias-disable; -+ }; - }; - - &qup_uart6_default { -diff --git a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts -index f954fe5cb61ab..d028a7eb364a6 100644 ---- a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts -+++ b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts -@@ -415,8 +415,10 @@ - }; - - &qup_i2c12_default { -- drive-strength = <2>; -- bias-disable; -+ pinmux { -+ drive-strength = <2>; -+ bias-disable; -+ }; - }; - - &qup_uart6_default { -diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c -index 634279b3b03d1..117e2c180f3c7 100644 ---- a/arch/arm64/kernel/stacktrace.c -+++ b/arch/arm64/kernel/stacktrace.c -@@ -23,8 +23,8 @@ - * - * The regs must be on a stack currently owned by the calling task. - */ --static inline void unwind_init_from_regs(struct unwind_state *state, -- struct pt_regs *regs) -+static __always_inline void unwind_init_from_regs(struct unwind_state *state, -+ struct pt_regs *regs) - { - unwind_init_common(state, current); - -@@ -58,8 +58,8 @@ static __always_inline void unwind_init_from_caller(struct unwind_state *state) - * duration of the unwind, or the unwind will be bogus. It is never valid to - * call this for the current task. - */ --static inline void unwind_init_from_task(struct unwind_state *state, -- struct task_struct *task) -+static __always_inline void unwind_init_from_task(struct unwind_state *state, -+ struct task_struct *task) - { - unwind_init_common(state, task); - -@@ -186,7 +186,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) - : stackinfo_get_unknown(); \ - }) - --noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, -+noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, - void *cookie, struct task_struct *task, - struct pt_regs *regs) - { -diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h -index ecd0288544698..68ae77069d23f 100644 ---- a/arch/parisc/include/asm/pgtable.h -+++ b/arch/parisc/include/asm/pgtable.h -@@ -166,8 +166,8 @@ extern void __update_cache(pte_t pte); - - /* This calculates the number of initial pages we need for the initial - * page tables */ --#if (KERNEL_INITIAL_ORDER) >= (PMD_SHIFT) --# define PT_INITIAL (1 << (KERNEL_INITIAL_ORDER - PMD_SHIFT)) -+#if (KERNEL_INITIAL_ORDER) >= (PLD_SHIFT + BITS_PER_PTE) -+# define PT_INITIAL (1 << (KERNEL_INITIAL_ORDER - PLD_SHIFT - BITS_PER_PTE)) - #else - # define PT_INITIAL (1) /* all initial PTEs fit into one page */ - #endif -diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c -index 6a7e315bcc2e5..a115315d88e69 100644 ---- a/arch/parisc/kernel/firmware.c -+++ b/arch/parisc/kernel/firmware.c -@@ -1288,9 +1288,8 @@ void pdc_io_reset_devices(void) - - #endif /* defined(BOOTLOADER) */ - --/* locked by pdc_console_lock */ --static int __attribute__((aligned(8))) iodc_retbuf[32]; --static char __attribute__((aligned(64))) iodc_dbuf[4096]; -+/* locked by pdc_lock */ -+static char iodc_dbuf[4096] __page_aligned_bss; - - /** - * pdc_iodc_print - Console print using IODC. -@@ -1307,6 +1306,9 @@ int pdc_iodc_print(const unsigned char *str, unsigned count) - unsigned int i; - unsigned long flags; - -+ count = min_t(unsigned int, count, sizeof(iodc_dbuf)); -+ -+ spin_lock_irqsave(&pdc_lock, flags); - for (i = 0; i < count;) { - switch(str[i]) { - case '\n': -@@ -1322,12 +1324,11 @@ int pdc_iodc_print(const unsigned char *str, unsigned count) - } - - print: -- spin_lock_irqsave(&pdc_lock, flags); -- real32_call(PAGE0->mem_cons.iodc_io, -- (unsigned long)PAGE0->mem_cons.hpa, ENTRY_IO_COUT, -- PAGE0->mem_cons.spa, __pa(PAGE0->mem_cons.dp.layers), -- __pa(iodc_retbuf), 0, __pa(iodc_dbuf), i, 0); -- spin_unlock_irqrestore(&pdc_lock, flags); -+ real32_call(PAGE0->mem_cons.iodc_io, -+ (unsigned long)PAGE0->mem_cons.hpa, ENTRY_IO_COUT, -+ PAGE0->mem_cons.spa, __pa(PAGE0->mem_cons.dp.layers), -+ __pa(pdc_result), 0, __pa(iodc_dbuf), i, 0); -+ spin_unlock_irqrestore(&pdc_lock, flags); - - return i; - } -@@ -1354,10 +1355,11 @@ int pdc_iodc_getc(void) - real32_call(PAGE0->mem_kbd.iodc_io, - (unsigned long)PAGE0->mem_kbd.hpa, ENTRY_IO_CIN, - PAGE0->mem_kbd.spa, __pa(PAGE0->mem_kbd.dp.layers), -- __pa(iodc_retbuf), 0, __pa(iodc_dbuf), 1, 0); -+ __pa(pdc_result), 0, __pa(iodc_dbuf), 1, 0); - - ch = *iodc_dbuf; -- status = *iodc_retbuf; -+ /* like convert_to_wide() but for first return value only: */ -+ status = *(int *)&pdc_result; - spin_unlock_irqrestore(&pdc_lock, flags); - - if (status == 0) -diff --git a/arch/parisc/kernel/kgdb.c b/arch/parisc/kernel/kgdb.c -index ab7620f695be1..b16fa9bac5f44 100644 ---- a/arch/parisc/kernel/kgdb.c -+++ b/arch/parisc/kernel/kgdb.c -@@ -208,23 +208,3 @@ int kgdb_arch_handle_exception(int trap, int signo, - } - return -1; - } -- --/* KGDB console driver which uses PDC to read chars from keyboard */ -- --static void kgdb_pdc_write_char(u8 chr) --{ -- /* no need to print char. kgdb will do it. */ --} -- --static struct kgdb_io kgdb_pdc_io_ops = { -- .name = "kgdb_pdc", -- .read_char = pdc_iodc_getc, -- .write_char = kgdb_pdc_write_char, --}; -- --static int __init kgdb_pdc_init(void) --{ -- kgdb_register_io_module(&kgdb_pdc_io_ops); -- return 0; --} --early_initcall(kgdb_pdc_init); -diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c -index 7d0989f523d03..cf3bf82323746 100644 ---- a/arch/parisc/kernel/pdc_cons.c -+++ b/arch/parisc/kernel/pdc_cons.c -@@ -12,37 +12,27 @@ - #include /* for PAGE0 */ - #include /* for iodc_call() proto and friends */ - --static DEFINE_SPINLOCK(pdc_console_lock); -- - static void pdc_console_write(struct console *co, const char *s, unsigned count) - { - int i = 0; -- unsigned long flags; - -- spin_lock_irqsave(&pdc_console_lock, flags); - do { - i += pdc_iodc_print(s + i, count - i); - } while (i < count); -- spin_unlock_irqrestore(&pdc_console_lock, flags); - } - - #ifdef CONFIG_KGDB - static int kgdb_pdc_read_char(void) - { -- int c; -- unsigned long flags; -- -- spin_lock_irqsave(&pdc_console_lock, flags); -- c = pdc_iodc_getc(); -- spin_unlock_irqrestore(&pdc_console_lock, flags); -+ int c = pdc_iodc_getc(); - - return (c <= 0) ? NO_POLL_CHAR : c; - } - - static void kgdb_pdc_write_char(u8 chr) - { -- if (PAGE0->mem_cons.cl_class != CL_DUPLEX) -- pdc_console_write(NULL, &chr, 1); -+ /* no need to print char as it's shown on standard console */ -+ /* pdc_iodc_print(&chr, 1); */ - } - - static struct kgdb_io kgdb_pdc_io_ops = { -diff --git a/arch/parisc/kernel/vdso32/Makefile b/arch/parisc/kernel/vdso32/Makefile -index 85b1c6d261d12..4459a48d23033 100644 ---- a/arch/parisc/kernel/vdso32/Makefile -+++ b/arch/parisc/kernel/vdso32/Makefile -@@ -26,7 +26,7 @@ $(obj)/vdso32_wrapper.o : $(obj)/vdso32.so FORCE - - # Force dependency (incbin is bad) - # link rule for the .so file, .lds has to be first --$(obj)/vdso32.so: $(src)/vdso32.lds $(obj-vdso32) $(obj-cvdso32) $(VDSO_LIBGCC) -+$(obj)/vdso32.so: $(src)/vdso32.lds $(obj-vdso32) $(obj-cvdso32) $(VDSO_LIBGCC) FORCE - $(call if_changed,vdso32ld) - - # assembly rules for the .S files -@@ -38,7 +38,7 @@ $(obj-cvdso32): %.o: %.c FORCE - - # actual build commands - quiet_cmd_vdso32ld = VDSO32L $@ -- cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ -+ cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $(filter-out FORCE, $^) -o $@ - quiet_cmd_vdso32as = VDSO32A $@ - cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $< - quiet_cmd_vdso32cc = VDSO32C $@ -diff --git a/arch/parisc/kernel/vdso64/Makefile b/arch/parisc/kernel/vdso64/Makefile -index a30f5ec5eb4bf..f3d6045793f4c 100644 ---- a/arch/parisc/kernel/vdso64/Makefile -+++ b/arch/parisc/kernel/vdso64/Makefile -@@ -26,7 +26,7 @@ $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so FORCE - - # Force dependency (incbin is bad) - # link rule for the .so file, .lds has to be first --$(obj)/vdso64.so: $(src)/vdso64.lds $(obj-vdso64) $(VDSO_LIBGCC) -+$(obj)/vdso64.so: $(src)/vdso64.lds $(obj-vdso64) $(VDSO_LIBGCC) FORCE - $(call if_changed,vdso64ld) - - # assembly rules for the .S files -@@ -35,7 +35,7 @@ $(obj-vdso64): %.o: %.S FORCE - - # actual build commands - quiet_cmd_vdso64ld = VDSO64L $@ -- cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ -+ cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter-out FORCE, $^) -o $@ - quiet_cmd_vdso64as = VDSO64A $@ - cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< - -diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h -index 3cee7115441b4..e3d1f377bc5b5 100644 ---- a/arch/powerpc/include/asm/ftrace.h -+++ b/arch/powerpc/include/asm/ftrace.h -@@ -64,17 +64,6 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, - * those. - */ - #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME --#ifdef CONFIG_PPC64_ELF_ABI_V1 --static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) --{ -- /* We need to skip past the initial dot, and the __se_sys alias */ -- return !strcmp(sym + 1, name) || -- (!strncmp(sym, ".__se_sys", 9) && !strcmp(sym + 6, name)) || -- (!strncmp(sym, ".ppc_", 5) && !strcmp(sym + 5, name + 4)) || -- (!strncmp(sym, ".ppc32_", 7) && !strcmp(sym + 7, name + 4)) || -- (!strncmp(sym, ".ppc64_", 7) && !strcmp(sym + 7, name + 4)); --} --#else - static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) - { - return !strcmp(sym, name) || -@@ -83,7 +72,6 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name - (!strncmp(sym, "ppc32_", 6) && !strcmp(sym + 6, name + 4)) || - (!strncmp(sym, "ppc64_", 6) && !strcmp(sym + 6, name + 4)); - } --#endif /* CONFIG_PPC64_ELF_ABI_V1 */ - #endif /* CONFIG_FTRACE_SYSCALLS */ - - #if defined(CONFIG_PPC64) && defined(CONFIG_FUNCTION_TRACER) -diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig -index 593cf09264d80..8e5fd56820189 100644 ---- a/arch/riscv/Kconfig -+++ b/arch/riscv/Kconfig -@@ -502,7 +502,7 @@ config KEXEC_FILE - select KEXEC_CORE - select KEXEC_ELF - select HAVE_IMA_KEXEC if IMA -- depends on 64BIT -+ depends on 64BIT && MMU - help - This is new version of kexec system call. This system call is - file based and takes file descriptors as system call argument -diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h -index eee260e8ab308..2b56769cb530c 100644 ---- a/arch/riscv/include/asm/kexec.h -+++ b/arch/riscv/include/asm/kexec.h -@@ -39,6 +39,7 @@ crash_setup_regs(struct pt_regs *newregs, - #define ARCH_HAS_KIMAGE_ARCH - - struct kimage_arch { -+ void *fdt; /* For CONFIG_KEXEC_FILE */ - unsigned long fdt_addr; - }; - -@@ -62,6 +63,10 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); - #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add -+ -+struct kimage; -+int arch_kimage_file_post_load_cleanup(struct kimage *image); -+#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup - #endif - - #endif -diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h -index 0099dc1161683..5ff1f19fd45c2 100644 ---- a/arch/riscv/include/asm/mmu.h -+++ b/arch/riscv/include/asm/mmu.h -@@ -19,6 +19,8 @@ typedef struct { - #ifdef CONFIG_SMP - /* A local icache flush is needed before user execution can resume. */ - cpumask_t icache_stale_mask; -+ /* A local tlb flush is needed before user execution can resume. */ -+ cpumask_t tlb_stale_mask; - #endif - } mm_context_t; - -diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h -index 92ec2d9d7273f..ec6fb83349ced 100644 ---- a/arch/riscv/include/asm/pgtable.h -+++ b/arch/riscv/include/asm/pgtable.h -@@ -415,7 +415,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, - * Relying on flush_tlb_fix_spurious_fault would suffice, but - * the extra traps reduce performance. So, eagerly SFENCE.VMA. - */ -- local_flush_tlb_page(address); -+ flush_tlb_page(vma, address); - } - - static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, -diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h -index 801019381dea3..907b9efd39a87 100644 ---- a/arch/riscv/include/asm/tlbflush.h -+++ b/arch/riscv/include/asm/tlbflush.h -@@ -22,6 +22,24 @@ static inline void local_flush_tlb_page(unsigned long addr) - { - ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory")); - } -+ -+static inline void local_flush_tlb_all_asid(unsigned long asid) -+{ -+ __asm__ __volatile__ ("sfence.vma x0, %0" -+ : -+ : "r" (asid) -+ : "memory"); -+} -+ -+static inline void local_flush_tlb_page_asid(unsigned long addr, -+ unsigned long asid) -+{ -+ __asm__ __volatile__ ("sfence.vma %0, %1" -+ : -+ : "r" (addr), "r" (asid) -+ : "memory"); -+} -+ - #else /* CONFIG_MMU */ - #define local_flush_tlb_all() do { } while (0) - #define local_flush_tlb_page(addr) do { } while (0) -diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c -index 0cb94992c15b3..5372b708fae21 100644 ---- a/arch/riscv/kernel/elf_kexec.c -+++ b/arch/riscv/kernel/elf_kexec.c -@@ -21,6 +21,18 @@ - #include - #include - -+int arch_kimage_file_post_load_cleanup(struct kimage *image) -+{ -+ kvfree(image->arch.fdt); -+ image->arch.fdt = NULL; -+ -+ vfree(image->elf_headers); -+ image->elf_headers = NULL; -+ image->elf_headers_sz = 0; -+ -+ return kexec_image_post_load_cleanup_default(image); -+} -+ - static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, - struct kexec_elf_info *elf_info, unsigned long old_pbase, - unsigned long new_pbase) -@@ -298,6 +310,8 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, - pr_err("Error add DTB kbuf ret=%d\n", ret); - goto out_free_fdt; - } -+ /* Cache the fdt buffer address for memory cleanup */ -+ image->arch.fdt = fdt; - pr_notice("Loaded device tree at 0x%lx\n", kbuf.mem); - goto out; - -diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c -index 08d11a53f39e7..bcfe9eb55f80f 100644 ---- a/arch/riscv/kernel/stacktrace.c -+++ b/arch/riscv/kernel/stacktrace.c -@@ -58,7 +58,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, - } else { - fp = frame->fp; - pc = ftrace_graph_ret_addr(current, NULL, frame->ra, -- (unsigned long *)(fp - 8)); -+ &frame->ra); - } - - } -diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c -index 7acbfbd14557e..80ce9caba8d22 100644 ---- a/arch/riscv/mm/context.c -+++ b/arch/riscv/mm/context.c -@@ -196,6 +196,16 @@ switch_mm_fast: - - if (need_flush_tlb) - local_flush_tlb_all(); -+#ifdef CONFIG_SMP -+ else { -+ cpumask_t *mask = &mm->context.tlb_stale_mask; -+ -+ if (cpumask_test_cpu(cpu, mask)) { -+ cpumask_clear_cpu(cpu, mask); -+ local_flush_tlb_all_asid(cntx & asid_mask); -+ } -+ } -+#endif - } - - static void set_mm_noasid(struct mm_struct *mm) -diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c -index 37ed760d007c3..ce7dfc81bb3fe 100644 ---- a/arch/riscv/mm/tlbflush.c -+++ b/arch/riscv/mm/tlbflush.c -@@ -5,23 +5,7 @@ - #include - #include - #include -- --static inline void local_flush_tlb_all_asid(unsigned long asid) --{ -- __asm__ __volatile__ ("sfence.vma x0, %0" -- : -- : "r" (asid) -- : "memory"); --} -- --static inline void local_flush_tlb_page_asid(unsigned long addr, -- unsigned long asid) --{ -- __asm__ __volatile__ ("sfence.vma %0, %1" -- : -- : "r" (addr), "r" (asid) -- : "memory"); --} -+#include - - void flush_tlb_all(void) - { -@@ -31,6 +15,7 @@ void flush_tlb_all(void) - static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, - unsigned long size, unsigned long stride) - { -+ struct cpumask *pmask = &mm->context.tlb_stale_mask; - struct cpumask *cmask = mm_cpumask(mm); - unsigned int cpuid; - bool broadcast; -@@ -44,6 +29,15 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, - if (static_branch_unlikely(&use_asid_allocator)) { - unsigned long asid = atomic_long_read(&mm->context.id); - -+ /* -+ * TLB will be immediately flushed on harts concurrently -+ * executing this MM context. TLB flush on other harts -+ * is deferred until this MM context migrates there. -+ */ -+ cpumask_setall(pmask); -+ cpumask_clear_cpu(cpuid, pmask); -+ cpumask_andnot(pmask, pmask, cmask); -+ - if (broadcast) { - sbi_remote_sfence_vma_asid(cmask, start, size, asid); - } else if (size <= stride) { -diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c -index acb55b302b14c..3ac220dafec4a 100644 ---- a/arch/um/drivers/virt-pci.c -+++ b/arch/um/drivers/virt-pci.c -@@ -97,7 +97,8 @@ static int um_pci_send_cmd(struct um_pci_device *dev, - } - - buf = get_cpu_var(um_pci_msg_bufs); -- memcpy(buf, cmd, cmd_size); -+ if (buf) -+ memcpy(buf, cmd, cmd_size); - - if (posted) { - u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); -@@ -182,6 +183,7 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; -+ size_t bytes = sizeof(buf->data); - - if (!dev) - return ULONG_MAX; -@@ -189,7 +191,8 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; - -- memset(buf->data, 0xff, sizeof(buf->data)); -+ if (buf) -+ memset(data, 0xff, bytes); - - switch (size) { - case 1: -@@ -204,7 +207,7 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, - goto out; - } - -- if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, 8)) -+ if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes)) - goto out; - - switch (size) { -diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h -index 2adeaf4de4df6..b363fddc2a89e 100644 ---- a/arch/x86/events/intel/uncore.h -+++ b/arch/x86/events/intel/uncore.h -@@ -2,6 +2,7 @@ - #include - #include - #include -+#include - #include - - #include -diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c -index fcd95e93f479a..8f371f3cbbd24 100644 ---- a/arch/x86/events/intel/uncore_snbep.c -+++ b/arch/x86/events/intel/uncore_snbep.c -@@ -3804,6 +3804,21 @@ static const struct attribute_group *skx_iio_attr_update[] = { - NULL, - }; - -+static void pmu_clear_mapping_attr(const struct attribute_group **groups, -+ struct attribute_group *ag) -+{ -+ int i; -+ -+ for (i = 0; groups[i]; i++) { -+ if (groups[i] == ag) { -+ for (i++; groups[i]; i++) -+ groups[i - 1] = groups[i]; -+ groups[i - 1] = NULL; -+ break; -+ } -+ } -+} -+ - static int - pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) - { -@@ -3852,7 +3867,7 @@ clear_attrs: - clear_topology: - kfree(type->topology); - clear_attr_update: -- type->attr_update = NULL; -+ pmu_clear_mapping_attr(type->attr_update, ag); - return ret; - } - -@@ -5144,6 +5159,11 @@ static int icx_iio_get_topology(struct intel_uncore_type *type) - - static int icx_iio_set_mapping(struct intel_uncore_type *type) - { -+ /* Detect ICX-D system. This case is not supported */ -+ if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) { -+ pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group); -+ return -EPERM; -+ } - return pmu_iio_set_mapping(type, &icx_iio_mapping_group); - } - -diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c -index 1c87501e0fa3d..10fb5b5c9efa4 100644 ---- a/arch/x86/kernel/cpu/mce/amd.c -+++ b/arch/x86/kernel/cpu/mce/amd.c -@@ -788,6 +788,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) - return status & MCI_STATUS_DEFERRED; - } - -+static bool _log_error_deferred(unsigned int bank, u32 misc) -+{ -+ if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), -+ mca_msr_reg(bank, MCA_ADDR), misc)) -+ return false; -+ -+ /* -+ * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. -+ * Return true here to avoid accessing these registers. -+ */ -+ if (!mce_flags.smca) -+ return true; -+ -+ /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ -+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); -+ return true; -+} -+ - /* - * We have three scenarios for checking for Deferred errors: - * -@@ -799,19 +817,8 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) - */ - static void log_error_deferred(unsigned int bank) - { -- bool defrd; -- -- defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), -- mca_msr_reg(bank, MCA_ADDR), 0); -- -- if (!mce_flags.smca) -- return; -- -- /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ -- if (defrd) { -- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); -+ if (_log_error_deferred(bank, 0)) - return; -- } - - /* - * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check -@@ -832,7 +839,7 @@ static void amd_deferred_error_interrupt(void) - - static void log_error_thresholding(unsigned int bank, u64 misc) - { -- _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); -+ _log_error_deferred(bank, misc); - } - - static void log_and_reset_block(struct threshold_block *block) -diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c -index 1fcbd671f1dff..048e38ec99e71 100644 ---- a/arch/x86/kernel/cpu/microcode/intel.c -+++ b/arch/x86/kernel/cpu/microcode/intel.c -@@ -621,7 +621,6 @@ void load_ucode_intel_ap(void) - else - iup = &intel_ucode_patch; - --reget: - if (!*iup) { - patch = __load_ucode_intel(&uci); - if (!patch) -@@ -632,12 +631,7 @@ reget: - - uci.mc = *iup; - -- if (apply_microcode_early(&uci, true)) { -- /* Mixed-silicon system? Try to refetch the proper patch: */ -- *iup = NULL; -- -- goto reget; -- } -+ apply_microcode_early(&uci, true); - } - - static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) -diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c -index 59e543b95a3c6..c2dde46a538e7 100644 ---- a/arch/x86/kernel/fpu/xstate.c -+++ b/arch/x86/kernel/fpu/xstate.c -@@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void) - } - } - --#define XSTATE_WARN_ON(x) do { \ -- if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ -+#define XSTATE_WARN_ON(x, fmt, ...) do { \ -+ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ - __xstate_dump_leaves(); \ - } \ - } while (0) -@@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr) - (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { -- WARN_ONCE(1, "no structure for xstate: %d\n", nr); -- XSTATE_WARN_ON(1); -+ XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); - return false; - } - return true; -@@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) - * XSAVES. - */ - if (!xsaves && xfeature_is_supervisor(i)) { -- XSTATE_WARN_ON(1); -+ XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); - return false; - } - } - size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); -- XSTATE_WARN_ON(size != kernel_size); -+ XSTATE_WARN_ON(size != kernel_size, -+ "size %u != kernel_size %u\n", size, kernel_size); - return size == kernel_size; - } - -diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c -index bd165004776d9..e07234ec7e237 100644 ---- a/arch/x86/kernel/ftrace.c -+++ b/arch/x86/kernel/ftrace.c -@@ -217,7 +217,9 @@ void ftrace_replace_code(int enable) - - ret = ftrace_verify_code(rec->ip, old); - if (ret) { -+ ftrace_expected = old; - ftrace_bug(ret, rec); -+ ftrace_expected = NULL; - return; - } - } -diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c -index eb8bc82846b99..5be7f23099e1f 100644 ---- a/arch/x86/kernel/kprobes/core.c -+++ b/arch/x86/kernel/kprobes/core.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr) - if (ret < 0) - return 0; - -+#ifdef CONFIG_KGDB - /* -- * Another debugging subsystem might insert this breakpoint. -- * In that case, we can't recover it. -+ * If there is a dynamically installed kgdb sw breakpoint, -+ * this function should not be probed. - */ -- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) -+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && -+ kgdb_has_hit_break(addr)) - return 0; -+#endif - addr += insn.length; - } - -diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c -index e6b8c5362b945..e57e07b0edb64 100644 ---- a/arch/x86/kernel/kprobes/opt.c -+++ b/arch/x86/kernel/kprobes/opt.c -@@ -15,6 +15,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn) - return ret; - } - --static bool is_padding_int3(unsigned long addr, unsigned long eaddr) --{ -- unsigned char ops; -- -- for (; addr < eaddr; addr++) { -- if (get_kernel_nofault(ops, (void *)addr) < 0 || -- ops != INT3_INSN_OPCODE) -- return false; -- } -- -- return true; --} -- - /* Decode whole function to ensure any instructions don't jump into target */ - static int can_optimize(unsigned long paddr) - { -@@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr) - ret = insn_decode_kernel(&insn, (void *)recovered_insn); - if (ret < 0) - return 0; -- -+#ifdef CONFIG_KGDB - /* -- * In the case of detecting unknown breakpoint, this could be -- * a padding INT3 between functions. Let's check that all the -- * rest of the bytes are also INT3. -+ * If there is a dynamically installed kgdb sw breakpoint, -+ * this function should not be probed. - */ -- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) -- return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; -- -+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && -+ kgdb_has_hit_break(addr)) -+ return 0; -+#endif - /* Recover address */ - insn.kaddr = (void *)addr; - insn.next_byte = (void *)(addr + insn.length); -diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c -index d7639d126e6c7..bf5ce862c4daf 100644 ---- a/arch/x86/kvm/lapic.c -+++ b/arch/x86/kvm/lapic.c -@@ -2722,8 +2722,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); - } -- } else { -- kvm_lapic_xapic_id_updated(vcpu->arch.apic); - } - - return 0; -@@ -2759,6 +2757,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) - } - memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); - -+ if (!apic_x2apic_mode(apic)) -+ kvm_lapic_xapic_id_updated(apic); -+ - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); - kvm_recalculate_apic_map(vcpu->kvm); - kvm_apic_set_version(vcpu); -diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c -index 5b0d4859e4b78..10c63b1bf92fa 100644 ---- a/arch/x86/kvm/vmx/nested.c -+++ b/arch/x86/kvm/vmx/nested.c -@@ -5100,24 +5100,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) - | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; - - /* -- * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks -- * that have higher priority than VM-Exit (see Intel SDM's pseudocode -- * for VMXON), as KVM must load valid CR0/CR4 values into hardware while -- * running the guest, i.e. KVM needs to check the _guest_ values. -+ * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter -+ * the guest and so cannot rely on hardware to perform the check, -+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode -+ * for VMXON). - * -- * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and -- * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real -- * Mode, but KVM will never take the guest out of those modes. -+ * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 -+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't -+ * force any of the relevant guest state. For a restricted guest, KVM -+ * does force CR0.PE=1, but only to also force VM86 in order to emulate -+ * Real Mode, and so there's no need to check CR0.PE manually. - */ -- if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || -- !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { -+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* -- * CPL=0 and all other checks that are lower priority than VM-Exit must -- * be checked manually. -+ * The CPL is checked for "not in VMX operation" and for "in VMX root", -+ * and has higher priority than the VM-Fail due to being post-VMXON, -+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, -+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits -+ * from L2 to L1, i.e. there's no need to check for the vCPU being in -+ * VMX non-root. -+ * -+ * Forwarding the VM-Exit unconditionally, i.e. without performing the -+ * #UD checks (see above), is functionally ok because KVM doesn't allow -+ * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's -+ * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are -+ * missed by hardware due to shadowing CR0 and/or CR4. - */ - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); -@@ -5127,6 +5138,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) - if (vmx->nested.vmxon) - return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - -+ /* -+ * Invalid CR0/CR4 generates #GP. These checks are performed if and -+ * only if the vCPU isn't already in VMX operation, i.e. effectively -+ * have lower priority than the VM-Fail above. -+ */ -+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || -+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { -+ kvm_inject_gp(vcpu, 0); -+ return 1; -+ } -+ - if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) - != VMXON_NEEDED_FEATURES) { - kvm_inject_gp(vcpu, 0); -@@ -6808,7 +6830,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES | -- SECONDARY_EXEC_TSC_SCALING; -+ SECONDARY_EXEC_TSC_SCALING | -+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - - /* - * We can emulate "VMCS shadowing," even if the hardware -diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c -index 8f95c7c014335..b12da2a6dec95 100644 ---- a/arch/x86/kvm/vmx/sgx.c -+++ b/arch/x86/kvm/vmx/sgx.c -@@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, - /* Enforce CPUID restriction on max enclave size. */ - max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : - sgx_12_0->edx; -- if (size >= BIT_ULL(max_size_log2)) -+ if (size >= BIT_ULL(max_size_log2)) { - kvm_inject_gp(vcpu, 0); -+ return 1; -+ } - - /* - * sgx_virt_ecreate() returns: -diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c -index b0bc8897c924f..2a31b1ab0c9f2 100644 ---- a/arch/xtensa/kernel/xtensa_ksyms.c -+++ b/arch/xtensa/kernel/xtensa_ksyms.c -@@ -62,6 +62,7 @@ extern int __modsi3(int, int); - extern int __mulsi3(int, int); - extern unsigned int __udivsi3(unsigned int, unsigned int); - extern unsigned int __umodsi3(unsigned int, unsigned int); -+extern unsigned long long __umulsidi3(unsigned int, unsigned int); - - EXPORT_SYMBOL(__ashldi3); - EXPORT_SYMBOL(__ashrdi3); -@@ -71,6 +72,7 @@ EXPORT_SYMBOL(__modsi3); - EXPORT_SYMBOL(__mulsi3); - EXPORT_SYMBOL(__udivsi3); - EXPORT_SYMBOL(__umodsi3); -+EXPORT_SYMBOL(__umulsidi3); - - unsigned int __sync_fetch_and_and_4(volatile void *p, unsigned int v) - { -diff --git a/arch/xtensa/lib/Makefile b/arch/xtensa/lib/Makefile -index d4e9c397e3fde..7ecef0519a27c 100644 ---- a/arch/xtensa/lib/Makefile -+++ b/arch/xtensa/lib/Makefile -@@ -5,7 +5,7 @@ - - lib-y += memcopy.o memset.o checksum.o \ - ashldi3.o ashrdi3.o lshrdi3.o \ -- divsi3.o udivsi3.o modsi3.o umodsi3.o mulsi3.o \ -+ divsi3.o udivsi3.o modsi3.o umodsi3.o mulsi3.o umulsidi3.o \ - usercopy.o strncpy_user.o strnlen_user.o - lib-$(CONFIG_PCI) += pci-auto.o - lib-$(CONFIG_KCSAN) += kcsan-stubs.o -diff --git a/arch/xtensa/lib/umulsidi3.S b/arch/xtensa/lib/umulsidi3.S -new file mode 100644 -index 0000000000000..1360816479427 ---- /dev/null -+++ b/arch/xtensa/lib/umulsidi3.S -@@ -0,0 +1,230 @@ -+/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */ -+#include -+#include -+#include -+ -+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 -+#define XCHAL_NO_MUL 1 -+#endif -+ -+ENTRY(__umulsidi3) -+ -+#ifdef __XTENSA_CALL0_ABI__ -+ abi_entry(32) -+ s32i a12, sp, 16 -+ s32i a13, sp, 20 -+ s32i a14, sp, 24 -+ s32i a15, sp, 28 -+#elif XCHAL_NO_MUL -+ /* This is not really a leaf function; allocate enough stack space -+ to allow CALL12s to a helper function. */ -+ abi_entry(32) -+#else -+ abi_entry_default -+#endif -+ -+#ifdef __XTENSA_EB__ -+#define wh a2 -+#define wl a3 -+#else -+#define wh a3 -+#define wl a2 -+#endif /* __XTENSA_EB__ */ -+ -+ /* This code is taken from the mulsf3 routine in ieee754-sf.S. -+ See more comments there. */ -+ -+#if XCHAL_HAVE_MUL32_HIGH -+ mull a6, a2, a3 -+ muluh wh, a2, a3 -+ mov wl, a6 -+ -+#else /* ! MUL32_HIGH */ -+ -+#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL -+ /* a0 and a8 will be clobbered by calling the multiply function -+ but a8 is not used here and need not be saved. */ -+ s32i a0, sp, 0 -+#endif -+ -+#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 -+ -+#define a2h a4 -+#define a3h a5 -+ -+ /* Get the high halves of the inputs into registers. */ -+ srli a2h, a2, 16 -+ srli a3h, a3, 16 -+ -+#define a2l a2 -+#define a3l a3 -+ -+#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 -+ /* Clear the high halves of the inputs. This does not matter -+ for MUL16 because the high bits are ignored. */ -+ extui a2, a2, 0, 16 -+ extui a3, a3, 0, 16 -+#endif -+#endif /* MUL16 || MUL32 */ -+ -+ -+#if XCHAL_HAVE_MUL16 -+ -+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ -+ mul16u dst, xreg ## xhalf, yreg ## yhalf -+ -+#elif XCHAL_HAVE_MUL32 -+ -+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ -+ mull dst, xreg ## xhalf, yreg ## yhalf -+ -+#elif XCHAL_HAVE_MAC16 -+ -+/* The preprocessor insists on inserting a space when concatenating after -+ a period in the definition of do_mul below. These macros are a workaround -+ using underscores instead of periods when doing the concatenation. */ -+#define umul_aa_ll umul.aa.ll -+#define umul_aa_lh umul.aa.lh -+#define umul_aa_hl umul.aa.hl -+#define umul_aa_hh umul.aa.hh -+ -+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ -+ umul_aa_ ## xhalf ## yhalf xreg, yreg; \ -+ rsr dst, ACCLO -+ -+#else /* no multiply hardware */ -+ -+#define set_arg_l(dst, src) \ -+ extui dst, src, 0, 16 -+#define set_arg_h(dst, src) \ -+ srli dst, src, 16 -+ -+#ifdef __XTENSA_CALL0_ABI__ -+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ -+ set_arg_ ## xhalf (a13, xreg); \ -+ set_arg_ ## yhalf (a14, yreg); \ -+ call0 .Lmul_mulsi3; \ -+ mov dst, a12 -+#else -+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ -+ set_arg_ ## xhalf (a14, xreg); \ -+ set_arg_ ## yhalf (a15, yreg); \ -+ call12 .Lmul_mulsi3; \ -+ mov dst, a14 -+#endif /* __XTENSA_CALL0_ABI__ */ -+ -+#endif /* no multiply hardware */ -+ -+ /* Add pp1 and pp2 into a6 with carry-out in a9. */ -+ do_mul(a6, a2, l, a3, h) /* pp 1 */ -+ do_mul(a11, a2, h, a3, l) /* pp 2 */ -+ movi a9, 0 -+ add a6, a6, a11 -+ bgeu a6, a11, 1f -+ addi a9, a9, 1 -+1: -+ /* Shift the high half of a9/a6 into position in a9. Note that -+ this value can be safely incremented without any carry-outs. */ -+ ssai 16 -+ src a9, a9, a6 -+ -+ /* Compute the low word into a6. */ -+ do_mul(a11, a2, l, a3, l) /* pp 0 */ -+ sll a6, a6 -+ add a6, a6, a11 -+ bgeu a6, a11, 1f -+ addi a9, a9, 1 -+1: -+ /* Compute the high word into wh. */ -+ do_mul(wh, a2, h, a3, h) /* pp 3 */ -+ add wh, wh, a9 -+ mov wl, a6 -+ -+#endif /* !MUL32_HIGH */ -+ -+#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL -+ /* Restore the original return address. */ -+ l32i a0, sp, 0 -+#endif -+#ifdef __XTENSA_CALL0_ABI__ -+ l32i a12, sp, 16 -+ l32i a13, sp, 20 -+ l32i a14, sp, 24 -+ l32i a15, sp, 28 -+ abi_ret(32) -+#else -+ abi_ret_default -+#endif -+ -+#if XCHAL_NO_MUL -+ -+ .macro do_addx2 dst, as, at, tmp -+#if XCHAL_HAVE_ADDX -+ addx2 \dst, \as, \at -+#else -+ slli \tmp, \as, 1 -+ add \dst, \tmp, \at -+#endif -+ .endm -+ -+ .macro do_addx4 dst, as, at, tmp -+#if XCHAL_HAVE_ADDX -+ addx4 \dst, \as, \at -+#else -+ slli \tmp, \as, 2 -+ add \dst, \tmp, \at -+#endif -+ .endm -+ -+ .macro do_addx8 dst, as, at, tmp -+#if XCHAL_HAVE_ADDX -+ addx8 \dst, \as, \at -+#else -+ slli \tmp, \as, 3 -+ add \dst, \tmp, \at -+#endif -+ .endm -+ -+ /* For Xtensa processors with no multiply hardware, this simplified -+ version of _mulsi3 is used for multiplying 16-bit chunks of -+ the floating-point mantissas. When using CALL0, this function -+ uses a custom ABI: the inputs are passed in a13 and a14, the -+ result is returned in a12, and a8 and a15 are clobbered. */ -+ .align 4 -+.Lmul_mulsi3: -+ abi_entry_default -+ -+ .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 -+ movi \dst, 0 -+1: add \tmp1, \src2, \dst -+ extui \tmp2, \src1, 0, 1 -+ movnez \dst, \tmp1, \tmp2 -+ -+ do_addx2 \tmp1, \src2, \dst, \tmp1 -+ extui \tmp2, \src1, 1, 1 -+ movnez \dst, \tmp1, \tmp2 -+ -+ do_addx4 \tmp1, \src2, \dst, \tmp1 -+ extui \tmp2, \src1, 2, 1 -+ movnez \dst, \tmp1, \tmp2 -+ -+ do_addx8 \tmp1, \src2, \dst, \tmp1 -+ extui \tmp2, \src1, 3, 1 -+ movnez \dst, \tmp1, \tmp2 -+ -+ srli \src1, \src1, 4 -+ slli \src2, \src2, 4 -+ bnez \src1, 1b -+ .endm -+ -+#ifdef __XTENSA_CALL0_ABI__ -+ mul_mulsi3_body a12, a13, a14, a15, a8 -+#else -+ /* The result will be written into a2, so save that argument in a4. */ -+ mov a4, a2 -+ mul_mulsi3_body a2, a4, a3, a5, a6 -+#endif -+ abi_ret_default -+#endif /* XCHAL_NO_MUL */ -+ -+ENDPROC(__umulsidi3) -diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index 5639921dfa922..6672f1bce3795 100644 ---- a/block/mq-deadline.c -+++ b/block/mq-deadline.c -@@ -130,6 +130,20 @@ static u8 dd_rq_ioclass(struct request *rq) - return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); - } - -+/* -+ * get the request before `rq' in sector-sorted order -+ */ -+static inline struct request * -+deadline_earlier_request(struct request *rq) -+{ -+ struct rb_node *node = rb_prev(&rq->rb_node); -+ -+ if (node) -+ return rb_entry_rq(node); -+ -+ return NULL; -+} -+ - /* - * get the request after `rq' in sector-sorted order - */ -@@ -277,6 +291,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio, - return 0; - } - -+/* -+ * Check if rq has a sequential request preceding it. -+ */ -+static bool deadline_is_seq_writes(struct deadline_data *dd, struct request *rq) -+{ -+ struct request *prev = deadline_earlier_request(rq); -+ -+ if (!prev) -+ return false; -+ -+ return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); -+} -+ -+/* -+ * Skip all write requests that are sequential from @rq, even if we cross -+ * a zone boundary. -+ */ -+static struct request *deadline_skip_seq_writes(struct deadline_data *dd, -+ struct request *rq) -+{ -+ sector_t pos = blk_rq_pos(rq); -+ sector_t skipped_sectors = 0; -+ -+ while (rq) { -+ if (blk_rq_pos(rq) != pos + skipped_sectors) -+ break; -+ skipped_sectors += blk_rq_sectors(rq); -+ rq = deadline_latter_request(rq); -+ } -+ -+ return rq; -+} -+ - /* - * For the specified data direction, return the next request to - * dispatch using arrival ordered lists. -@@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - - /* - * Look for a write request that can be dispatched, that is one with -- * an unlocked target zone. -+ * an unlocked target zone. For some HDDs, breaking a sequential -+ * write stream can lead to lower throughput, so make sure to preserve -+ * sequential write streams, even if that stream crosses into the next -+ * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { -- if (blk_req_can_dispatch_to_zone(rq)) -+ if (blk_req_can_dispatch_to_zone(rq) && -+ (blk_queue_nonrot(rq->q) || -+ !deadline_is_seq_writes(dd, rq))) - goto out; - } - rq = NULL; -@@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - - /* - * Look for a write request that can be dispatched, that is one with -- * an unlocked target zone. -+ * an unlocked target zone. For some HDDs, breaking a sequential -+ * write stream can lead to lower throughput, so make sure to preserve -+ * sequential write streams, even if that stream crosses into the next -+ * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; -- rq = deadline_latter_request(rq); -+ if (blk_queue_nonrot(rq->q)) -+ rq = deadline_latter_request(rq); -+ else -+ rq = deadline_skip_seq_writes(dd, rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - -@@ -789,6 +847,18 @@ static void dd_prepare_request(struct request *rq) - rq->elv.priv[0] = NULL; - } - -+static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) -+{ -+ struct deadline_data *dd = hctx->queue->elevator->elevator_data; -+ enum dd_prio p; -+ -+ for (p = 0; p <= DD_PRIO_MAX; p++) -+ if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) -+ return true; -+ -+ return false; -+} -+ - /* - * Callback from inside blk_mq_free_request(). - * -@@ -828,9 +898,10 @@ static void dd_finish_request(struct request *rq) - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); -- if (!list_empty(&per_prio->fifo_list[DD_WRITE])) -- blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - spin_unlock_irqrestore(&dd->zone_lock, flags); -+ -+ if (dd_has_write_work(rq->mq_hctx)) -+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - } - } - -diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c -index 13f10fbcd7f03..76b7e7f8894e7 100644 ---- a/drivers/acpi/video_detect.c -+++ b/drivers/acpi/video_detect.c -@@ -734,6 +734,16 @@ static bool google_cros_ec_present(void) - return acpi_dev_found("GOOG0004") || acpi_dev_found("GOOG000C"); - } - -+/* -+ * Windows 8 and newer no longer use the ACPI video interface, so it often -+ * does not work. So on win8+ systems prefer native brightness control. -+ * Chromebooks should always prefer native backlight control. -+ */ -+static bool prefer_native_over_acpi_video(void) -+{ -+ return acpi_osi_is_win8() || google_cros_ec_present(); -+} -+ - /* - * Determine which type of backlight interface to use on this system, - * First check cmdline, then dmi quirks, then do autodetect. -@@ -779,28 +789,16 @@ static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native) - if (apple_gmux_backlight_present()) - return acpi_backlight_apple_gmux; - -- /* Chromebooks should always prefer native backlight control. */ -- if (google_cros_ec_present() && native_available) -- return acpi_backlight_native; -+ /* Use ACPI video if available, except when native should be preferred. */ -+ if ((video_caps & ACPI_VIDEO_BACKLIGHT) && -+ !(native_available && prefer_native_over_acpi_video())) -+ return acpi_backlight_video; - -- /* On systems with ACPI video use either native or ACPI video. */ -- if (video_caps & ACPI_VIDEO_BACKLIGHT) { -- /* -- * Windows 8 and newer no longer use the ACPI video interface, -- * so it often does not work. If the ACPI tables are written -- * for win8 and native brightness ctl is available, use that. -- * -- * The native check deliberately is inside the if acpi-video -- * block on older devices without acpi-video support native -- * is usually not the best choice. -- */ -- if (acpi_osi_is_win8() && native_available) -- return acpi_backlight_native; -- else -- return acpi_backlight_video; -- } -+ /* Use native if available */ -+ if (native_available) -+ return acpi_backlight_native; - -- /* No ACPI video (old hw), use vendor specific fw methods. */ -+ /* No ACPI video/native (old hw), use vendor specific fw methods. */ - return acpi_backlight_vendor; - } - -@@ -812,18 +810,6 @@ EXPORT_SYMBOL(acpi_video_get_backlight_type); - - bool acpi_video_backlight_use_native(void) - { -- /* -- * Call __acpi_video_get_backlight_type() to let it know that -- * a native backlight is available. -- */ -- __acpi_video_get_backlight_type(true); -- -- /* -- * For now just always return true. There is a whole bunch of laptop -- * models where (video_caps & ACPI_VIDEO_BACKLIGHT) is false causing -- * __acpi_video_get_backlight_type() to return vendor, while these -- * models only have a native backlight control. -- */ -- return true; -+ return __acpi_video_get_backlight_type(true) == acpi_backlight_native; - } - EXPORT_SYMBOL(acpi_video_backlight_use_native); -diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h -index 7add8e79912b1..ff8e6ae1c6362 100644 ---- a/drivers/ata/ahci.h -+++ b/drivers/ata/ahci.h -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - - /* Enclosure Management Control */ - #define EM_CTRL_MSG_TYPE 0x000f0000 -@@ -53,12 +54,12 @@ enum { - AHCI_PORT_PRIV_FBS_DMA_SZ = AHCI_CMD_SLOT_SZ + - AHCI_CMD_TBL_AR_SZ + - (AHCI_RX_FIS_SZ * 16), -- AHCI_IRQ_ON_SG = (1 << 31), -- AHCI_CMD_ATAPI = (1 << 5), -- AHCI_CMD_WRITE = (1 << 6), -- AHCI_CMD_PREFETCH = (1 << 7), -- AHCI_CMD_RESET = (1 << 8), -- AHCI_CMD_CLR_BUSY = (1 << 10), -+ AHCI_IRQ_ON_SG = BIT(31), -+ AHCI_CMD_ATAPI = BIT(5), -+ AHCI_CMD_WRITE = BIT(6), -+ AHCI_CMD_PREFETCH = BIT(7), -+ AHCI_CMD_RESET = BIT(8), -+ AHCI_CMD_CLR_BUSY = BIT(10), - - RX_FIS_PIO_SETUP = 0x20, /* offset of PIO Setup FIS data */ - RX_FIS_D2H_REG = 0x40, /* offset of D2H Register FIS data */ -@@ -76,37 +77,37 @@ enum { - HOST_CAP2 = 0x24, /* host capabilities, extended */ - - /* HOST_CTL bits */ -- HOST_RESET = (1 << 0), /* reset controller; self-clear */ -- HOST_IRQ_EN = (1 << 1), /* global IRQ enable */ -- HOST_MRSM = (1 << 2), /* MSI Revert to Single Message */ -- HOST_AHCI_EN = (1 << 31), /* AHCI enabled */ -+ HOST_RESET = BIT(0), /* reset controller; self-clear */ -+ HOST_IRQ_EN = BIT(1), /* global IRQ enable */ -+ HOST_MRSM = BIT(2), /* MSI Revert to Single Message */ -+ HOST_AHCI_EN = BIT(31), /* AHCI enabled */ - - /* HOST_CAP bits */ -- HOST_CAP_SXS = (1 << 5), /* Supports External SATA */ -- HOST_CAP_EMS = (1 << 6), /* Enclosure Management support */ -- HOST_CAP_CCC = (1 << 7), /* Command Completion Coalescing */ -- HOST_CAP_PART = (1 << 13), /* Partial state capable */ -- HOST_CAP_SSC = (1 << 14), /* Slumber state capable */ -- HOST_CAP_PIO_MULTI = (1 << 15), /* PIO multiple DRQ support */ -- HOST_CAP_FBS = (1 << 16), /* FIS-based switching support */ -- HOST_CAP_PMP = (1 << 17), /* Port Multiplier support */ -- HOST_CAP_ONLY = (1 << 18), /* Supports AHCI mode only */ -- HOST_CAP_CLO = (1 << 24), /* Command List Override support */ -- HOST_CAP_LED = (1 << 25), /* Supports activity LED */ -- HOST_CAP_ALPM = (1 << 26), /* Aggressive Link PM support */ -- HOST_CAP_SSS = (1 << 27), /* Staggered Spin-up */ -- HOST_CAP_MPS = (1 << 28), /* Mechanical presence switch */ -- HOST_CAP_SNTF = (1 << 29), /* SNotification register */ -- HOST_CAP_NCQ = (1 << 30), /* Native Command Queueing */ -- HOST_CAP_64 = (1 << 31), /* PCI DAC (64-bit DMA) support */ -+ HOST_CAP_SXS = BIT(5), /* Supports External SATA */ -+ HOST_CAP_EMS = BIT(6), /* Enclosure Management support */ -+ HOST_CAP_CCC = BIT(7), /* Command Completion Coalescing */ -+ HOST_CAP_PART = BIT(13), /* Partial state capable */ -+ HOST_CAP_SSC = BIT(14), /* Slumber state capable */ -+ HOST_CAP_PIO_MULTI = BIT(15), /* PIO multiple DRQ support */ -+ HOST_CAP_FBS = BIT(16), /* FIS-based switching support */ -+ HOST_CAP_PMP = BIT(17), /* Port Multiplier support */ -+ HOST_CAP_ONLY = BIT(18), /* Supports AHCI mode only */ -+ HOST_CAP_CLO = BIT(24), /* Command List Override support */ -+ HOST_CAP_LED = BIT(25), /* Supports activity LED */ -+ HOST_CAP_ALPM = BIT(26), /* Aggressive Link PM support */ -+ HOST_CAP_SSS = BIT(27), /* Staggered Spin-up */ -+ HOST_CAP_MPS = BIT(28), /* Mechanical presence switch */ -+ HOST_CAP_SNTF = BIT(29), /* SNotification register */ -+ HOST_CAP_NCQ = BIT(30), /* Native Command Queueing */ -+ HOST_CAP_64 = BIT(31), /* PCI DAC (64-bit DMA) support */ - - /* HOST_CAP2 bits */ -- HOST_CAP2_BOH = (1 << 0), /* BIOS/OS handoff supported */ -- HOST_CAP2_NVMHCI = (1 << 1), /* NVMHCI supported */ -- HOST_CAP2_APST = (1 << 2), /* Automatic partial to slumber */ -- HOST_CAP2_SDS = (1 << 3), /* Support device sleep */ -- HOST_CAP2_SADM = (1 << 4), /* Support aggressive DevSlp */ -- HOST_CAP2_DESO = (1 << 5), /* DevSlp from slumber only */ -+ HOST_CAP2_BOH = BIT(0), /* BIOS/OS handoff supported */ -+ HOST_CAP2_NVMHCI = BIT(1), /* NVMHCI supported */ -+ HOST_CAP2_APST = BIT(2), /* Automatic partial to slumber */ -+ HOST_CAP2_SDS = BIT(3), /* Support device sleep */ -+ HOST_CAP2_SADM = BIT(4), /* Support aggressive DevSlp */ -+ HOST_CAP2_DESO = BIT(5), /* DevSlp from slumber only */ - - /* registers for each SATA port */ - PORT_LST_ADDR = 0x00, /* command list DMA addr */ -@@ -128,24 +129,24 @@ enum { - PORT_DEVSLP = 0x44, /* device sleep */ - - /* PORT_IRQ_{STAT,MASK} bits */ -- PORT_IRQ_COLD_PRES = (1 << 31), /* cold presence detect */ -- PORT_IRQ_TF_ERR = (1 << 30), /* task file error */ -- PORT_IRQ_HBUS_ERR = (1 << 29), /* host bus fatal error */ -- PORT_IRQ_HBUS_DATA_ERR = (1 << 28), /* host bus data error */ -- PORT_IRQ_IF_ERR = (1 << 27), /* interface fatal error */ -- PORT_IRQ_IF_NONFATAL = (1 << 26), /* interface non-fatal error */ -- PORT_IRQ_OVERFLOW = (1 << 24), /* xfer exhausted available S/G */ -- PORT_IRQ_BAD_PMP = (1 << 23), /* incorrect port multiplier */ -- -- PORT_IRQ_PHYRDY = (1 << 22), /* PhyRdy changed */ -- PORT_IRQ_DMPS = (1 << 7), /* mechanical presence status */ -- PORT_IRQ_CONNECT = (1 << 6), /* port connect change status */ -- PORT_IRQ_SG_DONE = (1 << 5), /* descriptor processed */ -- PORT_IRQ_UNK_FIS = (1 << 4), /* unknown FIS rx'd */ -- PORT_IRQ_SDB_FIS = (1 << 3), /* Set Device Bits FIS rx'd */ -- PORT_IRQ_DMAS_FIS = (1 << 2), /* DMA Setup FIS rx'd */ -- PORT_IRQ_PIOS_FIS = (1 << 1), /* PIO Setup FIS rx'd */ -- PORT_IRQ_D2H_REG_FIS = (1 << 0), /* D2H Register FIS rx'd */ -+ PORT_IRQ_COLD_PRES = BIT(31), /* cold presence detect */ -+ PORT_IRQ_TF_ERR = BIT(30), /* task file error */ -+ PORT_IRQ_HBUS_ERR = BIT(29), /* host bus fatal error */ -+ PORT_IRQ_HBUS_DATA_ERR = BIT(28), /* host bus data error */ -+ PORT_IRQ_IF_ERR = BIT(27), /* interface fatal error */ -+ PORT_IRQ_IF_NONFATAL = BIT(26), /* interface non-fatal error */ -+ PORT_IRQ_OVERFLOW = BIT(24), /* xfer exhausted available S/G */ -+ PORT_IRQ_BAD_PMP = BIT(23), /* incorrect port multiplier */ -+ -+ PORT_IRQ_PHYRDY = BIT(22), /* PhyRdy changed */ -+ PORT_IRQ_DMPS = BIT(7), /* mechanical presence status */ -+ PORT_IRQ_CONNECT = BIT(6), /* port connect change status */ -+ PORT_IRQ_SG_DONE = BIT(5), /* descriptor processed */ -+ PORT_IRQ_UNK_FIS = BIT(4), /* unknown FIS rx'd */ -+ PORT_IRQ_SDB_FIS = BIT(3), /* Set Device Bits FIS rx'd */ -+ PORT_IRQ_DMAS_FIS = BIT(2), /* DMA Setup FIS rx'd */ -+ PORT_IRQ_PIOS_FIS = BIT(1), /* PIO Setup FIS rx'd */ -+ PORT_IRQ_D2H_REG_FIS = BIT(0), /* D2H Register FIS rx'd */ - - PORT_IRQ_FREEZE = PORT_IRQ_HBUS_ERR | - PORT_IRQ_IF_ERR | -@@ -161,27 +162,27 @@ enum { - PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS, - - /* PORT_CMD bits */ -- PORT_CMD_ASP = (1 << 27), /* Aggressive Slumber/Partial */ -- PORT_CMD_ALPE = (1 << 26), /* Aggressive Link PM enable */ -- PORT_CMD_ATAPI = (1 << 24), /* Device is ATAPI */ -- PORT_CMD_FBSCP = (1 << 22), /* FBS Capable Port */ -- PORT_CMD_ESP = (1 << 21), /* External Sata Port */ -- PORT_CMD_CPD = (1 << 20), /* Cold Presence Detection */ -- PORT_CMD_MPSP = (1 << 19), /* Mechanical Presence Switch */ -- PORT_CMD_HPCP = (1 << 18), /* HotPlug Capable Port */ -- PORT_CMD_PMP = (1 << 17), /* PMP attached */ -- PORT_CMD_LIST_ON = (1 << 15), /* cmd list DMA engine running */ -- PORT_CMD_FIS_ON = (1 << 14), /* FIS DMA engine running */ -- PORT_CMD_FIS_RX = (1 << 4), /* Enable FIS receive DMA engine */ -- PORT_CMD_CLO = (1 << 3), /* Command list override */ -- PORT_CMD_POWER_ON = (1 << 2), /* Power up device */ -- PORT_CMD_SPIN_UP = (1 << 1), /* Spin up device */ -- PORT_CMD_START = (1 << 0), /* Enable port DMA engine */ -- -- PORT_CMD_ICC_MASK = (0xf << 28), /* i/f ICC state mask */ -- PORT_CMD_ICC_ACTIVE = (0x1 << 28), /* Put i/f in active state */ -- PORT_CMD_ICC_PARTIAL = (0x2 << 28), /* Put i/f in partial state */ -- PORT_CMD_ICC_SLUMBER = (0x6 << 28), /* Put i/f in slumber state */ -+ PORT_CMD_ASP = BIT(27), /* Aggressive Slumber/Partial */ -+ PORT_CMD_ALPE = BIT(26), /* Aggressive Link PM enable */ -+ PORT_CMD_ATAPI = BIT(24), /* Device is ATAPI */ -+ PORT_CMD_FBSCP = BIT(22), /* FBS Capable Port */ -+ PORT_CMD_ESP = BIT(21), /* External Sata Port */ -+ PORT_CMD_CPD = BIT(20), /* Cold Presence Detection */ -+ PORT_CMD_MPSP = BIT(19), /* Mechanical Presence Switch */ -+ PORT_CMD_HPCP = BIT(18), /* HotPlug Capable Port */ -+ PORT_CMD_PMP = BIT(17), /* PMP attached */ -+ PORT_CMD_LIST_ON = BIT(15), /* cmd list DMA engine running */ -+ PORT_CMD_FIS_ON = BIT(14), /* FIS DMA engine running */ -+ PORT_CMD_FIS_RX = BIT(4), /* Enable FIS receive DMA engine */ -+ PORT_CMD_CLO = BIT(3), /* Command list override */ -+ PORT_CMD_POWER_ON = BIT(2), /* Power up device */ -+ PORT_CMD_SPIN_UP = BIT(1), /* Spin up device */ -+ PORT_CMD_START = BIT(0), /* Enable port DMA engine */ -+ -+ PORT_CMD_ICC_MASK = (0xfu << 28), /* i/f ICC state mask */ -+ PORT_CMD_ICC_ACTIVE = (0x1u << 28), /* Put i/f in active state */ -+ PORT_CMD_ICC_PARTIAL = (0x2u << 28), /* Put i/f in partial state */ -+ PORT_CMD_ICC_SLUMBER = (0x6u << 28), /* Put i/f in slumber state */ - - /* PORT_CMD capabilities mask */ - PORT_CMD_CAP = PORT_CMD_HPCP | PORT_CMD_MPSP | -@@ -192,9 +193,9 @@ enum { - PORT_FBS_ADO_OFFSET = 12, /* FBS active dev optimization offset */ - PORT_FBS_DEV_OFFSET = 8, /* FBS device to issue offset */ - PORT_FBS_DEV_MASK = (0xf << PORT_FBS_DEV_OFFSET), /* FBS.DEV */ -- PORT_FBS_SDE = (1 << 2), /* FBS single device error */ -- PORT_FBS_DEC = (1 << 1), /* FBS device error clear */ -- PORT_FBS_EN = (1 << 0), /* Enable FBS */ -+ PORT_FBS_SDE = BIT(2), /* FBS single device error */ -+ PORT_FBS_DEC = BIT(1), /* FBS device error clear */ -+ PORT_FBS_EN = BIT(0), /* Enable FBS */ - - /* PORT_DEVSLP bits */ - PORT_DEVSLP_DM_OFFSET = 25, /* DITO multiplier offset */ -@@ -202,50 +203,50 @@ enum { - PORT_DEVSLP_DITO_OFFSET = 15, /* DITO offset */ - PORT_DEVSLP_MDAT_OFFSET = 10, /* Minimum assertion time */ - PORT_DEVSLP_DETO_OFFSET = 2, /* DevSlp exit timeout */ -- PORT_DEVSLP_DSP = (1 << 1), /* DevSlp present */ -- PORT_DEVSLP_ADSE = (1 << 0), /* Aggressive DevSlp enable */ -+ PORT_DEVSLP_DSP = BIT(1), /* DevSlp present */ -+ PORT_DEVSLP_ADSE = BIT(0), /* Aggressive DevSlp enable */ - - /* hpriv->flags bits */ - - #define AHCI_HFLAGS(flags) .private_data = (void *)(flags) - -- AHCI_HFLAG_NO_NCQ = (1 << 0), -- AHCI_HFLAG_IGN_IRQ_IF_ERR = (1 << 1), /* ignore IRQ_IF_ERR */ -- AHCI_HFLAG_IGN_SERR_INTERNAL = (1 << 2), /* ignore SERR_INTERNAL */ -- AHCI_HFLAG_32BIT_ONLY = (1 << 3), /* force 32bit */ -- AHCI_HFLAG_MV_PATA = (1 << 4), /* PATA port */ -- AHCI_HFLAG_NO_MSI = (1 << 5), /* no PCI MSI */ -- AHCI_HFLAG_NO_PMP = (1 << 6), /* no PMP */ -- AHCI_HFLAG_SECT255 = (1 << 8), /* max 255 sectors */ -- AHCI_HFLAG_YES_NCQ = (1 << 9), /* force NCQ cap on */ -- AHCI_HFLAG_NO_SUSPEND = (1 << 10), /* don't suspend */ -- AHCI_HFLAG_SRST_TOUT_IS_OFFLINE = (1 << 11), /* treat SRST timeout as -- link offline */ -- AHCI_HFLAG_NO_SNTF = (1 << 12), /* no sntf */ -- AHCI_HFLAG_NO_FPDMA_AA = (1 << 13), /* no FPDMA AA */ -- AHCI_HFLAG_YES_FBS = (1 << 14), /* force FBS cap on */ -- AHCI_HFLAG_DELAY_ENGINE = (1 << 15), /* do not start engine on -- port start (wait until -- error-handling stage) */ -- AHCI_HFLAG_NO_DEVSLP = (1 << 17), /* no device sleep */ -- AHCI_HFLAG_NO_FBS = (1 << 18), /* no FBS */ -+ AHCI_HFLAG_NO_NCQ = BIT(0), -+ AHCI_HFLAG_IGN_IRQ_IF_ERR = BIT(1), /* ignore IRQ_IF_ERR */ -+ AHCI_HFLAG_IGN_SERR_INTERNAL = BIT(2), /* ignore SERR_INTERNAL */ -+ AHCI_HFLAG_32BIT_ONLY = BIT(3), /* force 32bit */ -+ AHCI_HFLAG_MV_PATA = BIT(4), /* PATA port */ -+ AHCI_HFLAG_NO_MSI = BIT(5), /* no PCI MSI */ -+ AHCI_HFLAG_NO_PMP = BIT(6), /* no PMP */ -+ AHCI_HFLAG_SECT255 = BIT(8), /* max 255 sectors */ -+ AHCI_HFLAG_YES_NCQ = BIT(9), /* force NCQ cap on */ -+ AHCI_HFLAG_NO_SUSPEND = BIT(10), /* don't suspend */ -+ AHCI_HFLAG_SRST_TOUT_IS_OFFLINE = BIT(11), /* treat SRST timeout as -+ link offline */ -+ AHCI_HFLAG_NO_SNTF = BIT(12), /* no sntf */ -+ AHCI_HFLAG_NO_FPDMA_AA = BIT(13), /* no FPDMA AA */ -+ AHCI_HFLAG_YES_FBS = BIT(14), /* force FBS cap on */ -+ AHCI_HFLAG_DELAY_ENGINE = BIT(15), /* do not start engine on -+ port start (wait until -+ error-handling stage) */ -+ AHCI_HFLAG_NO_DEVSLP = BIT(17), /* no device sleep */ -+ AHCI_HFLAG_NO_FBS = BIT(18), /* no FBS */ - - #ifdef CONFIG_PCI_MSI -- AHCI_HFLAG_MULTI_MSI = (1 << 20), /* per-port MSI(-X) */ -+ AHCI_HFLAG_MULTI_MSI = BIT(20), /* per-port MSI(-X) */ - #else - /* compile out MSI infrastructure */ - AHCI_HFLAG_MULTI_MSI = 0, - #endif -- AHCI_HFLAG_WAKE_BEFORE_STOP = (1 << 22), /* wake before DMA stop */ -- AHCI_HFLAG_YES_ALPM = (1 << 23), /* force ALPM cap on */ -- AHCI_HFLAG_NO_WRITE_TO_RO = (1 << 24), /* don't write to read -- only registers */ -- AHCI_HFLAG_USE_LPM_POLICY = (1 << 25), /* chipset that should use -- SATA_MOBILE_LPM_POLICY -- as default lpm_policy */ -- AHCI_HFLAG_SUSPEND_PHYS = (1 << 26), /* handle PHYs during -- suspend/resume */ -- AHCI_HFLAG_NO_SXS = (1 << 28), /* SXS not supported */ -+ AHCI_HFLAG_WAKE_BEFORE_STOP = BIT(22), /* wake before DMA stop */ -+ AHCI_HFLAG_YES_ALPM = BIT(23), /* force ALPM cap on */ -+ AHCI_HFLAG_NO_WRITE_TO_RO = BIT(24), /* don't write to read -+ only registers */ -+ AHCI_HFLAG_USE_LPM_POLICY = BIT(25), /* chipset that should use -+ SATA_MOBILE_LPM_POLICY -+ as default lpm_policy */ -+ AHCI_HFLAG_SUSPEND_PHYS = BIT(26), /* handle PHYs during -+ suspend/resume */ -+ AHCI_HFLAG_NO_SXS = BIT(28), /* SXS not supported */ - - /* ap->flags bits */ - -@@ -261,22 +262,22 @@ enum { - EM_MAX_RETRY = 5, - - /* em_ctl bits */ -- EM_CTL_RST = (1 << 9), /* Reset */ -- EM_CTL_TM = (1 << 8), /* Transmit Message */ -- EM_CTL_MR = (1 << 0), /* Message Received */ -- EM_CTL_ALHD = (1 << 26), /* Activity LED */ -- EM_CTL_XMT = (1 << 25), /* Transmit Only */ -- EM_CTL_SMB = (1 << 24), /* Single Message Buffer */ -- EM_CTL_SGPIO = (1 << 19), /* SGPIO messages supported */ -- EM_CTL_SES = (1 << 18), /* SES-2 messages supported */ -- EM_CTL_SAFTE = (1 << 17), /* SAF-TE messages supported */ -- EM_CTL_LED = (1 << 16), /* LED messages supported */ -+ EM_CTL_RST = BIT(9), /* Reset */ -+ EM_CTL_TM = BIT(8), /* Transmit Message */ -+ EM_CTL_MR = BIT(0), /* Message Received */ -+ EM_CTL_ALHD = BIT(26), /* Activity LED */ -+ EM_CTL_XMT = BIT(25), /* Transmit Only */ -+ EM_CTL_SMB = BIT(24), /* Single Message Buffer */ -+ EM_CTL_SGPIO = BIT(19), /* SGPIO messages supported */ -+ EM_CTL_SES = BIT(18), /* SES-2 messages supported */ -+ EM_CTL_SAFTE = BIT(17), /* SAF-TE messages supported */ -+ EM_CTL_LED = BIT(16), /* LED messages supported */ - - /* em message type */ -- EM_MSG_TYPE_LED = (1 << 0), /* LED */ -- EM_MSG_TYPE_SAFTE = (1 << 1), /* SAF-TE */ -- EM_MSG_TYPE_SES2 = (1 << 2), /* SES-2 */ -- EM_MSG_TYPE_SGPIO = (1 << 3), /* SGPIO */ -+ EM_MSG_TYPE_LED = BIT(0), /* LED */ -+ EM_MSG_TYPE_SAFTE = BIT(1), /* SAF-TE */ -+ EM_MSG_TYPE_SES2 = BIT(2), /* SES-2 */ -+ EM_MSG_TYPE_SGPIO = BIT(3), /* SGPIO */ - }; - - struct ahci_cmd_hdr { -diff --git a/drivers/base/dd.c b/drivers/base/dd.c -index 3dda62503102f..9ae2b5c4fc496 100644 ---- a/drivers/base/dd.c -+++ b/drivers/base/dd.c -@@ -1162,7 +1162,11 @@ static int __driver_attach(struct device *dev, void *data) - return 0; - } else if (ret < 0) { - dev_dbg(dev, "Bus failed to match device: %d\n", ret); -- return ret; -+ /* -+ * Driver could not match with device, but may match with -+ * another device on the bus. -+ */ -+ return 0; - } /* ret > 0 means positive match */ - - if (driver_allows_async_probing(drv)) { -diff --git a/drivers/bus/mhi/host/pm.c b/drivers/bus/mhi/host/pm.c -index 4a42186ff1112..083459028a4b8 100644 ---- a/drivers/bus/mhi/host/pm.c -+++ b/drivers/bus/mhi/host/pm.c -@@ -301,7 +301,8 @@ int mhi_pm_m0_transition(struct mhi_controller *mhi_cntrl) - read_lock_irq(&mhi_chan->lock); - - /* Only ring DB if ring is not empty */ -- if (tre_ring->base && tre_ring->wp != tre_ring->rp) -+ if (tre_ring->base && tre_ring->wp != tre_ring->rp && -+ mhi_chan->ch_state == MHI_CH_STATE_ENABLED) - mhi_ring_chan_db(mhi_cntrl, mhi_chan); - read_unlock_irq(&mhi_chan->lock); - } -diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c -index d5ee52be176d3..5d403fb5bd929 100644 ---- a/drivers/char/ipmi/ipmi_msghandler.c -+++ b/drivers/char/ipmi/ipmi_msghandler.c -@@ -1330,6 +1330,7 @@ static void _ipmi_destroy_user(struct ipmi_user *user) - unsigned long flags; - struct cmd_rcvr *rcvr; - struct cmd_rcvr *rcvrs = NULL; -+ struct module *owner; - - if (!acquire_ipmi_user(user, &i)) { - /* -@@ -1392,8 +1393,9 @@ static void _ipmi_destroy_user(struct ipmi_user *user) - kfree(rcvr); - } - -+ owner = intf->owner; - kref_put(&intf->refcount, intf_free); -- module_put(intf->owner); -+ module_put(owner); - } - - int ipmi_destroy_user(struct ipmi_user *user) -diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c -index 6e357ad76f2eb..abddd7e43a9a6 100644 ---- a/drivers/char/ipmi/ipmi_si_intf.c -+++ b/drivers/char/ipmi/ipmi_si_intf.c -@@ -2153,6 +2153,20 @@ skip_fallback_noirq: - } - module_init(init_ipmi_si); - -+static void wait_msg_processed(struct smi_info *smi_info) -+{ -+ unsigned long jiffies_now; -+ long time_diff; -+ -+ while (smi_info->curr_msg || (smi_info->si_state != SI_NORMAL)) { -+ jiffies_now = jiffies; -+ time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies) -+ * SI_USEC_PER_JIFFY); -+ smi_event_handler(smi_info, time_diff); -+ schedule_timeout_uninterruptible(1); -+ } -+} -+ - static void shutdown_smi(void *send_info) - { - struct smi_info *smi_info = send_info; -@@ -2187,16 +2201,13 @@ static void shutdown_smi(void *send_info) - * in the BMC. Note that timers and CPU interrupts are off, - * so no need for locks. - */ -- while (smi_info->curr_msg || (smi_info->si_state != SI_NORMAL)) { -- poll(smi_info); -- schedule_timeout_uninterruptible(1); -- } -+ wait_msg_processed(smi_info); -+ - if (smi_info->handlers) - disable_si_irq(smi_info); -- while (smi_info->curr_msg || (smi_info->si_state != SI_NORMAL)) { -- poll(smi_info); -- schedule_timeout_uninterruptible(1); -- } -+ -+ wait_msg_processed(smi_info); -+ - if (smi_info->handlers) - smi_info->handlers->cleanup(smi_info->si_sm); - -diff --git a/drivers/char/random.c b/drivers/char/random.c -index 69754155300ea..f5868dddbb618 100644 ---- a/drivers/char/random.c -+++ b/drivers/char/random.c -@@ -160,6 +160,9 @@ EXPORT_SYMBOL(wait_for_random_bytes); - * u8 get_random_u8() - * u16 get_random_u16() - * u32 get_random_u32() -+ * u32 get_random_u32_below(u32 ceil) -+ * u32 get_random_u32_above(u32 floor) -+ * u32 get_random_u32_inclusive(u32 floor, u32 ceil) - * u64 get_random_u64() - * unsigned long get_random_long() - * -@@ -510,6 +513,41 @@ DEFINE_BATCHED_ENTROPY(u16) - DEFINE_BATCHED_ENTROPY(u32) - DEFINE_BATCHED_ENTROPY(u64) - -+u32 __get_random_u32_below(u32 ceil) -+{ -+ /* -+ * This is the slow path for variable ceil. It is still fast, most of -+ * the time, by doing traditional reciprocal multiplication and -+ * opportunistically comparing the lower half to ceil itself, before -+ * falling back to computing a larger bound, and then rejecting samples -+ * whose lower half would indicate a range indivisible by ceil. The use -+ * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable -+ * in 32-bits. -+ */ -+ u32 rand = get_random_u32(); -+ u64 mult; -+ -+ /* -+ * This function is technically undefined for ceil == 0, and in fact -+ * for the non-underscored constant version in the header, we build bug -+ * on that. But for the non-constant case, it's convenient to have that -+ * evaluate to being a straight call to get_random_u32(), so that -+ * get_random_u32_inclusive() can work over its whole range without -+ * undefined behavior. -+ */ -+ if (unlikely(!ceil)) -+ return rand; -+ -+ mult = (u64)ceil * rand; -+ if (unlikely((u32)mult < ceil)) { -+ u32 bound = -ceil % ceil; -+ while (unlikely((u32)mult < bound)) -+ mult = (u64)ceil * get_random_u32(); -+ } -+ return mult >> 32; -+} -+EXPORT_SYMBOL(__get_random_u32_below); -+ - #ifdef CONFIG_SMP - /* - * This function is called when the CPU is coming up, with entry -diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 69b3d61852ac6..7e56a42750ea5 100644 ---- a/drivers/cpufreq/cpufreq.c -+++ b/drivers/cpufreq/cpufreq.c -@@ -1207,6 +1207,7 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) - if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL)) - goto err_free_rcpumask; - -+ init_completion(&policy->kobj_unregister); - ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, - cpufreq_global_kobject, "policy%u", cpu); - if (ret) { -@@ -1245,7 +1246,6 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) - init_rwsem(&policy->rwsem); - spin_lock_init(&policy->transition_lock); - init_waitqueue_head(&policy->transition_wait); -- init_completion(&policy->kobj_unregister); - INIT_WORK(&policy->update, handle_update); - - policy->cpu = cpu; -diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig -index c30b5a39c2ac2..4a618d80e106f 100644 ---- a/drivers/crypto/Kconfig -+++ b/drivers/crypto/Kconfig -@@ -790,8 +790,8 @@ config CRYPTO_DEV_CCREE - select CRYPTO_ECB - select CRYPTO_CTR - select CRYPTO_XTS -- select CRYPTO_SM4 -- select CRYPTO_SM3 -+ select CRYPTO_SM4_GENERIC -+ select CRYPTO_SM3_GENERIC - help - Say 'Y' to enable a driver for the REE interface of the Arm - TrustZone CryptoCell family of processors. Currently the -diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c -index 792d6da7f0c07..084d052fddccb 100644 ---- a/drivers/crypto/ccp/sp-pci.c -+++ b/drivers/crypto/ccp/sp-pci.c -@@ -381,6 +381,15 @@ static const struct psp_vdata pspv3 = { - .inten_reg = 0x10690, - .intsts_reg = 0x10694, - }; -+ -+static const struct psp_vdata pspv4 = { -+ .sev = &sevv2, -+ .tee = &teev1, -+ .feature_reg = 0x109fc, -+ .inten_reg = 0x10690, -+ .intsts_reg = 0x10694, -+}; -+ - #endif - - static const struct sp_dev_vdata dev_vdata[] = { -@@ -426,7 +435,7 @@ static const struct sp_dev_vdata dev_vdata[] = { - { /* 5 */ - .bar = 2, - #ifdef CONFIG_CRYPTO_DEV_SP_PSP -- .psp_vdata = &pspv2, -+ .psp_vdata = &pspv4, - #endif - }, - { /* 6 */ -diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig -index 27e1fa9120639..743ce4fc3158c 100644 ---- a/drivers/crypto/hisilicon/Kconfig -+++ b/drivers/crypto/hisilicon/Kconfig -@@ -26,7 +26,7 @@ config CRYPTO_DEV_HISI_SEC2 - select CRYPTO_SHA1 - select CRYPTO_SHA256 - select CRYPTO_SHA512 -- select CRYPTO_SM4 -+ select CRYPTO_SM4_GENERIC - depends on PCI && PCI_MSI - depends on UACCE || UACCE=n - depends on ARM64 || (COMPILE_TEST && 64BIT) -diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c -index 31e24df18877f..20d0dcd50344b 100644 ---- a/drivers/crypto/n2_core.c -+++ b/drivers/crypto/n2_core.c -@@ -1229,6 +1229,7 @@ struct n2_hash_tmpl { - const u8 *hash_init; - u8 hw_op_hashsz; - u8 digest_size; -+ u8 statesize; - u8 block_size; - u8 auth_type; - u8 hmac_type; -@@ -1260,6 +1261,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { - .hmac_type = AUTH_TYPE_HMAC_MD5, - .hw_op_hashsz = MD5_DIGEST_SIZE, - .digest_size = MD5_DIGEST_SIZE, -+ .statesize = sizeof(struct md5_state), - .block_size = MD5_HMAC_BLOCK_SIZE }, - { .name = "sha1", - .hash_zero = sha1_zero_message_hash, -@@ -1268,6 +1270,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { - .hmac_type = AUTH_TYPE_HMAC_SHA1, - .hw_op_hashsz = SHA1_DIGEST_SIZE, - .digest_size = SHA1_DIGEST_SIZE, -+ .statesize = sizeof(struct sha1_state), - .block_size = SHA1_BLOCK_SIZE }, - { .name = "sha256", - .hash_zero = sha256_zero_message_hash, -@@ -1276,6 +1279,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { - .hmac_type = AUTH_TYPE_HMAC_SHA256, - .hw_op_hashsz = SHA256_DIGEST_SIZE, - .digest_size = SHA256_DIGEST_SIZE, -+ .statesize = sizeof(struct sha256_state), - .block_size = SHA256_BLOCK_SIZE }, - { .name = "sha224", - .hash_zero = sha224_zero_message_hash, -@@ -1284,6 +1288,7 @@ static const struct n2_hash_tmpl hash_tmpls[] = { - .hmac_type = AUTH_TYPE_RESERVED, - .hw_op_hashsz = SHA256_DIGEST_SIZE, - .digest_size = SHA224_DIGEST_SIZE, -+ .statesize = sizeof(struct sha256_state), - .block_size = SHA224_BLOCK_SIZE }, - }; - #define NUM_HASH_TMPLS ARRAY_SIZE(hash_tmpls) -@@ -1424,6 +1429,7 @@ static int __n2_register_one_ahash(const struct n2_hash_tmpl *tmpl) - - halg = &ahash->halg; - halg->digestsize = tmpl->digest_size; -+ halg->statesize = tmpl->statesize; - - base = &halg->base; - snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, "%s", tmpl->name); -diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c -index f9ae5ad284ffb..c4f32c32dfd50 100644 ---- a/drivers/cxl/core/region.c -+++ b/drivers/cxl/core/region.c -@@ -1226,7 +1226,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled_target; - struct cxl_memdev *cxlmd_target; - -- cxled_target = p->targets[pos]; -+ cxled_target = p->targets[i]; - if (!cxled_target) - continue; - -@@ -1923,6 +1923,9 @@ static int cxl_region_probe(struct device *dev) - */ - up_read(&cxl_region_rwsem); - -+ if (rc) -+ return rc; -+ - switch (cxlr->mode) { - case CXL_DECODER_PMEM: - return devm_cxl_add_pmem_region(cxlr); -diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c -index 63347a5ae5999..8c5f6f7fca112 100644 ---- a/drivers/devfreq/devfreq.c -+++ b/drivers/devfreq/devfreq.c -@@ -776,8 +776,7 @@ static void remove_sysfs_files(struct devfreq *devfreq, - * @dev: the device to add devfreq feature. - * @profile: device-specific profile to run devfreq. - * @governor_name: name of the policy to choose frequency. -- * @data: private data for the governor. The devfreq framework does not -- * touch this value. -+ * @data: devfreq driver pass to governors, governor should not change it. - */ - struct devfreq *devfreq_add_device(struct device *dev, - struct devfreq_dev_profile *profile, -@@ -1011,8 +1010,7 @@ static void devm_devfreq_dev_release(struct device *dev, void *res) - * @dev: the device to add devfreq feature. - * @profile: device-specific profile to run devfreq. - * @governor_name: name of the policy to choose frequency. -- * @data: private data for the governor. The devfreq framework does not -- * touch this value. -+ * @data: devfreq driver pass to governors, governor should not change it. - * - * This function manages automatically the memory of devfreq device using device - * resource management and simplify the free operation for memory of devfreq -diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c -index ab9db7adb3ade..d69672ccacc49 100644 ---- a/drivers/devfreq/governor_userspace.c -+++ b/drivers/devfreq/governor_userspace.c -@@ -21,7 +21,7 @@ struct userspace_data { - - static int devfreq_userspace_func(struct devfreq *df, unsigned long *freq) - { -- struct userspace_data *data = df->data; -+ struct userspace_data *data = df->governor_data; - - if (data->valid) - *freq = data->user_frequency; -@@ -40,7 +40,7 @@ static ssize_t set_freq_store(struct device *dev, struct device_attribute *attr, - int err = 0; - - mutex_lock(&devfreq->lock); -- data = devfreq->data; -+ data = devfreq->governor_data; - - sscanf(buf, "%lu", &wanted); - data->user_frequency = wanted; -@@ -60,7 +60,7 @@ static ssize_t set_freq_show(struct device *dev, - int err = 0; - - mutex_lock(&devfreq->lock); -- data = devfreq->data; -+ data = devfreq->governor_data; - - if (data->valid) - err = sprintf(buf, "%lu\n", data->user_frequency); -@@ -91,7 +91,7 @@ static int userspace_init(struct devfreq *devfreq) - goto out; - } - data->valid = false; -- devfreq->data = data; -+ devfreq->governor_data = data; - - err = sysfs_create_group(&devfreq->dev.kobj, &dev_attr_group); - out: -@@ -107,8 +107,8 @@ static void userspace_exit(struct devfreq *devfreq) - if (devfreq->dev.kobj.sd) - sysfs_remove_group(&devfreq->dev.kobj, &dev_attr_group); - -- kfree(devfreq->data); -- devfreq->data = NULL; -+ kfree(devfreq->governor_data); -+ devfreq->governor_data = NULL; - } - - static int devfreq_userspace_handler(struct devfreq *devfreq, -diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c -index 0a638c97702a5..15f63452a9bec 100644 ---- a/drivers/edac/edac_mc_sysfs.c -+++ b/drivers/edac/edac_mc_sysfs.c -@@ -298,6 +298,14 @@ DEVICE_CHANNEL(ch6_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 6); - DEVICE_CHANNEL(ch7_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 7); -+DEVICE_CHANNEL(ch8_dimm_label, S_IRUGO | S_IWUSR, -+ channel_dimm_label_show, channel_dimm_label_store, 8); -+DEVICE_CHANNEL(ch9_dimm_label, S_IRUGO | S_IWUSR, -+ channel_dimm_label_show, channel_dimm_label_store, 9); -+DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, -+ channel_dimm_label_show, channel_dimm_label_store, 10); -+DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, -+ channel_dimm_label_show, channel_dimm_label_store, 11); - - /* Total possible dynamic DIMM Label attribute file table */ - static struct attribute *dynamic_csrow_dimm_attr[] = { -@@ -309,6 +317,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { - &dev_attr_legacy_ch5_dimm_label.attr.attr, - &dev_attr_legacy_ch6_dimm_label.attr.attr, - &dev_attr_legacy_ch7_dimm_label.attr.attr, -+ &dev_attr_legacy_ch8_dimm_label.attr.attr, -+ &dev_attr_legacy_ch9_dimm_label.attr.attr, -+ &dev_attr_legacy_ch10_dimm_label.attr.attr, -+ &dev_attr_legacy_ch11_dimm_label.attr.attr, - NULL - }; - -@@ -329,6 +341,14 @@ DEVICE_CHANNEL(ch6_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 6); - DEVICE_CHANNEL(ch7_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 7); -+DEVICE_CHANNEL(ch8_ce_count, S_IRUGO, -+ channel_ce_count_show, NULL, 8); -+DEVICE_CHANNEL(ch9_ce_count, S_IRUGO, -+ channel_ce_count_show, NULL, 9); -+DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, -+ channel_ce_count_show, NULL, 10); -+DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, -+ channel_ce_count_show, NULL, 11); - - /* Total possible dynamic ce_count attribute file table */ - static struct attribute *dynamic_csrow_ce_count_attr[] = { -@@ -340,6 +360,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { - &dev_attr_legacy_ch5_ce_count.attr.attr, - &dev_attr_legacy_ch6_ce_count.attr.attr, - &dev_attr_legacy_ch7_ce_count.attr.attr, -+ &dev_attr_legacy_ch8_ce_count.attr.attr, -+ &dev_attr_legacy_ch9_ce_count.attr.attr, -+ &dev_attr_legacy_ch10_ce_count.attr.attr, -+ &dev_attr_legacy_ch11_ce_count.attr.attr, - NULL - }; - -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c -index 913f22d41673d..0be85d19a6f3e 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c -@@ -3005,14 +3005,15 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) - continue; - } - -- /* skip suspend of gfx and psp for S0ix -+ /* skip suspend of gfx/mes and psp for S0ix - * gfx is in gfxoff state, so on resume it will exit gfxoff just - * like at runtime. PSP is also part of the always on hardware - * so no need to suspend it. - */ - if (adev->in_s0ix && - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || -- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) -+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || -+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) - continue; - - /* XXX handle errors */ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index bf2d50c8c92ad..d8dfbb9b735dc 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -2040,6 +2040,15 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, - "See modparam exp_hw_support\n"); - return -ENODEV; - } -+ /* differentiate between P10 and P11 asics with the same DID */ -+ if (pdev->device == 0x67FF && -+ (pdev->revision == 0xE3 || -+ pdev->revision == 0xE7 || -+ pdev->revision == 0xF3 || -+ pdev->revision == 0xF7)) { -+ flags &= ~AMD_ASIC_MASK; -+ flags |= CHIP_POLARIS10; -+ } - - /* Due to hardware bugs, S/G Display on raven requires a 1:1 IOMMU mapping, - * however, SME requires an indirect IOMMU mapping because the encryption -@@ -2109,12 +2118,12 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, - - pci_set_drvdata(pdev, ddev); - -- ret = amdgpu_driver_load_kms(adev, ent->driver_data); -+ ret = amdgpu_driver_load_kms(adev, flags); - if (ret) - goto err_pci; - - retry_init: -- ret = drm_dev_register(ddev, ent->driver_data); -+ ret = drm_dev_register(ddev, flags); - if (ret == -EAGAIN && ++retry <= 3) { - DRM_INFO("retry init %d\n", retry); - /* Don't request EX mode too frequently which is attacking */ -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -index 2e8f6cd7a7293..3df13d841e4d5 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -@@ -1509,7 +1509,8 @@ u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo) - uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, - uint32_t domain) - { -- if (domain == (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) { -+ if ((domain == (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) && -+ ((adev->asic_type == CHIP_CARRIZO) || (adev->asic_type == CHIP_STONEY))) { - domain = AMDGPU_GEM_DOMAIN_VRAM; - if (adev->gmc.real_vram_size <= AMDGPU_SG_THRESHOLD) - domain = AMDGPU_GEM_DOMAIN_GTT; -diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c -index f141fadd2d86f..725876b4f02ed 100644 ---- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c -+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c -@@ -1339,7 +1339,8 @@ static int mes_v11_0_late_init(void *handle) - { - struct amdgpu_device *adev = (struct amdgpu_device *)handle; - -- if (!amdgpu_in_reset(adev) && -+ /* it's only intended for use in mes_self_test case, not for s0ix and reset */ -+ if (!amdgpu_in_reset(adev) && !adev->in_s0ix && - (adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))) - amdgpu_mes_self_test(adev); - -diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c -index 998b5d17b271b..0e664d0cc8d51 100644 ---- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c -+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c -@@ -319,7 +319,7 @@ static void mmhub_v2_0_init_cache_regs(struct amdgpu_device *adev) - - tmp = mmMMVM_L2_CNTL5_DEFAULT; - tmp = REG_SET_FIELD(tmp, MMVM_L2_CNTL5, L2_CACHE_SMALLK_FRAGMENT_SIZE, 0); -- WREG32_SOC15(GC, 0, mmMMVM_L2_CNTL5, tmp); -+ WREG32_SOC15(MMHUB, 0, mmMMVM_L2_CNTL5, tmp); - } - - static void mmhub_v2_0_enable_system_domain(struct amdgpu_device *adev) -diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c -index 1b027d069ab40..4638ea7c2eec5 100644 ---- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c -+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c -@@ -243,7 +243,7 @@ static void mmhub_v2_3_init_cache_regs(struct amdgpu_device *adev) - - tmp = mmMMVM_L2_CNTL5_DEFAULT; - tmp = REG_SET_FIELD(tmp, MMVM_L2_CNTL5, L2_CACHE_SMALLK_FRAGMENT_SIZE, 0); -- WREG32_SOC15(GC, 0, mmMMVM_L2_CNTL5, tmp); -+ WREG32_SOC15(MMHUB, 0, mmMMVM_L2_CNTL5, tmp); - } - - static void mmhub_v2_3_enable_system_domain(struct amdgpu_device *adev) -diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c -index a1d26c4d80b8c..16cc82215e2e1 100644 ---- a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c -+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c -@@ -275,7 +275,7 @@ static void mmhub_v3_0_init_cache_regs(struct amdgpu_device *adev) - - tmp = regMMVM_L2_CNTL5_DEFAULT; - tmp = REG_SET_FIELD(tmp, MMVM_L2_CNTL5, L2_CACHE_SMALLK_FRAGMENT_SIZE, 0); -- WREG32_SOC15(GC, 0, regMMVM_L2_CNTL5, tmp); -+ WREG32_SOC15(MMHUB, 0, regMMVM_L2_CNTL5, tmp); - } - - static void mmhub_v3_0_enable_system_domain(struct amdgpu_device *adev) -diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_1.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_1.c -index e8058edc1d108..6bdf2ef0298d6 100644 ---- a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_1.c -+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_1.c -@@ -269,7 +269,7 @@ static void mmhub_v3_0_1_init_cache_regs(struct amdgpu_device *adev) - - tmp = regMMVM_L2_CNTL5_DEFAULT; - tmp = REG_SET_FIELD(tmp, MMVM_L2_CNTL5, L2_CACHE_SMALLK_FRAGMENT_SIZE, 0); -- WREG32_SOC15(GC, 0, regMMVM_L2_CNTL5, tmp); -+ WREG32_SOC15(MMHUB, 0, regMMVM_L2_CNTL5, tmp); - } - - static void mmhub_v3_0_1_enable_system_domain(struct amdgpu_device *adev) -diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_2.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_2.c -index 770be0a8f7ce7..45465acaa943a 100644 ---- a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_2.c -+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0_2.c -@@ -268,7 +268,7 @@ static void mmhub_v3_0_2_init_cache_regs(struct amdgpu_device *adev) - - tmp = regMMVM_L2_CNTL5_DEFAULT; - tmp = REG_SET_FIELD(tmp, MMVM_L2_CNTL5, L2_CACHE_SMALLK_FRAGMENT_SIZE, 0); -- WREG32_SOC15(GC, 0, regMMVM_L2_CNTL5, tmp); -+ WREG32_SOC15(MMHUB, 0, regMMVM_L2_CNTL5, tmp); - } - - static void mmhub_v3_0_2_enable_system_domain(struct amdgpu_device *adev) -diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index 512c32327eb11..c2c26fbea5129 100644 ---- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -1512,6 +1512,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) - case IP_VERSION(3, 0, 1): - case IP_VERSION(3, 1, 2): - case IP_VERSION(3, 1, 3): -+ case IP_VERSION(3, 1, 4): - case IP_VERSION(3, 1, 5): - case IP_VERSION(3, 1, 6): - init_data.flags.gpu_vm_support = true; -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h -index b76f0f7e42998..d6b964cf73bd1 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h -@@ -522,9 +522,9 @@ typedef enum { - TEMP_HOTSPOT_M, - TEMP_MEM, - TEMP_VR_GFX, -+ TEMP_VR_SOC, - TEMP_VR_MEM0, - TEMP_VR_MEM1, -- TEMP_VR_SOC, - TEMP_VR_U, - TEMP_LIQUID0, - TEMP_LIQUID1, -diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h -index 865d6358918d2..a9122b3b15322 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h -+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h -@@ -28,6 +28,7 @@ - #define SMU13_DRIVER_IF_VERSION_INV 0xFFFFFFFF - #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 - #define SMU13_DRIVER_IF_VERSION_ALDE 0x08 -+#define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_0 0x34 - #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_4 0x07 - #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_5 0x04 - #define SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_10 0x32 -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c -index 89f0f6eb19f3d..8e4830a311bde 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c -@@ -289,6 +289,8 @@ int smu_v13_0_check_fw_version(struct smu_context *smu) - smu->smc_driver_if_version = SMU13_DRIVER_IF_VERSION_ALDE; - break; - case IP_VERSION(13, 0, 0): -+ smu->smc_driver_if_version = SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_0; -+ break; - case IP_VERSION(13, 0, 10): - smu->smc_driver_if_version = SMU13_DRIVER_IF_VERSION_SMU_V13_0_0_10; - break; -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c -index f0121d1716301..b8430601304f0 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c -@@ -187,6 +187,8 @@ static struct cmn2asic_mapping smu_v13_0_0_feature_mask_map[SMU_FEATURE_COUNT] = - FEA_MAP(MEM_TEMP_READ), - FEA_MAP(ATHUB_MMHUB_PG), - FEA_MAP(SOC_PCC), -+ [SMU_FEATURE_DPM_VCLK_BIT] = {1, FEATURE_MM_DPM_BIT}, -+ [SMU_FEATURE_DPM_DCLK_BIT] = {1, FEATURE_MM_DPM_BIT}, - }; - - static struct cmn2asic_mapping smu_v13_0_0_table_map[SMU_TABLE_COUNT] = { -@@ -517,6 +519,23 @@ static int smu_v13_0_0_set_default_dpm_table(struct smu_context *smu) - dpm_table); - if (ret) - return ret; -+ -+ /* -+ * Update the reported maximum shader clock to the value -+ * which can be guarded to be achieved on all cards. This -+ * is aligned with Window setting. And considering that value -+ * might be not the peak frequency the card can achieve, it -+ * is normal some real-time clock frequency can overtake this -+ * labelled maximum clock frequency(for example in pp_dpm_sclk -+ * sysfs output). -+ */ -+ if (skutable->DriverReportedClocks.GameClockAc && -+ (dpm_table->dpm_levels[dpm_table->count - 1].value > -+ skutable->DriverReportedClocks.GameClockAc)) { -+ dpm_table->dpm_levels[dpm_table->count - 1].value = -+ skutable->DriverReportedClocks.GameClockAc; -+ dpm_table->max = skutable->DriverReportedClocks.GameClockAc; -+ } - } else { - dpm_table->count = 1; - dpm_table->dpm_levels[0].value = smu->smu_table.boot_values.gfxclk / 100; -@@ -779,6 +798,57 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu, - return ret; - } - -+static int smu_v13_0_0_get_dpm_ultimate_freq(struct smu_context *smu, -+ enum smu_clk_type clk_type, -+ uint32_t *min, -+ uint32_t *max) -+{ -+ struct smu_13_0_dpm_context *dpm_context = -+ smu->smu_dpm.dpm_context; -+ struct smu_13_0_dpm_table *dpm_table; -+ -+ switch (clk_type) { -+ case SMU_MCLK: -+ case SMU_UCLK: -+ /* uclk dpm table */ -+ dpm_table = &dpm_context->dpm_tables.uclk_table; -+ break; -+ case SMU_GFXCLK: -+ case SMU_SCLK: -+ /* gfxclk dpm table */ -+ dpm_table = &dpm_context->dpm_tables.gfx_table; -+ break; -+ case SMU_SOCCLK: -+ /* socclk dpm table */ -+ dpm_table = &dpm_context->dpm_tables.soc_table; -+ break; -+ case SMU_FCLK: -+ /* fclk dpm table */ -+ dpm_table = &dpm_context->dpm_tables.fclk_table; -+ break; -+ case SMU_VCLK: -+ case SMU_VCLK1: -+ /* vclk dpm table */ -+ dpm_table = &dpm_context->dpm_tables.vclk_table; -+ break; -+ case SMU_DCLK: -+ case SMU_DCLK1: -+ /* dclk dpm table */ -+ dpm_table = &dpm_context->dpm_tables.dclk_table; -+ break; -+ default: -+ dev_err(smu->adev->dev, "Unsupported clock type!\n"); -+ return -EINVAL; -+ } -+ -+ if (min) -+ *min = dpm_table->min; -+ if (max) -+ *max = dpm_table->max; -+ -+ return 0; -+} -+ - static int smu_v13_0_0_read_sensor(struct smu_context *smu, - enum amd_pp_sensors sensor, - void *data, -@@ -1281,9 +1351,17 @@ static int smu_v13_0_0_populate_umd_state_clk(struct smu_context *smu) - &dpm_context->dpm_tables.fclk_table; - struct smu_umd_pstate_table *pstate_table = - &smu->pstate_table; -+ struct smu_table_context *table_context = &smu->smu_table; -+ PPTable_t *pptable = table_context->driver_pptable; -+ DriverReportedClocks_t driver_clocks = -+ pptable->SkuTable.DriverReportedClocks; - - pstate_table->gfxclk_pstate.min = gfx_table->min; -- pstate_table->gfxclk_pstate.peak = gfx_table->max; -+ if (driver_clocks.GameClockAc && -+ (driver_clocks.GameClockAc < gfx_table->max)) -+ pstate_table->gfxclk_pstate.peak = driver_clocks.GameClockAc; -+ else -+ pstate_table->gfxclk_pstate.peak = gfx_table->max; - - pstate_table->uclk_pstate.min = mem_table->min; - pstate_table->uclk_pstate.peak = mem_table->max; -@@ -1300,12 +1378,12 @@ static int smu_v13_0_0_populate_umd_state_clk(struct smu_context *smu) - pstate_table->fclk_pstate.min = fclk_table->min; - pstate_table->fclk_pstate.peak = fclk_table->max; - -- /* -- * For now, just use the mininum clock frequency. -- * TODO: update them when the real pstate settings available -- */ -- pstate_table->gfxclk_pstate.standard = gfx_table->min; -- pstate_table->uclk_pstate.standard = mem_table->min; -+ if (driver_clocks.BaseClockAc && -+ driver_clocks.BaseClockAc < gfx_table->max) -+ pstate_table->gfxclk_pstate.standard = driver_clocks.BaseClockAc; -+ else -+ pstate_table->gfxclk_pstate.standard = gfx_table->max; -+ pstate_table->uclk_pstate.standard = mem_table->max; - pstate_table->socclk_pstate.standard = soc_table->min; - pstate_table->vclk_pstate.standard = vclk_table->min; - pstate_table->dclk_pstate.standard = dclk_table->min; -@@ -1339,12 +1417,23 @@ out: - static int smu_v13_0_0_get_fan_speed_pwm(struct smu_context *smu, - uint32_t *speed) - { -+ int ret; -+ - if (!speed) - return -EINVAL; - -- return smu_v13_0_0_get_smu_metrics_data(smu, -- METRICS_CURR_FANPWM, -- speed); -+ ret = smu_v13_0_0_get_smu_metrics_data(smu, -+ METRICS_CURR_FANPWM, -+ speed); -+ if (ret) { -+ dev_err(smu->adev->dev, "Failed to get fan speed(PWM)!"); -+ return ret; -+ } -+ -+ /* Convert the PMFW output which is in percent to pwm(255) based */ -+ *speed = MIN(*speed * 255 / 100, 255); -+ -+ return 0; - } - - static int smu_v13_0_0_get_fan_speed_rpm(struct smu_context *smu, -@@ -1813,7 +1902,7 @@ static const struct pptable_funcs smu_v13_0_0_ppt_funcs = { - .get_enabled_mask = smu_cmn_get_enabled_mask, - .dpm_set_vcn_enable = smu_v13_0_set_vcn_enable, - .dpm_set_jpeg_enable = smu_v13_0_set_jpeg_enable, -- .get_dpm_ultimate_freq = smu_v13_0_get_dpm_ultimate_freq, -+ .get_dpm_ultimate_freq = smu_v13_0_0_get_dpm_ultimate_freq, - .get_vbios_bootup_values = smu_v13_0_get_vbios_bootup_values, - .read_sensor = smu_v13_0_0_read_sensor, - .feature_is_enabled = smu_cmn_feature_is_enabled, -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c -index 39deb06a86ba3..222924363a681 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c -@@ -189,6 +189,8 @@ static struct cmn2asic_mapping smu_v13_0_7_feature_mask_map[SMU_FEATURE_COUNT] = - FEA_MAP(MEM_TEMP_READ), - FEA_MAP(ATHUB_MMHUB_PG), - FEA_MAP(SOC_PCC), -+ [SMU_FEATURE_DPM_VCLK_BIT] = {1, FEATURE_MM_DPM_BIT}, -+ [SMU_FEATURE_DPM_DCLK_BIT] = {1, FEATURE_MM_DPM_BIT}, - }; - - static struct cmn2asic_mapping smu_v13_0_7_table_map[SMU_TABLE_COUNT] = { -@@ -1359,12 +1361,23 @@ static int smu_v13_0_7_populate_umd_state_clk(struct smu_context *smu) - static int smu_v13_0_7_get_fan_speed_pwm(struct smu_context *smu, - uint32_t *speed) - { -+ int ret; -+ - if (!speed) - return -EINVAL; - -- return smu_v13_0_7_get_smu_metrics_data(smu, -- METRICS_CURR_FANPWM, -- speed); -+ ret = smu_v13_0_7_get_smu_metrics_data(smu, -+ METRICS_CURR_FANPWM, -+ speed); -+ if (ret) { -+ dev_err(smu->adev->dev, "Failed to get fan speed(PWM)!"); -+ return ret; -+ } -+ -+ /* Convert the PMFW output which is in percent to pwm(255) based */ -+ *speed = MIN(*speed * 255 / 100, 255); -+ -+ return 0; - } - - static int smu_v13_0_7_get_fan_speed_rpm(struct smu_context *smu, -diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c -index 61c29ce74b035..27de2a97f1d11 100644 ---- a/drivers/gpu/drm/drm_connector.c -+++ b/drivers/gpu/drm/drm_connector.c -@@ -582,6 +582,9 @@ void drm_connector_cleanup(struct drm_connector *connector) - mutex_destroy(&connector->mutex); - - memset(connector, 0, sizeof(*connector)); -+ -+ if (dev->registered) -+ drm_sysfs_hotplug_event(dev); - } - EXPORT_SYMBOL(drm_connector_cleanup); - -diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c -index cc386f8a7116e..5cf13e52f7c94 100644 ---- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c -+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c -@@ -258,7 +258,12 @@ struct etnaviv_vram_mapping *etnaviv_gem_mapping_get( - if (mapping->use == 0) { - mutex_lock(&mmu_context->lock); - if (mapping->context == mmu_context) -- mapping->use += 1; -+ if (va && mapping->iova != va) { -+ etnaviv_iommu_reap_mapping(mapping); -+ mapping = NULL; -+ } else { -+ mapping->use += 1; -+ } - else - mapping = NULL; - mutex_unlock(&mmu_context->lock); -diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c -index dc1aa738c4f18..55479cb8b1ac3 100644 ---- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c -+++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c -@@ -135,6 +135,19 @@ static void etnaviv_iommu_remove_mapping(struct etnaviv_iommu_context *context, - drm_mm_remove_node(&mapping->vram_node); - } - -+void etnaviv_iommu_reap_mapping(struct etnaviv_vram_mapping *mapping) -+{ -+ struct etnaviv_iommu_context *context = mapping->context; -+ -+ lockdep_assert_held(&context->lock); -+ WARN_ON(mapping->use); -+ -+ etnaviv_iommu_remove_mapping(context, mapping); -+ etnaviv_iommu_context_put(mapping->context); -+ mapping->context = NULL; -+ list_del_init(&mapping->mmu_node); -+} -+ - static int etnaviv_iommu_find_iova(struct etnaviv_iommu_context *context, - struct drm_mm_node *node, size_t size) - { -@@ -202,10 +215,7 @@ static int etnaviv_iommu_find_iova(struct etnaviv_iommu_context *context, - * this mapping. - */ - list_for_each_entry_safe(m, n, &list, scan_node) { -- etnaviv_iommu_remove_mapping(context, m); -- etnaviv_iommu_context_put(m->context); -- m->context = NULL; -- list_del_init(&m->mmu_node); -+ etnaviv_iommu_reap_mapping(m); - list_del_init(&m->scan_node); - } - -@@ -257,10 +267,7 @@ static int etnaviv_iommu_insert_exact(struct etnaviv_iommu_context *context, - } - - list_for_each_entry_safe(m, n, &scan_list, scan_node) { -- etnaviv_iommu_remove_mapping(context, m); -- etnaviv_iommu_context_put(m->context); -- m->context = NULL; -- list_del_init(&m->mmu_node); -+ etnaviv_iommu_reap_mapping(m); - list_del_init(&m->scan_node); - } - -diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.h b/drivers/gpu/drm/etnaviv/etnaviv_mmu.h -index e4a0b7d09c2ea..c01a147f0dfdd 100644 ---- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.h -+++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.h -@@ -91,6 +91,7 @@ int etnaviv_iommu_map_gem(struct etnaviv_iommu_context *context, - struct etnaviv_vram_mapping *mapping, u64 va); - void etnaviv_iommu_unmap_gem(struct etnaviv_iommu_context *context, - struct etnaviv_vram_mapping *mapping); -+void etnaviv_iommu_reap_mapping(struct etnaviv_vram_mapping *mapping); - - int etnaviv_iommu_get_suballoc_va(struct etnaviv_iommu_context *ctx, - struct etnaviv_vram_mapping *mapping, -diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c -index 75e8cc4337c93..fce69fa446d58 100644 ---- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c -+++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c -@@ -137,9 +137,9 @@ static enum port intel_dsi_seq_port_to_port(struct intel_dsi *intel_dsi, - return ffs(intel_dsi->ports) - 1; - - if (seq_port) { -- if (intel_dsi->ports & PORT_B) -+ if (intel_dsi->ports & BIT(PORT_B)) - return PORT_B; -- else if (intel_dsi->ports & PORT_C) -+ else if (intel_dsi->ports & BIT(PORT_C)) - return PORT_C; - } - -diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c -index 845023c14eb36..f461e34cc5f07 100644 ---- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c -+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c -@@ -729,32 +729,69 @@ static int eb_reserve(struct i915_execbuffer *eb) - bool unpinned; - - /* -- * Attempt to pin all of the buffers into the GTT. -- * This is done in 2 phases: -+ * We have one more buffers that we couldn't bind, which could be due to -+ * various reasons. To resolve this we have 4 passes, with every next -+ * level turning the screws tighter: - * -- * 1. Unbind all objects that do not match the GTT constraints for -- * the execbuffer (fenceable, mappable, alignment etc). -- * 2. Bind new objects. -+ * 0. Unbind all objects that do not match the GTT constraints for the -+ * execbuffer (fenceable, mappable, alignment etc). Bind all new -+ * objects. This avoids unnecessary unbinding of later objects in order -+ * to make room for the earlier objects *unless* we need to defragment. - * -- * This avoid unnecessary unbinding of later objects in order to make -- * room for the earlier objects *unless* we need to defragment. -+ * 1. Reorder the buffers, where objects with the most restrictive -+ * placement requirements go first (ignoring fixed location buffers for -+ * now). For example, objects needing the mappable aperture (the first -+ * 256M of GTT), should go first vs objects that can be placed just -+ * about anywhere. Repeat the previous pass. - * -- * Defragmenting is skipped if all objects are pinned at a fixed location. -+ * 2. Consider buffers that are pinned at a fixed location. Also try to -+ * evict the entire VM this time, leaving only objects that we were -+ * unable to lock. Try again to bind the buffers. (still using the new -+ * buffer order). -+ * -+ * 3. We likely have object lock contention for one or more stubborn -+ * objects in the VM, for which we need to evict to make forward -+ * progress (perhaps we are fighting the shrinker?). When evicting the -+ * VM this time around, anything that we can't lock we now track using -+ * the busy_bo, using the full lock (after dropping the vm->mutex to -+ * prevent deadlocks), instead of trylock. We then continue to evict the -+ * VM, this time with the stubborn object locked, which we can now -+ * hopefully unbind (if still bound in the VM). Repeat until the VM is -+ * evicted. Finally we should be able bind everything. - */ -- for (pass = 0; pass <= 2; pass++) { -+ for (pass = 0; pass <= 3; pass++) { - int pin_flags = PIN_USER | PIN_VALIDATE; - - if (pass == 0) - pin_flags |= PIN_NONBLOCK; - - if (pass >= 1) -- unpinned = eb_unbind(eb, pass == 2); -+ unpinned = eb_unbind(eb, pass >= 2); - - if (pass == 2) { - err = mutex_lock_interruptible(&eb->context->vm->mutex); - if (!err) { -- err = i915_gem_evict_vm(eb->context->vm, &eb->ww); -+ err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL); -+ mutex_unlock(&eb->context->vm->mutex); -+ } -+ if (err) -+ return err; -+ } -+ -+ if (pass == 3) { -+retry: -+ err = mutex_lock_interruptible(&eb->context->vm->mutex); -+ if (!err) { -+ struct drm_i915_gem_object *busy_bo = NULL; -+ -+ err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo); - mutex_unlock(&eb->context->vm->mutex); -+ if (err && busy_bo) { -+ err = i915_gem_object_lock(busy_bo, &eb->ww); -+ i915_gem_object_put(busy_bo); -+ if (!err) -+ goto retry; -+ } - } - if (err) - return err; -diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c -index e63329bc80659..354c1d6dab846 100644 ---- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c -+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c -@@ -369,7 +369,7 @@ retry: - if (vma == ERR_PTR(-ENOSPC)) { - ret = mutex_lock_interruptible(&ggtt->vm.mutex); - if (!ret) { -- ret = i915_gem_evict_vm(&ggtt->vm, &ww); -+ ret = i915_gem_evict_vm(&ggtt->vm, &ww, NULL); - mutex_unlock(&ggtt->vm.mutex); - } - if (ret) -diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c -index 369006c5317f2..a40bc17acead8 100644 ---- a/drivers/gpu/drm/i915/gem/i915_gem_object.c -+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c -@@ -761,6 +761,9 @@ bool i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj) - if (!HAS_FLAT_CCS(to_i915(obj->base.dev))) - return false; - -+ if (obj->flags & I915_BO_ALLOC_CCS_AUX) -+ return true; -+ - for (i = 0; i < obj->mm.n_placements; i++) { - /* Compression is not allowed for the objects with smem placement */ - if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM) -diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h -index d0d6772e6f36a..ab4c2f90a5643 100644 ---- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h -+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h -@@ -327,16 +327,18 @@ struct drm_i915_gem_object { - * dealing with userspace objects the CPU fault handler is free to ignore this. - */ - #define I915_BO_ALLOC_GPU_ONLY BIT(6) -+#define I915_BO_ALLOC_CCS_AUX BIT(7) - #define I915_BO_ALLOC_FLAGS (I915_BO_ALLOC_CONTIGUOUS | \ - I915_BO_ALLOC_VOLATILE | \ - I915_BO_ALLOC_CPU_CLEAR | \ - I915_BO_ALLOC_USER | \ - I915_BO_ALLOC_PM_VOLATILE | \ - I915_BO_ALLOC_PM_EARLY | \ -- I915_BO_ALLOC_GPU_ONLY) --#define I915_BO_READONLY BIT(7) --#define I915_TILING_QUIRK_BIT 8 /* unknown swizzling; do not release! */ --#define I915_BO_PROTECTED BIT(9) -+ I915_BO_ALLOC_GPU_ONLY | \ -+ I915_BO_ALLOC_CCS_AUX) -+#define I915_BO_READONLY BIT(8) -+#define I915_TILING_QUIRK_BIT 9 /* unknown swizzling; do not release! */ -+#define I915_BO_PROTECTED BIT(10) - /** - * @mem_flags - Mutable placement-related flags - * -diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c -index 07e49f22f2de3..7e67742bc65e0 100644 ---- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c -+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c -@@ -50,6 +50,7 @@ static int i915_ttm_backup(struct i915_gem_apply_to_region *apply, - container_of(bo->bdev, typeof(*i915), bdev); - struct drm_i915_gem_object *backup; - struct ttm_operation_ctx ctx = {}; -+ unsigned int flags; - int err = 0; - - if (bo->resource->mem_type == I915_PL_SYSTEM || obj->ttm.backup) -@@ -65,7 +66,22 @@ static int i915_ttm_backup(struct i915_gem_apply_to_region *apply, - if (obj->flags & I915_BO_ALLOC_PM_VOLATILE) - return 0; - -- backup = i915_gem_object_create_shmem(i915, obj->base.size); -+ /* -+ * It seems that we might have some framebuffers still pinned at this -+ * stage, but for such objects we might also need to deal with the CCS -+ * aux state. Make sure we force the save/restore of the CCS state, -+ * otherwise we might observe display corruption, when returning from -+ * suspend. -+ */ -+ flags = 0; -+ if (i915_gem_object_needs_ccs_pages(obj)) { -+ WARN_ON_ONCE(!i915_gem_object_is_framebuffer(obj)); -+ WARN_ON_ONCE(!pm_apply->allow_gpu); -+ -+ flags = I915_BO_ALLOC_CCS_AUX; -+ } -+ backup = i915_gem_object_create_region(i915->mm.regions[INTEL_REGION_SMEM], -+ obj->base.size, 0, flags); - if (IS_ERR(backup)) - return PTR_ERR(backup); - -diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c -index aaaf1906026c1..ee072c7d62eb1 100644 ---- a/drivers/gpu/drm/i915/gt/intel_migrate.c -+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c -@@ -341,6 +341,16 @@ static int emit_no_arbitration(struct i915_request *rq) - return 0; - } - -+static int max_pte_pkt_size(struct i915_request *rq, int pkt) -+{ -+ struct intel_ring *ring = rq->ring; -+ -+ pkt = min_t(int, pkt, (ring->space - rq->reserved_space) / sizeof(u32) + 5); -+ pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5); -+ -+ return pkt; -+} -+ - static int emit_pte(struct i915_request *rq, - struct sgt_dma *it, - enum i915_cache_level cache_level, -@@ -387,8 +397,7 @@ static int emit_pte(struct i915_request *rq, - return PTR_ERR(cs); - - /* Pack as many PTE updates as possible into a single MI command */ -- pkt = min_t(int, dword_length, ring->space / sizeof(u32) + 5); -- pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5); -+ pkt = max_pte_pkt_size(rq, dword_length); - - hdr = cs; - *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */ -@@ -421,8 +430,7 @@ static int emit_pte(struct i915_request *rq, - } - } - -- pkt = min_t(int, dword_rem, ring->space / sizeof(u32) + 5); -- pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5); -+ pkt = max_pte_pkt_size(rq, dword_rem); - - hdr = cs; - *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); -diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c -index f025ee4fa5261..a4b4d9b7d26c7 100644 ---- a/drivers/gpu/drm/i915/i915_gem_evict.c -+++ b/drivers/gpu/drm/i915/i915_gem_evict.c -@@ -416,6 +416,11 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, - * @vm: Address space to cleanse - * @ww: An optional struct i915_gem_ww_ctx. If not NULL, i915_gem_evict_vm - * will be able to evict vma's locked by the ww as well. -+ * @busy_bo: Optional pointer to struct drm_i915_gem_object. If not NULL, then -+ * in the event i915_gem_evict_vm() is unable to trylock an object for eviction, -+ * then @busy_bo will point to it. -EBUSY is also returned. The caller must drop -+ * the vm->mutex, before trying again to acquire the contended lock. The caller -+ * also owns a reference to the object. - * - * This function evicts all vmas from a vm. - * -@@ -425,7 +430,8 @@ int i915_gem_evict_for_node(struct i915_address_space *vm, - * To clarify: This is for freeing up virtual address space, not for freeing - * memory in e.g. the shrinker. - */ --int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) -+int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww, -+ struct drm_i915_gem_object **busy_bo) - { - int ret = 0; - -@@ -457,15 +463,22 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) - * the resv is shared among multiple objects, we still - * need the object ref. - */ -- if (dying_vma(vma) || -+ if (!i915_gem_object_get_rcu(vma->obj) || - (ww && (dma_resv_locking_ctx(vma->obj->base.resv) == &ww->ctx))) { - __i915_vma_pin(vma); - list_add(&vma->evict_link, &locked_eviction_list); - continue; - } - -- if (!i915_gem_object_trylock(vma->obj, ww)) -+ if (!i915_gem_object_trylock(vma->obj, ww)) { -+ if (busy_bo) { -+ *busy_bo = vma->obj; /* holds ref */ -+ ret = -EBUSY; -+ break; -+ } -+ i915_gem_object_put(vma->obj); - continue; -+ } - - __i915_vma_pin(vma); - list_add(&vma->evict_link, &eviction_list); -@@ -473,25 +486,29 @@ int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx *ww) - if (list_empty(&eviction_list) && list_empty(&locked_eviction_list)) - break; - -- ret = 0; - /* Unbind locked objects first, before unlocking the eviction_list */ - list_for_each_entry_safe(vma, vn, &locked_eviction_list, evict_link) { - __i915_vma_unpin(vma); - -- if (ret == 0) -+ if (ret == 0) { - ret = __i915_vma_unbind(vma); -- if (ret != -EINTR) /* "Get me out of here!" */ -- ret = 0; -+ if (ret != -EINTR) /* "Get me out of here!" */ -+ ret = 0; -+ } -+ if (!dying_vma(vma)) -+ i915_gem_object_put(vma->obj); - } - - list_for_each_entry_safe(vma, vn, &eviction_list, evict_link) { - __i915_vma_unpin(vma); -- if (ret == 0) -+ if (ret == 0) { - ret = __i915_vma_unbind(vma); -- if (ret != -EINTR) /* "Get me out of here!" */ -- ret = 0; -+ if (ret != -EINTR) /* "Get me out of here!" */ -+ ret = 0; -+ } - - i915_gem_object_unlock(vma->obj); -+ i915_gem_object_put(vma->obj); - } - } while (ret == 0); - -diff --git a/drivers/gpu/drm/i915/i915_gem_evict.h b/drivers/gpu/drm/i915/i915_gem_evict.h -index e593c530f9bd7..bf0ee0e4fe608 100644 ---- a/drivers/gpu/drm/i915/i915_gem_evict.h -+++ b/drivers/gpu/drm/i915/i915_gem_evict.h -@@ -11,6 +11,7 @@ - struct drm_mm_node; - struct i915_address_space; - struct i915_gem_ww_ctx; -+struct drm_i915_gem_object; - - int __must_check i915_gem_evict_something(struct i915_address_space *vm, - struct i915_gem_ww_ctx *ww, -@@ -23,6 +24,7 @@ int __must_check i915_gem_evict_for_node(struct i915_address_space *vm, - struct drm_mm_node *node, - unsigned int flags); - int i915_gem_evict_vm(struct i915_address_space *vm, -- struct i915_gem_ww_ctx *ww); -+ struct i915_gem_ww_ctx *ww, -+ struct drm_i915_gem_object **busy_bo); - - #endif /* __I915_GEM_EVICT_H__ */ -diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c -index f17c09ead7d77..4d06875de14a1 100644 ---- a/drivers/gpu/drm/i915/i915_vma.c -+++ b/drivers/gpu/drm/i915/i915_vma.c -@@ -1569,7 +1569,7 @@ static int __i915_ggtt_pin(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, - * locked objects when called from execbuf when pinning - * is removed. This would probably regress badly. - */ -- i915_gem_evict_vm(vm, NULL); -+ i915_gem_evict_vm(vm, NULL, NULL); - mutex_unlock(&vm->mutex); - } - } while (1); -diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c -index 8c6517d29b8e0..37068542aafe7 100644 ---- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c -+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c -@@ -344,7 +344,7 @@ static int igt_evict_vm(void *arg) - - /* Everything is pinned, nothing should happen */ - mutex_lock(&ggtt->vm.mutex); -- err = i915_gem_evict_vm(&ggtt->vm, NULL); -+ err = i915_gem_evict_vm(&ggtt->vm, NULL, NULL); - mutex_unlock(&ggtt->vm.mutex); - if (err) { - pr_err("i915_gem_evict_vm on a full GGTT returned err=%d]\n", -@@ -356,7 +356,7 @@ static int igt_evict_vm(void *arg) - - for_i915_gem_ww(&ww, err, false) { - mutex_lock(&ggtt->vm.mutex); -- err = i915_gem_evict_vm(&ggtt->vm, &ww); -+ err = i915_gem_evict_vm(&ggtt->vm, &ww, NULL); - mutex_unlock(&ggtt->vm.mutex); - } - -diff --git a/drivers/gpu/drm/ingenic/ingenic-drm-drv.c b/drivers/gpu/drm/ingenic/ingenic-drm-drv.c -index ab0515d2c420a..4499a04f7c138 100644 ---- a/drivers/gpu/drm/ingenic/ingenic-drm-drv.c -+++ b/drivers/gpu/drm/ingenic/ingenic-drm-drv.c -@@ -1629,7 +1629,11 @@ static int ingenic_drm_init(void) - return err; - } - -- return platform_driver_register(&ingenic_drm_driver); -+ err = platform_driver_register(&ingenic_drm_driver); -+ if (IS_ENABLED(CONFIG_DRM_INGENIC_IPU) && err) -+ platform_driver_unregister(ingenic_ipu_driver_ptr); -+ -+ return err; - } - module_init(ingenic_drm_init); - -diff --git a/drivers/gpu/drm/mgag200/mgag200_g200se.c b/drivers/gpu/drm/mgag200/mgag200_g200se.c -index be389ed91cbd8..bd6e573c9a1a3 100644 ---- a/drivers/gpu/drm/mgag200/mgag200_g200se.c -+++ b/drivers/gpu/drm/mgag200/mgag200_g200se.c -@@ -284,7 +284,8 @@ static void mgag200_g200se_04_pixpllc_atomic_update(struct drm_crtc *crtc, - pixpllcp = pixpllc->p - 1; - pixpllcs = pixpllc->s; - -- xpixpllcm = pixpllcm | ((pixpllcn & BIT(8)) >> 1); -+ // For G200SE A, BIT(7) should be set unconditionally. -+ xpixpllcm = BIT(7) | pixpllcm; - xpixpllcn = pixpllcn; - xpixpllcp = (pixpllcs << 3) | pixpllcp; - -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c -index 214829c32ed87..7a2f262414ad4 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c -@@ -308,7 +308,8 @@ void vmw_kms_cursor_snoop(struct vmw_surface *srf, - if (cmd->dma.guest.ptr.offset % PAGE_SIZE || - box->x != 0 || box->y != 0 || box->z != 0 || - box->srcx != 0 || box->srcy != 0 || box->srcz != 0 || -- box->d != 1 || box_count != 1) { -+ box->d != 1 || box_count != 1 || -+ box->w > 64 || box->h > 64) { - /* TODO handle none page aligned offsets */ - /* TODO handle more dst & src != 0 */ - /* TODO handle more then one copy */ -diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h -index e27fb27a36bfa..82713ef3aaa64 100644 ---- a/drivers/hid/hid-ids.h -+++ b/drivers/hid/hid-ids.h -@@ -412,6 +412,7 @@ - #define USB_DEVICE_ID_HP_X2_10_COVER 0x0755 - #define I2C_DEVICE_ID_HP_ENVY_X360_15 0x2d05 - #define I2C_DEVICE_ID_HP_ENVY_X360_15T_DR100 0x29CF -+#define I2C_DEVICE_ID_HP_ENVY_X360_EU0009NV 0x2CF9 - #define I2C_DEVICE_ID_HP_SPECTRE_X360_15 0x2817 - #define USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN 0x2544 - #define USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN 0x2706 -diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c -index d728a94c642eb..3ee5a9fea20e6 100644 ---- a/drivers/hid/hid-input.c -+++ b/drivers/hid/hid-input.c -@@ -380,6 +380,8 @@ static const struct hid_device_id hid_battery_quirks[] = { - HID_BATTERY_QUIRK_IGNORE }, - { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_ENVY_X360_15T_DR100), - HID_BATTERY_QUIRK_IGNORE }, -+ { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_ENVY_X360_EU0009NV), -+ HID_BATTERY_QUIRK_IGNORE }, - { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_15), - HID_BATTERY_QUIRK_IGNORE }, - { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN), -diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c -index 1a2d425bf5687..34029d1161073 100644 ---- a/drivers/iommu/amd/init.c -+++ b/drivers/iommu/amd/init.c -@@ -3402,18 +3402,24 @@ static int __init parse_amd_iommu_options(char *str) - static int __init parse_ivrs_ioapic(char *str) - { - u32 seg = 0, bus, dev, fn; -- int ret, id, i; -+ int id, i; - u32 devid; - -- ret = sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn); -- if (ret != 4) { -- ret = sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn); -- if (ret != 5) { -- pr_err("Invalid command line: ivrs_ioapic%s\n", str); -- return 1; -- } -+ if (sscanf(str, "=%d@%x:%x.%x", &id, &bus, &dev, &fn) == 4 || -+ sscanf(str, "=%d@%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) -+ goto found; -+ -+ if (sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn) == 4 || -+ sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) { -+ pr_warn("ivrs_ioapic%s option format deprecated; use ivrs_ioapic=%d@%04x:%02x:%02x.%d instead\n", -+ str, id, seg, bus, dev, fn); -+ goto found; - } - -+ pr_err("Invalid command line: ivrs_ioapic%s\n", str); -+ return 1; -+ -+found: - if (early_ioapic_map_size == EARLY_MAP_SIZE) { - pr_err("Early IOAPIC map overflow - ignoring ivrs_ioapic%s\n", - str); -@@ -3434,18 +3440,24 @@ static int __init parse_ivrs_ioapic(char *str) - static int __init parse_ivrs_hpet(char *str) - { - u32 seg = 0, bus, dev, fn; -- int ret, id, i; -+ int id, i; - u32 devid; - -- ret = sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn); -- if (ret != 4) { -- ret = sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn); -- if (ret != 5) { -- pr_err("Invalid command line: ivrs_hpet%s\n", str); -- return 1; -- } -+ if (sscanf(str, "=%d@%x:%x.%x", &id, &bus, &dev, &fn) == 4 || -+ sscanf(str, "=%d@%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) -+ goto found; -+ -+ if (sscanf(str, "[%d]=%x:%x.%x", &id, &bus, &dev, &fn) == 4 || -+ sscanf(str, "[%d]=%x:%x:%x.%x", &id, &seg, &bus, &dev, &fn) == 5) { -+ pr_warn("ivrs_hpet%s option format deprecated; use ivrs_hpet=%d@%04x:%02x:%02x.%d instead\n", -+ str, id, seg, bus, dev, fn); -+ goto found; - } - -+ pr_err("Invalid command line: ivrs_hpet%s\n", str); -+ return 1; -+ -+found: - if (early_hpet_map_size == EARLY_MAP_SIZE) { - pr_err("Early HPET map overflow - ignoring ivrs_hpet%s\n", - str); -@@ -3466,19 +3478,36 @@ static int __init parse_ivrs_hpet(char *str) - static int __init parse_ivrs_acpihid(char *str) - { - u32 seg = 0, bus, dev, fn; -- char *hid, *uid, *p; -+ char *hid, *uid, *p, *addr; - char acpiid[ACPIHID_UID_LEN + ACPIHID_HID_LEN] = {0}; -- int ret, i; -- -- ret = sscanf(str, "[%x:%x.%x]=%s", &bus, &dev, &fn, acpiid); -- if (ret != 4) { -- ret = sscanf(str, "[%x:%x:%x.%x]=%s", &seg, &bus, &dev, &fn, acpiid); -- if (ret != 5) { -- pr_err("Invalid command line: ivrs_acpihid(%s)\n", str); -- return 1; -+ int i; -+ -+ addr = strchr(str, '@'); -+ if (!addr) { -+ if (sscanf(str, "[%x:%x.%x]=%s", &bus, &dev, &fn, acpiid) == 4 || -+ sscanf(str, "[%x:%x:%x.%x]=%s", &seg, &bus, &dev, &fn, acpiid) == 5) { -+ pr_warn("ivrs_acpihid%s option format deprecated; use ivrs_acpihid=%s@%04x:%02x:%02x.%d instead\n", -+ str, acpiid, seg, bus, dev, fn); -+ goto found; - } -+ goto not_found; - } - -+ /* We have the '@', make it the terminator to get just the acpiid */ -+ *addr++ = 0; -+ -+ if (sscanf(str, "=%s", acpiid) != 1) -+ goto not_found; -+ -+ if (sscanf(addr, "%x:%x.%x", &bus, &dev, &fn) == 3 || -+ sscanf(addr, "%x:%x:%x.%x", &seg, &bus, &dev, &fn) == 4) -+ goto found; -+ -+not_found: -+ pr_err("Invalid command line: ivrs_acpihid%s\n", str); -+ return 1; -+ -+found: - p = acpiid; - hid = strsep(&p, ":"); - uid = p; -@@ -3488,6 +3517,13 @@ static int __init parse_ivrs_acpihid(char *str) - return 1; - } - -+ /* -+ * Ignore leading zeroes after ':', so e.g., AMDI0095:00 -+ * will match AMDI0095:0 in the second strcmp in acpi_dev_hid_uid_match -+ */ -+ while (*uid == '0' && *(uid + 1)) -+ uid++; -+ - i = early_acpihid_map_size++; - memcpy(early_acpihid_map[i].hid, hid, strlen(hid)); - memcpy(early_acpihid_map[i].uid, uid, strlen(uid)); -diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c -index ab13b73802650..83a5975bcc729 100644 ---- a/drivers/md/dm-cache-metadata.c -+++ b/drivers/md/dm-cache-metadata.c -@@ -551,11 +551,13 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, - return r; - } - --static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd) -+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd, -+ bool destroy_bm) - { - dm_sm_destroy(cmd->metadata_sm); - dm_tm_destroy(cmd->tm); -- dm_block_manager_destroy(cmd->bm); -+ if (destroy_bm) -+ dm_block_manager_destroy(cmd->bm); - } - - typedef unsigned long (*flags_mutator)(unsigned long); -@@ -826,7 +828,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, - cmd2 = lookup(bdev); - if (cmd2) { - mutex_unlock(&table_lock); -- __destroy_persistent_data_objects(cmd); -+ __destroy_persistent_data_objects(cmd, true); - kfree(cmd); - return cmd2; - } -@@ -874,7 +876,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) - mutex_unlock(&table_lock); - - if (!cmd->fail_io) -- __destroy_persistent_data_objects(cmd); -+ __destroy_persistent_data_objects(cmd, true); - kfree(cmd); - } - } -@@ -1807,14 +1809,52 @@ int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result) - - int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) - { -- int r; -+ int r = -EINVAL; -+ struct dm_block_manager *old_bm = NULL, *new_bm = NULL; -+ -+ /* fail_io is double-checked with cmd->root_lock held below */ -+ if (unlikely(cmd->fail_io)) -+ return r; -+ -+ /* -+ * Replacement block manager (new_bm) is created and old_bm destroyed outside of -+ * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of -+ * shrinker associated with the block manager's bufio client vs cmd root_lock). -+ * - must take shrinker_rwsem without holding cmd->root_lock -+ */ -+ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, -+ CACHE_MAX_CONCURRENT_LOCKS); - - WRITE_LOCK(cmd); -- __destroy_persistent_data_objects(cmd); -- r = __create_persistent_data_objects(cmd, false); -+ if (cmd->fail_io) { -+ WRITE_UNLOCK(cmd); -+ goto out; -+ } -+ -+ __destroy_persistent_data_objects(cmd, false); -+ old_bm = cmd->bm; -+ if (IS_ERR(new_bm)) { -+ DMERR("could not create block manager during abort"); -+ cmd->bm = NULL; -+ r = PTR_ERR(new_bm); -+ goto out_unlock; -+ } -+ -+ cmd->bm = new_bm; -+ r = __open_or_format_metadata(cmd, false); -+ if (r) { -+ cmd->bm = NULL; -+ goto out_unlock; -+ } -+ new_bm = NULL; -+out_unlock: - if (r) - cmd->fail_io = true; - WRITE_UNLOCK(cmd); -+ dm_block_manager_destroy(old_bm); -+out: -+ if (new_bm && !IS_ERR(new_bm)) -+ dm_block_manager_destroy(new_bm); - - return r; - } -diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c -index 54a8d5c9a44ea..5e92fac90b675 100644 ---- a/drivers/md/dm-cache-target.c -+++ b/drivers/md/dm-cache-target.c -@@ -907,16 +907,16 @@ static void abort_transaction(struct cache *cache) - if (get_cache_mode(cache) >= CM_READ_ONLY) - return; - -- if (dm_cache_metadata_set_needs_check(cache->cmd)) { -- DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); -- set_cache_mode(cache, CM_FAIL); -- } -- - DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); - if (dm_cache_metadata_abort(cache->cmd)) { - DMERR("%s: failed to abort metadata transaction", dev_name); - set_cache_mode(cache, CM_FAIL); - } -+ -+ if (dm_cache_metadata_set_needs_check(cache->cmd)) { -+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); -+ set_cache_mode(cache, CM_FAIL); -+ } - } - - static void metadata_operation_failed(struct cache *cache, const char *op, int r) -@@ -1887,6 +1887,7 @@ static void destroy(struct cache *cache) - if (cache->prison) - dm_bio_prison_destroy_v2(cache->prison); - -+ cancel_delayed_work_sync(&cache->waker); - if (cache->wq) - destroy_workqueue(cache->wq); - -diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c -index 2f1cc66d26412..29e0b85eeaf09 100644 ---- a/drivers/md/dm-clone-target.c -+++ b/drivers/md/dm-clone-target.c -@@ -1958,6 +1958,7 @@ static void clone_dtr(struct dm_target *ti) - - mempool_exit(&clone->hydration_pool); - dm_kcopyd_client_destroy(clone->kcopyd_client); -+ cancel_delayed_work_sync(&clone->waker); - destroy_workqueue(clone->wq); - hash_table_exit(clone); - dm_clone_metadata_close(clone->cmd); -diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c -index e97e9f97456d4..1388ee35571e0 100644 ---- a/drivers/md/dm-integrity.c -+++ b/drivers/md/dm-integrity.c -@@ -4558,6 +4558,8 @@ static void dm_integrity_dtr(struct dm_target *ti) - BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); - BUG_ON(!list_empty(&ic->wait_list)); - -+ if (ic->mode == 'B') -+ cancel_delayed_work_sync(&ic->bitmap_flush_work); - if (ic->metadata_wq) - destroy_workqueue(ic->metadata_wq); - if (ic->wait_wq) -diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c -index a27395c8621ff..6bcc4c4786d89 100644 ---- a/drivers/md/dm-thin-metadata.c -+++ b/drivers/md/dm-thin-metadata.c -@@ -724,6 +724,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd) - goto bad_cleanup_data_sm; - } - -+ /* -+ * For pool metadata opening process, root setting is redundant -+ * because it will be set again in __begin_transaction(). But dm -+ * pool aborting process really needs to get last transaction's -+ * root to avoid accessing broken btree. -+ */ -+ pmd->root = le64_to_cpu(disk_super->data_mapping_root); -+ pmd->details_root = le64_to_cpu(disk_super->device_details_root); -+ - __setup_btree_details(pmd); - dm_bm_unlock(sblock); - -@@ -776,13 +785,15 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f - return r; - } - --static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd) -+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd, -+ bool destroy_bm) - { - dm_sm_destroy(pmd->data_sm); - dm_sm_destroy(pmd->metadata_sm); - dm_tm_destroy(pmd->nb_tm); - dm_tm_destroy(pmd->tm); -- dm_block_manager_destroy(pmd->bm); -+ if (destroy_bm) -+ dm_block_manager_destroy(pmd->bm); - } - - static int __begin_transaction(struct dm_pool_metadata *pmd) -@@ -989,7 +1000,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) - } - pmd_write_unlock(pmd); - if (!pmd->fail_io) -- __destroy_persistent_data_objects(pmd); -+ __destroy_persistent_data_objects(pmd, true); - - kfree(pmd); - return 0; -@@ -1860,19 +1871,52 @@ static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd) - int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) - { - int r = -EINVAL; -+ struct dm_block_manager *old_bm = NULL, *new_bm = NULL; -+ -+ /* fail_io is double-checked with pmd->root_lock held below */ -+ if (unlikely(pmd->fail_io)) -+ return r; -+ -+ /* -+ * Replacement block manager (new_bm) is created and old_bm destroyed outside of -+ * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of -+ * shrinker associated with the block manager's bufio client vs pmd root_lock). -+ * - must take shrinker_rwsem without holding pmd->root_lock -+ */ -+ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, -+ THIN_MAX_CONCURRENT_LOCKS); - - pmd_write_lock(pmd); -- if (pmd->fail_io) -+ if (pmd->fail_io) { -+ pmd_write_unlock(pmd); - goto out; -+ } - - __set_abort_with_changes_flags(pmd); -- __destroy_persistent_data_objects(pmd); -- r = __create_persistent_data_objects(pmd, false); -+ __destroy_persistent_data_objects(pmd, false); -+ old_bm = pmd->bm; -+ if (IS_ERR(new_bm)) { -+ DMERR("could not create block manager during abort"); -+ pmd->bm = NULL; -+ r = PTR_ERR(new_bm); -+ goto out_unlock; -+ } -+ -+ pmd->bm = new_bm; -+ r = __open_or_format_metadata(pmd, false); -+ if (r) { -+ pmd->bm = NULL; -+ goto out_unlock; -+ } -+ new_bm = NULL; -+out_unlock: - if (r) - pmd->fail_io = true; -- --out: - pmd_write_unlock(pmd); -+ dm_block_manager_destroy(old_bm); -+out: -+ if (new_bm && !IS_ERR(new_bm)) -+ dm_block_manager_destroy(new_bm); - - return r; - } -diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c -index e76c96c760a9b..196f82559ad6b 100644 ---- a/drivers/md/dm-thin.c -+++ b/drivers/md/dm-thin.c -@@ -2889,6 +2889,8 @@ static void __pool_destroy(struct pool *pool) - dm_bio_prison_destroy(pool->prison); - dm_kcopyd_client_destroy(pool->copier); - -+ cancel_delayed_work_sync(&pool->waker); -+ cancel_delayed_work_sync(&pool->no_space_timeout); - if (pool->wq) - destroy_workqueue(pool->wq); - -@@ -3540,20 +3542,28 @@ static int pool_preresume(struct dm_target *ti) - */ - r = bind_control_target(pool, ti); - if (r) -- return r; -+ goto out; - - r = maybe_resize_data_dev(ti, &need_commit1); - if (r) -- return r; -+ goto out; - - r = maybe_resize_metadata_dev(ti, &need_commit2); - if (r) -- return r; -+ goto out; - - if (need_commit1 || need_commit2) - (void) commit(pool); -+out: -+ /* -+ * When a thin-pool is PM_FAIL, it cannot be rebuilt if -+ * bio is in deferred list. Therefore need to return 0 -+ * to allow pool_resume() to flush IO. -+ */ -+ if (r && get_pool_mode(pool) == PM_FAIL) -+ r = 0; - -- return 0; -+ return r; - } - - static void pool_suspend_active_thins(struct pool *pool) -diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c -index 63ece30114e53..e7cc6ba1b657f 100644 ---- a/drivers/md/md-bitmap.c -+++ b/drivers/md/md-bitmap.c -@@ -486,7 +486,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap) - sb = kmap_atomic(bitmap->storage.sb_page); - pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); - pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); -- pr_debug(" version: %d\n", le32_to_cpu(sb->version)); -+ pr_debug(" version: %u\n", le32_to_cpu(sb->version)); - pr_debug(" uuid: %08x.%08x.%08x.%08x\n", - le32_to_cpu(*(__le32 *)(sb->uuid+0)), - le32_to_cpu(*(__le32 *)(sb->uuid+4)), -@@ -497,11 +497,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap) - pr_debug("events cleared: %llu\n", - (unsigned long long) le64_to_cpu(sb->events_cleared)); - pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); -- pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); -- pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); -+ pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize)); -+ pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep)); - pr_debug(" sync size: %llu KB\n", - (unsigned long long)le64_to_cpu(sb->sync_size)/2); -- pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); -+ pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb); - } - -@@ -2105,7 +2105,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - bytes = DIV_ROUND_UP(chunks, 8); - if (!bitmap->mddev->bitmap_info.external) - bytes += sizeof(bitmap_super_t); -- } while (bytes > (space << 9)); -+ } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) < -+ (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1)); - } else - chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; - -@@ -2150,7 +2151,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - bitmap->counts.missing_pages = pages; - bitmap->counts.chunkshift = chunkshift; - bitmap->counts.chunks = chunks; -- bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + -+ bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift + - BITMAP_BLOCK_SHIFT); - - blocks = min(old_counts.chunks << old_counts.chunkshift, -@@ -2176,8 +2177,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, - bitmap->counts.missing_pages = old_counts.pages; - bitmap->counts.chunkshift = old_counts.chunkshift; - bitmap->counts.chunks = old_counts.chunks; -- bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + -- BITMAP_BLOCK_SHIFT); -+ bitmap->mddev->bitmap_info.chunksize = -+ 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); - blocks = old_counts.chunks << old_counts.chunkshift; - pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); - break; -@@ -2537,6 +2538,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len) - if (csize < 512 || - !is_power_of_2(csize)) - return -EINVAL; -+ if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE * -+ sizeof(((bitmap_super_t *)0)->chunksize)))) -+ return -EOVERFLOW; - mddev->bitmap_info.chunksize = csize; - return len; - } -diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c -index f6ee678107d37..9ce5f010de3f8 100644 ---- a/drivers/media/dvb-core/dmxdev.c -+++ b/drivers/media/dvb-core/dmxdev.c -@@ -790,6 +790,11 @@ static int dvb_demux_open(struct inode *inode, struct file *file) - if (mutex_lock_interruptible(&dmxdev->mutex)) - return -ERESTARTSYS; - -+ if (dmxdev->exit) { -+ mutex_unlock(&dmxdev->mutex); -+ return -ENODEV; -+ } -+ - for (i = 0; i < dmxdev->filternum; i++) - if (dmxdev->filter[i].state == DMXDEV_STATE_FREE) - break; -@@ -1448,7 +1453,10 @@ EXPORT_SYMBOL(dvb_dmxdev_init); - - void dvb_dmxdev_release(struct dmxdev *dmxdev) - { -+ mutex_lock(&dmxdev->mutex); - dmxdev->exit = 1; -+ mutex_unlock(&dmxdev->mutex); -+ - if (dmxdev->dvbdev->users > 1) { - wait_event(dmxdev->dvbdev->wait_queue, - dmxdev->dvbdev->users == 1); -diff --git a/drivers/media/dvb-core/dvbdev.c b/drivers/media/dvb-core/dvbdev.c -index 9934728734af9..a31d52cb6d62c 100644 ---- a/drivers/media/dvb-core/dvbdev.c -+++ b/drivers/media/dvb-core/dvbdev.c -@@ -335,6 +335,7 @@ static int dvb_create_media_entity(struct dvb_device *dvbdev, - GFP_KERNEL); - if (!dvbdev->pads) { - kfree(dvbdev->entity); -+ dvbdev->entity = NULL; - return -ENOMEM; - } - } -diff --git a/drivers/media/dvb-frontends/stv0288.c b/drivers/media/dvb-frontends/stv0288.c -index 3d54a0ec86afd..3ae1f3a2f1420 100644 ---- a/drivers/media/dvb-frontends/stv0288.c -+++ b/drivers/media/dvb-frontends/stv0288.c -@@ -440,9 +440,8 @@ static int stv0288_set_frontend(struct dvb_frontend *fe) - struct stv0288_state *state = fe->demodulator_priv; - struct dtv_frontend_properties *c = &fe->dtv_property_cache; - -- char tm; -- unsigned char tda[3]; -- u8 reg, time_out = 0; -+ u8 tda[3], reg, time_out = 0; -+ s8 tm; - - dprintk("%s : FE_SET_FRONTEND\n", __func__); - -diff --git a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c -index 72d70984e99a6..6d3c92045c05f 100644 ---- a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c -+++ b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_ctrl.c -@@ -468,8 +468,10 @@ void s5p_mfc_close_mfc_inst(struct s5p_mfc_dev *dev, struct s5p_mfc_ctx *ctx) - s5p_mfc_hw_call(dev->mfc_ops, try_run, dev); - /* Wait until instance is returned or timeout occurred */ - if (s5p_mfc_wait_for_done_ctx(ctx, -- S5P_MFC_R2H_CMD_CLOSE_INSTANCE_RET, 0)) -+ S5P_MFC_R2H_CMD_CLOSE_INSTANCE_RET, 0)){ -+ clear_work_bit_irqsave(ctx); - mfc_err("Err returning instance\n"); -+ } - - /* Free resources */ - s5p_mfc_hw_call(dev->mfc_ops, release_codec_buffers, ctx); -diff --git a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_enc.c b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_enc.c -index b65e506665af7..f62703cebb77c 100644 ---- a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_enc.c -+++ b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_enc.c -@@ -1218,6 +1218,7 @@ static int enc_post_frame_start(struct s5p_mfc_ctx *ctx) - unsigned long mb_y_addr, mb_c_addr; - int slice_type; - unsigned int strm_size; -+ bool src_ready; - - slice_type = s5p_mfc_hw_call(dev->mfc_ops, get_enc_slice_type, dev); - strm_size = s5p_mfc_hw_call(dev->mfc_ops, get_enc_strm_size, dev); -@@ -1257,7 +1258,8 @@ static int enc_post_frame_start(struct s5p_mfc_ctx *ctx) - } - } - } -- if ((ctx->src_queue_cnt > 0) && (ctx->state == MFCINST_RUNNING)) { -+ if (ctx->src_queue_cnt > 0 && (ctx->state == MFCINST_RUNNING || -+ ctx->state == MFCINST_FINISHING)) { - mb_entry = list_entry(ctx->src_queue.next, struct s5p_mfc_buf, - list); - if (mb_entry->flags & MFC_BUF_FLAG_USED) { -@@ -1288,7 +1290,13 @@ static int enc_post_frame_start(struct s5p_mfc_ctx *ctx) - vb2_set_plane_payload(&mb_entry->b->vb2_buf, 0, strm_size); - vb2_buffer_done(&mb_entry->b->vb2_buf, VB2_BUF_STATE_DONE); - } -- if ((ctx->src_queue_cnt == 0) || (ctx->dst_queue_cnt == 0)) -+ -+ src_ready = true; -+ if (ctx->state == MFCINST_RUNNING && ctx->src_queue_cnt == 0) -+ src_ready = false; -+ if (ctx->state == MFCINST_FINISHING && ctx->ref_queue_cnt == 0) -+ src_ready = false; -+ if (!src_ready || ctx->dst_queue_cnt == 0) - clear_work_bit(ctx); - - return 0; -diff --git a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c -index 8227004f67469..c0df5ac9fcff2 100644 ---- a/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c -+++ b/drivers/media/platform/samsung/s5p-mfc/s5p_mfc_opr_v6.c -@@ -1060,7 +1060,7 @@ static int s5p_mfc_set_enc_params_h264(struct s5p_mfc_ctx *ctx) - } - - /* aspect ratio VUI */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x1 << 5); - reg |= ((p_h264->vui_sar & 0x1) << 5); - writel(reg, mfc_regs->e_h264_options); -@@ -1083,7 +1083,7 @@ static int s5p_mfc_set_enc_params_h264(struct s5p_mfc_ctx *ctx) - - /* intra picture period for H.264 open GOP */ - /* control */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x1 << 4); - reg |= ((p_h264->open_gop & 0x1) << 4); - writel(reg, mfc_regs->e_h264_options); -@@ -1097,23 +1097,23 @@ static int s5p_mfc_set_enc_params_h264(struct s5p_mfc_ctx *ctx) - } - - /* 'WEIGHTED_BI_PREDICTION' for B is disable */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x3 << 9); - writel(reg, mfc_regs->e_h264_options); - - /* 'CONSTRAINED_INTRA_PRED_ENABLE' is disable */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x1 << 14); - writel(reg, mfc_regs->e_h264_options); - - /* ASO */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x1 << 6); - reg |= ((p_h264->aso & 0x1) << 6); - writel(reg, mfc_regs->e_h264_options); - - /* hier qp enable */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x1 << 8); - reg |= ((p_h264->open_gop & 0x1) << 8); - writel(reg, mfc_regs->e_h264_options); -@@ -1134,7 +1134,7 @@ static int s5p_mfc_set_enc_params_h264(struct s5p_mfc_ctx *ctx) - writel(reg, mfc_regs->e_h264_num_t_layer); - - /* frame packing SEI generation */ -- readl(mfc_regs->e_h264_options); -+ reg = readl(mfc_regs->e_h264_options); - reg &= ~(0x1 << 25); - reg |= ((p_h264->sei_frame_packing & 0x1) << 25); - writel(reg, mfc_regs->e_h264_options); -diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c -index bec3f9e3cd3fa..525f979e2a974 100644 ---- a/drivers/mmc/host/sdhci-sprd.c -+++ b/drivers/mmc/host/sdhci-sprd.c -@@ -228,13 +228,15 @@ static inline void _sdhci_sprd_set_clock(struct sdhci_host *host, - div = ((div & 0x300) >> 2) | ((div & 0xFF) << 8); - sdhci_enable_clk(host, div); - -- /* enable auto gate sdhc_enable_auto_gate */ -- val = sdhci_readl(host, SDHCI_SPRD_REG_32_BUSY_POSI); -- mask = SDHCI_SPRD_BIT_OUTR_CLK_AUTO_EN | -- SDHCI_SPRD_BIT_INNR_CLK_AUTO_EN; -- if (mask != (val & mask)) { -- val |= mask; -- sdhci_writel(host, val, SDHCI_SPRD_REG_32_BUSY_POSI); -+ /* Enable CLK_AUTO when the clock is greater than 400K. */ -+ if (clk > 400000) { -+ val = sdhci_readl(host, SDHCI_SPRD_REG_32_BUSY_POSI); -+ mask = SDHCI_SPRD_BIT_OUTR_CLK_AUTO_EN | -+ SDHCI_SPRD_BIT_INNR_CLK_AUTO_EN; -+ if (mask != (val & mask)) { -+ val |= mask; -+ sdhci_writel(host, val, SDHCI_SPRD_REG_32_BUSY_POSI); -+ } - } - } - -diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c -index 0cf1a1797ea32..2e0655c0b606f 100644 ---- a/drivers/mtd/spi-nor/core.c -+++ b/drivers/mtd/spi-nor/core.c -@@ -1184,6 +1184,8 @@ spi_nor_find_best_erase_type(const struct spi_nor_erase_map *map, - continue; - - erase = &map->erase_type[i]; -+ if (!erase->size) -+ continue; - - /* Alignment is not mandatory for overlaid regions */ - if (region->offset & SNOR_OVERLAID_REGION && -diff --git a/drivers/mtd/spi-nor/gigadevice.c b/drivers/mtd/spi-nor/gigadevice.c -index 119b38e6fc2a3..d57ddaf1525b3 100644 ---- a/drivers/mtd/spi-nor/gigadevice.c -+++ b/drivers/mtd/spi-nor/gigadevice.c -@@ -8,19 +8,29 @@ - - #include "core.h" - --static void gd25q256_default_init(struct spi_nor *nor) -+static int -+gd25q256_post_bfpt(struct spi_nor *nor, -+ const struct sfdp_parameter_header *bfpt_header, -+ const struct sfdp_bfpt *bfpt) - { - /* -- * Some manufacturer like GigaDevice may use different -- * bit to set QE on different memories, so the MFR can't -- * indicate the quad_enable method for this case, we need -- * to set it in the default_init fixup hook. -+ * GD25Q256C supports the first version of JESD216 which does not define -+ * the Quad Enable methods. Overwrite the default Quad Enable method. -+ * -+ * GD25Q256 GENERATION | SFDP MAJOR VERSION | SFDP MINOR VERSION -+ * GD25Q256C | SFDP_JESD216_MAJOR | SFDP_JESD216_MINOR -+ * GD25Q256D | SFDP_JESD216_MAJOR | SFDP_JESD216B_MINOR -+ * GD25Q256E | SFDP_JESD216_MAJOR | SFDP_JESD216B_MINOR - */ -- nor->params->quad_enable = spi_nor_sr1_bit6_quad_enable; -+ if (bfpt_header->major == SFDP_JESD216_MAJOR && -+ bfpt_header->minor == SFDP_JESD216_MINOR) -+ nor->params->quad_enable = spi_nor_sr1_bit6_quad_enable; -+ -+ return 0; - } - - static const struct spi_nor_fixups gd25q256_fixups = { -- .default_init = gd25q256_default_init, -+ .post_bfpt = gd25q256_post_bfpt, - }; - - static const struct flash_info gigadevice_nor_parts[] = { -diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c -index 33f723a9f471b..b4e0fc7f65bdf 100644 ---- a/drivers/net/ethernet/renesas/ravb_main.c -+++ b/drivers/net/ethernet/renesas/ravb_main.c -@@ -2903,12 +2903,12 @@ static int ravb_remove(struct platform_device *pdev) - priv->desc_bat_dma); - /* Set reset mode */ - ravb_write(ndev, CCC_OPC_RESET, CCC); -- pm_runtime_put_sync(&pdev->dev); - unregister_netdev(ndev); - if (info->nc_queues) - netif_napi_del(&priv->napi[RAVB_NC]); - netif_napi_del(&priv->napi[RAVB_BE]); - ravb_mdio_release(priv); -+ pm_runtime_put_sync(&pdev->dev); - pm_runtime_disable(&pdev->dev); - reset_control_assert(priv->rstc); - free_netdev(ndev); -diff --git a/drivers/net/wireless/microchip/wilc1000/sdio.c b/drivers/net/wireless/microchip/wilc1000/sdio.c -index 7390f94cd4ca2..a05bda7b9a3ba 100644 ---- a/drivers/net/wireless/microchip/wilc1000/sdio.c -+++ b/drivers/net/wireless/microchip/wilc1000/sdio.c -@@ -20,6 +20,7 @@ static const struct sdio_device_id wilc_sdio_ids[] = { - { SDIO_DEVICE(SDIO_VENDOR_ID_MICROCHIP_WILC, SDIO_DEVICE_ID_MICROCHIP_WILC1000) }, - { }, - }; -+MODULE_DEVICE_TABLE(sdio, wilc_sdio_ids); - - #define WILC_SDIO_BLOCK_SIZE 512 - -diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c -index e6c01db393f95..f26d2ba8a3715 100644 ---- a/drivers/of/kexec.c -+++ b/drivers/of/kexec.c -@@ -281,7 +281,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, - const char *cmdline, size_t extra_fdt_size) - { - void *fdt; -- int ret, chosen_node; -+ int ret, chosen_node, len; - const void *prop; - size_t fdt_size; - -@@ -324,19 +324,19 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, - goto out; - - /* Did we boot using an initrd? */ -- prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL); -+ prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", &len); - if (prop) { - u64 tmp_start, tmp_end, tmp_size; - -- tmp_start = fdt64_to_cpu(*((const fdt64_t *) prop)); -+ tmp_start = of_read_number(prop, len / 4); - -- prop = fdt_getprop(fdt, chosen_node, "linux,initrd-end", NULL); -+ prop = fdt_getprop(fdt, chosen_node, "linux,initrd-end", &len); - if (!prop) { - ret = -EINVAL; - goto out; - } - -- tmp_end = fdt64_to_cpu(*((const fdt64_t *) prop)); -+ tmp_end = of_read_number(prop, len / 4); - - /* - * kexec reserves exact initrd size, while firmware may -diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c -index d4be9d2ee74d9..8bdc5e043831c 100644 ---- a/drivers/parisc/led.c -+++ b/drivers/parisc/led.c -@@ -137,6 +137,9 @@ static int start_task(void) - - /* Create the work queue and queue the LED task */ - led_wq = create_singlethread_workqueue("led_wq"); -+ if (!led_wq) -+ return -ENOMEM; -+ - queue_delayed_work(led_wq, &led_task, 0); - - return 0; -diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c -index e402f05068a53..66d9ab2886468 100644 ---- a/drivers/pci/doe.c -+++ b/drivers/pci/doe.c -@@ -29,6 +29,9 @@ - #define PCI_DOE_FLAG_CANCEL 0 - #define PCI_DOE_FLAG_DEAD 1 - -+/* Max data object length is 2^18 dwords */ -+#define PCI_DOE_MAX_LENGTH (1 << 18) -+ - /** - * struct pci_doe_mb - State for a single DOE mailbox - * -@@ -107,6 +110,7 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb, - { - struct pci_dev *pdev = doe_mb->pdev; - int offset = doe_mb->cap_offset; -+ size_t length; - u32 val; - int i; - -@@ -123,15 +127,20 @@ static int pci_doe_send_req(struct pci_doe_mb *doe_mb, - if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) - return -EIO; - -+ /* Length is 2 DW of header + length of payload in DW */ -+ length = 2 + task->request_pl_sz / sizeof(u32); -+ if (length > PCI_DOE_MAX_LENGTH) -+ return -EIO; -+ if (length == PCI_DOE_MAX_LENGTH) -+ length = 0; -+ - /* Write DOE Header */ - val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) | - FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type); - pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val); -- /* Length is 2 DW of header + length of payload in DW */ - pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, - FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, -- 2 + task->request_pl_sz / -- sizeof(u32))); -+ length)); - for (i = 0; i < task->request_pl_sz / sizeof(u32); i++) - pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, - task->request_pl[i]); -@@ -178,7 +187,10 @@ static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *tas - pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); - - length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val); -- if (length > SZ_1M || length < 2) -+ /* A value of 0x0 indicates max data object length */ -+ if (!length) -+ length = PCI_DOE_MAX_LENGTH; -+ if (length < 2) - return -EIO; - - /* First 2 dwords have already been read */ -diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c -index 0a2eeb82cebde..ba38fc47d35e9 100644 ---- a/drivers/pci/pci-sysfs.c -+++ b/drivers/pci/pci-sysfs.c -@@ -1175,11 +1175,9 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) - - sysfs_bin_attr_init(res_attr); - if (write_combine) { -- pdev->res_attr_wc[num] = res_attr; - sprintf(res_attr_name, "resource%d_wc", num); - res_attr->mmap = pci_mmap_resource_wc; - } else { -- pdev->res_attr[num] = res_attr; - sprintf(res_attr_name, "resource%d", num); - if (pci_resource_flags(pdev, num) & IORESOURCE_IO) { - res_attr->read = pci_read_resource_io; -@@ -1197,10 +1195,17 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) - res_attr->size = pci_resource_len(pdev, num); - res_attr->private = (void *)(unsigned long)num; - retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); -- if (retval) -+ if (retval) { - kfree(res_attr); -+ return retval; -+ } -+ -+ if (write_combine) -+ pdev->res_attr_wc[num] = res_attr; -+ else -+ pdev->res_attr[num] = res_attr; - -- return retval; -+ return 0; - } - - /** -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index 2127aba3550b5..ab615ab4e4409 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -6447,6 +6447,8 @@ bool pci_device_is_present(struct pci_dev *pdev) - { - u32 v; - -+ /* Check PF if pdev is a VF, since VF Vendor/Device IDs are 0xffff */ -+ pdev = pci_physfn(pdev); - if (pci_dev_is_disconnected(pdev)) - return false; - return pci_bus_read_dev_vendor_id(pdev->bus, pdev->devfn, &v, 0); -diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c -index ba9d761ec49a7..91f8ee79000df 100644 ---- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c -+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c -@@ -1121,9 +1121,46 @@ static const struct qmp_phy_cfg sdm845_usb3phy_cfg = { - .pwrdn_delay_max = POWER_DOWN_DELAY_US_MAX, - }; - -+static const struct qmp_phy_cfg sdm845_dpphy_cfg = { -+ .type = PHY_TYPE_DP, -+ .lanes = 2, -+ -+ .serdes_tbl = qmp_v3_dp_serdes_tbl, -+ .serdes_tbl_num = ARRAY_SIZE(qmp_v3_dp_serdes_tbl), -+ .tx_tbl = qmp_v3_dp_tx_tbl, -+ .tx_tbl_num = ARRAY_SIZE(qmp_v3_dp_tx_tbl), -+ -+ .serdes_tbl_rbr = qmp_v3_dp_serdes_tbl_rbr, -+ .serdes_tbl_rbr_num = ARRAY_SIZE(qmp_v3_dp_serdes_tbl_rbr), -+ .serdes_tbl_hbr = qmp_v3_dp_serdes_tbl_hbr, -+ .serdes_tbl_hbr_num = ARRAY_SIZE(qmp_v3_dp_serdes_tbl_hbr), -+ .serdes_tbl_hbr2 = qmp_v3_dp_serdes_tbl_hbr2, -+ .serdes_tbl_hbr2_num = ARRAY_SIZE(qmp_v3_dp_serdes_tbl_hbr2), -+ .serdes_tbl_hbr3 = qmp_v3_dp_serdes_tbl_hbr3, -+ .serdes_tbl_hbr3_num = ARRAY_SIZE(qmp_v3_dp_serdes_tbl_hbr3), -+ -+ .swing_hbr_rbr = &qmp_dp_v3_voltage_swing_hbr_rbr, -+ .pre_emphasis_hbr_rbr = &qmp_dp_v3_pre_emphasis_hbr_rbr, -+ .swing_hbr3_hbr2 = &qmp_dp_v3_voltage_swing_hbr3_hbr2, -+ .pre_emphasis_hbr3_hbr2 = &qmp_dp_v3_pre_emphasis_hbr3_hbr2, -+ -+ .clk_list = qmp_v3_phy_clk_l, -+ .num_clks = ARRAY_SIZE(qmp_v3_phy_clk_l), -+ .reset_list = msm8996_usb3phy_reset_l, -+ .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l), -+ .vreg_list = qmp_phy_vreg_l, -+ .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), -+ .regs = qmp_v3_usb3phy_regs_layout, -+ -+ .dp_aux_init = qcom_qmp_v3_phy_dp_aux_init, -+ .configure_dp_tx = qcom_qmp_v3_phy_configure_dp_tx, -+ .configure_dp_phy = qcom_qmp_v3_phy_configure_dp_phy, -+ .calibrate_dp_phy = qcom_qmp_v3_dp_phy_calibrate, -+}; -+ - static const struct qmp_phy_combo_cfg sdm845_usb3dpphy_cfg = { - .usb_cfg = &sdm845_usb3phy_cfg, -- .dp_cfg = &sc7180_dpphy_cfg, -+ .dp_cfg = &sdm845_dpphy_cfg, - }; - - static const struct qmp_phy_cfg sm8150_usb3phy_cfg = { -@@ -1184,8 +1221,8 @@ static const struct qmp_phy_cfg sc8180x_dpphy_cfg = { - - .clk_list = qmp_v3_phy_clk_l, - .num_clks = ARRAY_SIZE(qmp_v3_phy_clk_l), -- .reset_list = sc7180_usb3phy_reset_l, -- .num_resets = ARRAY_SIZE(sc7180_usb3phy_reset_l), -+ .reset_list = msm8996_usb3phy_reset_l, -+ .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l), - .vreg_list = qmp_phy_vreg_l, - .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), - .regs = qmp_v3_usb3phy_regs_layout, -@@ -1328,8 +1365,8 @@ static const struct qmp_phy_cfg sm8250_dpphy_cfg = { - .swing_hbr3_hbr2 = &qmp_dp_v3_voltage_swing_hbr3_hbr2, - .pre_emphasis_hbr3_hbr2 = &qmp_dp_v3_pre_emphasis_hbr3_hbr2, - -- .clk_list = qmp_v4_phy_clk_l, -- .num_clks = ARRAY_SIZE(qmp_v4_phy_clk_l), -+ .clk_list = qmp_v4_sm8250_usbphy_clk_l, -+ .num_clks = ARRAY_SIZE(qmp_v4_sm8250_usbphy_clk_l), - .reset_list = msm8996_usb3phy_reset_l, - .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l), - .vreg_list = qmp_phy_vreg_l, -diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c -index 3ea8fc6a9ca36..fc3d47a759443 100644 ---- a/drivers/platform/x86/ideapad-laptop.c -+++ b/drivers/platform/x86/ideapad-laptop.c -@@ -30,6 +30,7 @@ - #include - #include - #include -+#include - - #include - -@@ -37,20 +38,23 @@ - - #define IDEAPAD_RFKILL_DEV_NUM 3 - --#if IS_ENABLED(CONFIG_ACPI_WMI) --static const char *const ideapad_wmi_fnesc_events[] = { -- "26CAB2E5-5CF1-46AE-AAC3-4A12B6BA50E6", /* Yoga 3 */ -- "56322276-8493-4CE8-A783-98C991274F5E", /* Yoga 700 */ -- "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294", /* Legion 5 */ --}; --#endif -- - enum { - CFG_CAP_BT_BIT = 16, - CFG_CAP_3G_BIT = 17, - CFG_CAP_WIFI_BIT = 18, - CFG_CAP_CAM_BIT = 19, -- CFG_CAP_TOUCHPAD_BIT = 30, -+ -+ /* -+ * These are OnScreenDisplay support bits that can be useful to determine -+ * whether a hotkey exists/should show OSD. But they aren't particularly -+ * meaningful since they were introduced later, i.e. 2010 IdeaPads -+ * don't have these, but they still have had OSD for hotkeys. -+ */ -+ CFG_OSD_NUMLK_BIT = 27, -+ CFG_OSD_CAPSLK_BIT = 28, -+ CFG_OSD_MICMUTE_BIT = 29, -+ CFG_OSD_TOUCHPAD_BIT = 30, -+ CFG_OSD_CAM_BIT = 31, - }; - - enum { -@@ -130,7 +134,7 @@ struct ideapad_private { - struct ideapad_dytc_priv *dytc; - struct dentry *debug; - unsigned long cfg; -- const char *fnesc_guid; -+ unsigned long r_touchpad_val; - struct { - bool conservation_mode : 1; - bool dytc : 1; -@@ -140,6 +144,7 @@ struct ideapad_private { - bool hw_rfkill_switch : 1; - bool kbd_bl : 1; - bool touchpad_ctrl_via_ec : 1; -+ bool ctrl_ps2_aux_port : 1; - bool usb_charging : 1; - } features; - struct { -@@ -171,6 +176,48 @@ MODULE_PARM_DESC(set_fn_lock_led, - "Enable driver based updates of the fn-lock LED on fn-lock changes. " - "If you need this please report this to: platform-driver-x86@vger.kernel.org"); - -+static bool ctrl_ps2_aux_port; -+module_param(ctrl_ps2_aux_port, bool, 0444); -+MODULE_PARM_DESC(ctrl_ps2_aux_port, -+ "Enable driver based PS/2 aux port en-/dis-abling on touchpad on/off toggle. " -+ "If you need this please report this to: platform-driver-x86@vger.kernel.org"); -+ -+/* -+ * shared data -+ */ -+ -+static struct ideapad_private *ideapad_shared; -+static DEFINE_MUTEX(ideapad_shared_mutex); -+ -+static int ideapad_shared_init(struct ideapad_private *priv) -+{ -+ int ret; -+ -+ mutex_lock(&ideapad_shared_mutex); -+ -+ if (!ideapad_shared) { -+ ideapad_shared = priv; -+ ret = 0; -+ } else { -+ dev_warn(&priv->adev->dev, "found multiple platform devices\n"); -+ ret = -EINVAL; -+ } -+ -+ mutex_unlock(&ideapad_shared_mutex); -+ -+ return ret; -+} -+ -+static void ideapad_shared_exit(struct ideapad_private *priv) -+{ -+ mutex_lock(&ideapad_shared_mutex); -+ -+ if (ideapad_shared == priv) -+ ideapad_shared = NULL; -+ -+ mutex_unlock(&ideapad_shared_mutex); -+} -+ - /* - * ACPI Helpers - */ -@@ -386,8 +433,19 @@ static int debugfs_cfg_show(struct seq_file *s, void *data) - seq_puts(s, " wifi"); - if (test_bit(CFG_CAP_CAM_BIT, &priv->cfg)) - seq_puts(s, " camera"); -- if (test_bit(CFG_CAP_TOUCHPAD_BIT, &priv->cfg)) -+ seq_puts(s, "\n"); -+ -+ seq_puts(s, "OSD support:"); -+ if (test_bit(CFG_OSD_NUMLK_BIT, &priv->cfg)) -+ seq_puts(s, " num-lock"); -+ if (test_bit(CFG_OSD_CAPSLK_BIT, &priv->cfg)) -+ seq_puts(s, " caps-lock"); -+ if (test_bit(CFG_OSD_MICMUTE_BIT, &priv->cfg)) -+ seq_puts(s, " mic-mute"); -+ if (test_bit(CFG_OSD_TOUCHPAD_BIT, &priv->cfg)) - seq_puts(s, " touchpad"); -+ if (test_bit(CFG_OSD_CAM_BIT, &priv->cfg)) -+ seq_puts(s, " camera"); - seq_puts(s, "\n"); - - seq_puts(s, "Graphics: "); -@@ -593,6 +651,8 @@ static ssize_t touchpad_show(struct device *dev, - if (err) - return err; - -+ priv->r_touchpad_val = result; -+ - return sysfs_emit(buf, "%d\n", !!result); - } - -@@ -612,6 +672,8 @@ static ssize_t touchpad_store(struct device *dev, - if (err) - return err; - -+ priv->r_touchpad_val = state; -+ - return count; - } - -@@ -680,8 +742,7 @@ static umode_t ideapad_is_visible(struct kobject *kobj, - else if (attr == &dev_attr_fn_lock.attr) - supported = priv->features.fn_lock; - else if (attr == &dev_attr_touchpad.attr) -- supported = priv->features.touchpad_ctrl_via_ec && -- test_bit(CFG_CAP_TOUCHPAD_BIT, &priv->cfg); -+ supported = priv->features.touchpad_ctrl_via_ec; - else if (attr == &dev_attr_usb_charging.attr) - supported = priv->features.usb_charging; - -@@ -1089,6 +1150,8 @@ static void ideapad_sysfs_exit(struct ideapad_private *priv) - /* - * input device - */ -+#define IDEAPAD_WMI_KEY 0x100 -+ - static const struct key_entry ideapad_keymap[] = { - { KE_KEY, 6, { KEY_SWITCHVIDEOMODE } }, - { KE_KEY, 7, { KEY_CAMERA } }, -@@ -1101,7 +1164,30 @@ static const struct key_entry ideapad_keymap[] = { - { KE_KEY, 65, { KEY_PROG4 } }, - { KE_KEY, 66, { KEY_TOUCHPAD_OFF } }, - { KE_KEY, 67, { KEY_TOUCHPAD_ON } }, -+ { KE_KEY, 68, { KEY_TOUCHPAD_TOGGLE } }, - { KE_KEY, 128, { KEY_ESC } }, -+ -+ /* -+ * WMI keys -+ */ -+ -+ /* FnLock (handled by the firmware) */ -+ { KE_IGNORE, 0x02 | IDEAPAD_WMI_KEY }, -+ /* Esc (handled by the firmware) */ -+ { KE_IGNORE, 0x03 | IDEAPAD_WMI_KEY }, -+ /* Customizable Lenovo Hotkey ("star" with 'S' inside) */ -+ { KE_KEY, 0x01 | IDEAPAD_WMI_KEY, { KEY_FAVORITES } }, -+ /* Dark mode toggle */ -+ { KE_KEY, 0x13 | IDEAPAD_WMI_KEY, { KEY_PROG1 } }, -+ /* Sound profile switch */ -+ { KE_KEY, 0x12 | IDEAPAD_WMI_KEY, { KEY_PROG2 } }, -+ /* Lenovo Virtual Background application */ -+ { KE_KEY, 0x28 | IDEAPAD_WMI_KEY, { KEY_PROG3 } }, -+ /* Lenovo Support */ -+ { KE_KEY, 0x27 | IDEAPAD_WMI_KEY, { KEY_HELP } }, -+ /* Refresh Rate Toggle */ -+ { KE_KEY, 0x0a | IDEAPAD_WMI_KEY, { KEY_DISPLAYTOGGLE } }, -+ - { KE_END }, - }; - -@@ -1414,26 +1500,41 @@ static void ideapad_kbd_bl_exit(struct ideapad_private *priv) - /* - * module init/exit - */ --static void ideapad_sync_touchpad_state(struct ideapad_private *priv) -+static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_events) - { - unsigned long value; -+ unsigned char param; -+ int ret; - -- if (!priv->features.touchpad_ctrl_via_ec) -+ /* Without reading from EC touchpad LED doesn't switch state */ -+ ret = read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value); -+ if (ret) - return; - -- /* Without reading from EC touchpad LED doesn't switch state */ -- if (!read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value)) { -- unsigned char param; -+ /* -+ * Some IdeaPads don't really turn off touchpad - they only -+ * switch the LED state. We (de)activate KBC AUX port to turn -+ * touchpad off and on. We send KEY_TOUCHPAD_OFF and -+ * KEY_TOUCHPAD_ON to not to get out of sync with LED -+ */ -+ if (priv->features.ctrl_ps2_aux_port) -+ i8042_command(¶m, value ? I8042_CMD_AUX_ENABLE : I8042_CMD_AUX_DISABLE); -+ -+ if (send_events) { - /* -- * Some IdeaPads don't really turn off touchpad - they only -- * switch the LED state. We (de)activate KBC AUX port to turn -- * touchpad off and on. We send KEY_TOUCHPAD_OFF and -- * KEY_TOUCHPAD_ON to not to get out of sync with LED -+ * On older models the EC controls the touchpad and toggles it -+ * on/off itself, in this case we report KEY_TOUCHPAD_ON/_OFF. -+ * If the EC did not toggle, report KEY_TOUCHPAD_TOGGLE. - */ -- i8042_command(¶m, value ? I8042_CMD_AUX_ENABLE : I8042_CMD_AUX_DISABLE); -- ideapad_input_report(priv, value ? 67 : 66); -- sysfs_notify(&priv->platform_device->dev.kobj, NULL, "touchpad"); -+ if (value != priv->r_touchpad_val) { -+ ideapad_input_report(priv, value ? 67 : 66); -+ sysfs_notify(&priv->platform_device->dev.kobj, NULL, "touchpad"); -+ } else { -+ ideapad_input_report(priv, 68); -+ } - } -+ -+ priv->r_touchpad_val = value; - } - - static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) -@@ -1474,7 +1575,7 @@ static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) - ideapad_sync_rfk_state(priv); - break; - case 5: -- ideapad_sync_touchpad_state(priv); -+ ideapad_sync_touchpad_state(priv, true); - break; - case 4: - ideapad_backlight_notify_brightness(priv); -@@ -1505,33 +1606,6 @@ static void ideapad_acpi_notify(acpi_handle handle, u32 event, void *data) - } - } - --#if IS_ENABLED(CONFIG_ACPI_WMI) --static void ideapad_wmi_notify(u32 value, void *context) --{ -- struct ideapad_private *priv = context; -- unsigned long result; -- -- switch (value) { -- case 128: -- ideapad_input_report(priv, value); -- break; -- case 208: -- if (!priv->features.set_fn_lock_led) -- break; -- -- if (!eval_hals(priv->adev->handle, &result)) { -- bool state = test_bit(HALS_FNLOCK_STATE_BIT, &result); -- -- exec_sals(priv->adev->handle, state ? SALS_FNLOCK_ON : SALS_FNLOCK_OFF); -- } -- break; -- default: -- dev_info(&priv->platform_device->dev, -- "Unknown WMI event: %u\n", value); -- } --} --#endif -- - /* On some models we need to call exec_sals(SALS_FNLOCK_ON/OFF) to set the LED */ - static const struct dmi_system_id set_fn_lock_led_list[] = { - { -@@ -1563,6 +1637,23 @@ static const struct dmi_system_id hw_rfkill_list[] = { - {} - }; - -+/* -+ * On some models the EC toggles the touchpad muted LED on touchpad toggle -+ * hotkey presses, but the EC does not actually disable the touchpad itself. -+ * On these models the driver needs to explicitly enable/disable the i8042 -+ * (PS/2) aux port. -+ */ -+static const struct dmi_system_id ctrl_ps2_aux_port_list[] = { -+ { -+ /* Lenovo Ideapad Z570 */ -+ .matches = { -+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), -+ DMI_MATCH(DMI_PRODUCT_VERSION, "Ideapad Z570"), -+ }, -+ }, -+ {} -+}; -+ - static const struct dmi_system_id no_touchpad_switch_list[] = { - { - .ident = "Lenovo Yoga 3 Pro 1370", -@@ -1590,6 +1681,8 @@ static void ideapad_check_features(struct ideapad_private *priv) - set_fn_lock_led || dmi_check_system(set_fn_lock_led_list); - priv->features.hw_rfkill_switch = - hw_rfkill_switch || dmi_check_system(hw_rfkill_list); -+ priv->features.ctrl_ps2_aux_port = -+ ctrl_ps2_aux_port || dmi_check_system(ctrl_ps2_aux_port_list); - - /* Most ideapads with ELAN0634 touchpad don't use EC touchpad switch */ - if (acpi_dev_present("ELAN0634", NULL, -1)) -@@ -1622,6 +1715,118 @@ static void ideapad_check_features(struct ideapad_private *priv) - } - } - -+#if IS_ENABLED(CONFIG_ACPI_WMI) -+/* -+ * WMI driver -+ */ -+enum ideapad_wmi_event_type { -+ IDEAPAD_WMI_EVENT_ESC, -+ IDEAPAD_WMI_EVENT_FN_KEYS, -+}; -+ -+struct ideapad_wmi_private { -+ enum ideapad_wmi_event_type event; -+}; -+ -+static int ideapad_wmi_probe(struct wmi_device *wdev, const void *context) -+{ -+ struct ideapad_wmi_private *wpriv; -+ -+ wpriv = devm_kzalloc(&wdev->dev, sizeof(*wpriv), GFP_KERNEL); -+ if (!wpriv) -+ return -ENOMEM; -+ -+ *wpriv = *(const struct ideapad_wmi_private *)context; -+ -+ dev_set_drvdata(&wdev->dev, wpriv); -+ return 0; -+} -+ -+static void ideapad_wmi_notify(struct wmi_device *wdev, union acpi_object *data) -+{ -+ struct ideapad_wmi_private *wpriv = dev_get_drvdata(&wdev->dev); -+ struct ideapad_private *priv; -+ unsigned long result; -+ -+ mutex_lock(&ideapad_shared_mutex); -+ -+ priv = ideapad_shared; -+ if (!priv) -+ goto unlock; -+ -+ switch (wpriv->event) { -+ case IDEAPAD_WMI_EVENT_ESC: -+ ideapad_input_report(priv, 128); -+ break; -+ case IDEAPAD_WMI_EVENT_FN_KEYS: -+ if (priv->features.set_fn_lock_led && -+ !eval_hals(priv->adev->handle, &result)) { -+ bool state = test_bit(HALS_FNLOCK_STATE_BIT, &result); -+ -+ exec_sals(priv->adev->handle, state ? SALS_FNLOCK_ON : SALS_FNLOCK_OFF); -+ } -+ -+ if (data->type != ACPI_TYPE_INTEGER) { -+ dev_warn(&wdev->dev, -+ "WMI event data is not an integer\n"); -+ break; -+ } -+ -+ dev_dbg(&wdev->dev, "WMI fn-key event: 0x%llx\n", -+ data->integer.value); -+ -+ ideapad_input_report(priv, -+ data->integer.value | IDEAPAD_WMI_KEY); -+ -+ break; -+ } -+unlock: -+ mutex_unlock(&ideapad_shared_mutex); -+} -+ -+static const struct ideapad_wmi_private ideapad_wmi_context_esc = { -+ .event = IDEAPAD_WMI_EVENT_ESC -+}; -+ -+static const struct ideapad_wmi_private ideapad_wmi_context_fn_keys = { -+ .event = IDEAPAD_WMI_EVENT_FN_KEYS -+}; -+ -+static const struct wmi_device_id ideapad_wmi_ids[] = { -+ { "26CAB2E5-5CF1-46AE-AAC3-4A12B6BA50E6", &ideapad_wmi_context_esc }, /* Yoga 3 */ -+ { "56322276-8493-4CE8-A783-98C991274F5E", &ideapad_wmi_context_esc }, /* Yoga 700 */ -+ { "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294", &ideapad_wmi_context_fn_keys }, /* Legion 5 */ -+ {}, -+}; -+MODULE_DEVICE_TABLE(wmi, ideapad_wmi_ids); -+ -+static struct wmi_driver ideapad_wmi_driver = { -+ .driver = { -+ .name = "ideapad_wmi", -+ }, -+ .id_table = ideapad_wmi_ids, -+ .probe = ideapad_wmi_probe, -+ .notify = ideapad_wmi_notify, -+}; -+ -+static int ideapad_wmi_driver_register(void) -+{ -+ return wmi_driver_register(&ideapad_wmi_driver); -+} -+ -+static void ideapad_wmi_driver_unregister(void) -+{ -+ return wmi_driver_unregister(&ideapad_wmi_driver); -+} -+ -+#else -+static inline int ideapad_wmi_driver_register(void) { return 0; } -+static inline void ideapad_wmi_driver_unregister(void) { } -+#endif -+ -+/* -+ * ACPI driver -+ */ - static int ideapad_acpi_add(struct platform_device *pdev) - { - struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); -@@ -1670,16 +1875,12 @@ static int ideapad_acpi_add(struct platform_device *pdev) - if (!priv->features.hw_rfkill_switch) - write_ec_cmd(priv->adev->handle, VPCCMD_W_RF, 1); - -- /* The same for Touchpad */ -- if (!priv->features.touchpad_ctrl_via_ec) -- write_ec_cmd(priv->adev->handle, VPCCMD_W_TOUCHPAD, 1); -- - for (i = 0; i < IDEAPAD_RFKILL_DEV_NUM; i++) - if (test_bit(ideapad_rfk_data[i].cfgbit, &priv->cfg)) - ideapad_register_rfkill(priv, i); - - ideapad_sync_rfk_state(priv); -- ideapad_sync_touchpad_state(priv); -+ ideapad_sync_touchpad_state(priv, false); - - err = ideapad_dytc_profile_init(priv); - if (err) { -@@ -1703,30 +1904,16 @@ static int ideapad_acpi_add(struct platform_device *pdev) - goto notification_failed; - } - --#if IS_ENABLED(CONFIG_ACPI_WMI) -- for (i = 0; i < ARRAY_SIZE(ideapad_wmi_fnesc_events); i++) { -- status = wmi_install_notify_handler(ideapad_wmi_fnesc_events[i], -- ideapad_wmi_notify, priv); -- if (ACPI_SUCCESS(status)) { -- priv->fnesc_guid = ideapad_wmi_fnesc_events[i]; -- break; -- } -- } -- -- if (ACPI_FAILURE(status) && status != AE_NOT_EXIST) { -- err = -EIO; -- goto notification_failed_wmi; -- } --#endif -+ err = ideapad_shared_init(priv); -+ if (err) -+ goto shared_init_failed; - - return 0; - --#if IS_ENABLED(CONFIG_ACPI_WMI) --notification_failed_wmi: -+shared_init_failed: - acpi_remove_notify_handler(priv->adev->handle, - ACPI_DEVICE_NOTIFY, - ideapad_acpi_notify); --#endif - - notification_failed: - ideapad_backlight_exit(priv); -@@ -1752,10 +1939,7 @@ static int ideapad_acpi_remove(struct platform_device *pdev) - struct ideapad_private *priv = dev_get_drvdata(&pdev->dev); - int i; - --#if IS_ENABLED(CONFIG_ACPI_WMI) -- if (priv->fnesc_guid) -- wmi_remove_notify_handler(priv->fnesc_guid); --#endif -+ ideapad_shared_exit(priv); - - acpi_remove_notify_handler(priv->adev->handle, - ACPI_DEVICE_NOTIFY, -@@ -1781,7 +1965,7 @@ static int ideapad_acpi_resume(struct device *dev) - struct ideapad_private *priv = dev_get_drvdata(dev); - - ideapad_sync_rfk_state(priv); -- ideapad_sync_touchpad_state(priv); -+ ideapad_sync_touchpad_state(priv, false); - - if (priv->dytc) - dytc_profile_refresh(priv); -@@ -1807,7 +1991,30 @@ static struct platform_driver ideapad_acpi_driver = { - }, - }; - --module_platform_driver(ideapad_acpi_driver); -+static int __init ideapad_laptop_init(void) -+{ -+ int err; -+ -+ err = ideapad_wmi_driver_register(); -+ if (err) -+ return err; -+ -+ err = platform_driver_register(&ideapad_acpi_driver); -+ if (err) { -+ ideapad_wmi_driver_unregister(); -+ return err; -+ } -+ -+ return 0; -+} -+module_init(ideapad_laptop_init) -+ -+static void __exit ideapad_laptop_exit(void) -+{ -+ ideapad_wmi_driver_unregister(); -+ platform_driver_unregister(&ideapad_acpi_driver); -+} -+module_exit(ideapad_laptop_exit) - - MODULE_AUTHOR("David Woodhouse "); - MODULE_DESCRIPTION("IdeaPad ACPI Extras"); -diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c -index 8f9c571d72578..00ac7e381441a 100644 ---- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c -+++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c -@@ -203,6 +203,7 @@ static const struct x86_cpu_id intel_uncore_cpu_ids[] = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), -+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, NULL), - {} - }; - MODULE_DEVICE_TABLE(x86cpu, intel_uncore_cpu_ids); -diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c -index 8476dfef4e626..a1d91736a03b8 100644 ---- a/drivers/platform/x86/thinkpad_acpi.c -+++ b/drivers/platform/x86/thinkpad_acpi.c -@@ -5572,6 +5572,7 @@ static enum led_brightness light_sysfs_get(struct led_classdev *led_cdev) - static struct tpacpi_led_classdev tpacpi_led_thinklight = { - .led_classdev = { - .name = "tpacpi::thinklight", -+ .max_brightness = 1, - .brightness_set_blocking = &light_sysfs_set, - .brightness_get = &light_sysfs_get, - } -diff --git a/drivers/platform/x86/x86-android-tablets.c b/drivers/platform/x86/x86-android-tablets.c -index 4acd6fa8d43b8..123a4618db55f 100644 ---- a/drivers/platform/x86/x86-android-tablets.c -+++ b/drivers/platform/x86/x86-android-tablets.c -@@ -5,7 +5,7 @@ - * devices typically have a bunch of things hardcoded, rather than specified - * in their DSDT. - * -- * Copyright (C) 2021 Hans de Goede -+ * Copyright (C) 2021-2022 Hans de Goede - */ - - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -@@ -265,6 +265,56 @@ static struct gpiod_lookup_table int3496_gpo2_pin22_gpios = { - }, - }; - -+/* -+ * Advantech MICA-071 -+ * This is a standard Windows tablet, but it has an extra "quick launch" button -+ * which is not described in the ACPI tables in anyway. -+ * Use the x86-android-tablets infra to create a gpio-button device for this. -+ */ -+static struct gpio_keys_button advantech_mica_071_button = { -+ .code = KEY_PROG1, -+ /* .gpio gets filled in by advantech_mica_071_init() */ -+ .active_low = true, -+ .desc = "prog1_key", -+ .type = EV_KEY, -+ .wakeup = false, -+ .debounce_interval = 50, -+}; -+ -+static const struct gpio_keys_platform_data advantech_mica_071_button_pdata __initconst = { -+ .buttons = &advantech_mica_071_button, -+ .nbuttons = 1, -+ .name = "prog1_key", -+}; -+ -+static const struct platform_device_info advantech_mica_071_pdevs[] __initconst = { -+ { -+ .name = "gpio-keys", -+ .id = PLATFORM_DEVID_AUTO, -+ .data = &advantech_mica_071_button_pdata, -+ .size_data = sizeof(advantech_mica_071_button_pdata), -+ }, -+}; -+ -+static int __init advantech_mica_071_init(void) -+{ -+ struct gpio_desc *gpiod; -+ int ret; -+ -+ ret = x86_android_tablet_get_gpiod("INT33FC:00", 2, &gpiod); -+ if (ret < 0) -+ return ret; -+ advantech_mica_071_button.gpio = desc_to_gpio(gpiod); -+ -+ return 0; -+} -+ -+static const struct x86_dev_info advantech_mica_071_info __initconst = { -+ .pdev_info = advantech_mica_071_pdevs, -+ .pdev_count = ARRAY_SIZE(advantech_mica_071_pdevs), -+ .init = advantech_mica_071_init, -+}; -+ - /* Asus ME176C and TF103C tablets shared data */ - static struct gpio_keys_button asus_me176c_tf103c_lid = { - .code = SW_LID, -@@ -987,6 +1037,212 @@ static void lenovo_yoga_tab2_830_1050_exit(void) - } - } - -+/* Lenovo Yoga Tab 3 Pro YT3-X90F */ -+ -+/* -+ * There are 2 batteries, with 2 bq27500 fuel-gauges and 2 bq25892 chargers, -+ * "bq25890-charger-1" is instantiated from: drivers/i2c/busses/i2c-cht-wc.c. -+ */ -+static const char * const lenovo_yt3_bq25892_0_suppliers[] = { "cht_wcove_pwrsrc" }; -+static const char * const bq25890_1_psy[] = { "bq25890-charger-1" }; -+ -+static const struct property_entry fg_bq25890_1_supply_props[] = { -+ PROPERTY_ENTRY_STRING_ARRAY("supplied-from", bq25890_1_psy), -+ { } -+}; -+ -+static const struct software_node fg_bq25890_1_supply_node = { -+ .properties = fg_bq25890_1_supply_props, -+}; -+ -+/* bq25892 charger settings for the flat lipo battery behind the screen */ -+static const struct property_entry lenovo_yt3_bq25892_0_props[] = { -+ PROPERTY_ENTRY_STRING_ARRAY("supplied-from", lenovo_yt3_bq25892_0_suppliers), -+ PROPERTY_ENTRY_STRING("linux,power-supply-name", "bq25892-second-chrg"), -+ PROPERTY_ENTRY_U32("linux,iinlim-percentage", 40), -+ PROPERTY_ENTRY_BOOL("linux,skip-reset"), -+ /* Values taken from Android Factory Image */ -+ PROPERTY_ENTRY_U32("ti,charge-current", 2048000), -+ PROPERTY_ENTRY_U32("ti,battery-regulation-voltage", 4352000), -+ PROPERTY_ENTRY_U32("ti,termination-current", 128000), -+ PROPERTY_ENTRY_U32("ti,precharge-current", 128000), -+ PROPERTY_ENTRY_U32("ti,minimum-sys-voltage", 3700000), -+ PROPERTY_ENTRY_U32("ti,boost-voltage", 4998000), -+ PROPERTY_ENTRY_U32("ti,boost-max-current", 500000), -+ PROPERTY_ENTRY_BOOL("ti,use-ilim-pin"), -+ { } -+}; -+ -+static const struct software_node lenovo_yt3_bq25892_0_node = { -+ .properties = lenovo_yt3_bq25892_0_props, -+}; -+ -+static const struct x86_i2c_client_info lenovo_yt3_i2c_clients[] __initconst = { -+ { -+ /* bq27500 fuel-gauge for the flat lipo battery behind the screen */ -+ .board_info = { -+ .type = "bq27500", -+ .addr = 0x55, -+ .dev_name = "bq27500_0", -+ .swnode = &fg_bq25890_supply_node, -+ }, -+ .adapter_path = "\\_SB_.PCI0.I2C1", -+ }, { -+ /* bq25892 charger for the flat lipo battery behind the screen */ -+ .board_info = { -+ .type = "bq25892", -+ .addr = 0x6b, -+ .dev_name = "bq25892_0", -+ .swnode = &lenovo_yt3_bq25892_0_node, -+ }, -+ .adapter_path = "\\_SB_.PCI0.I2C1", -+ .irq_data = { -+ .type = X86_ACPI_IRQ_TYPE_GPIOINT, -+ .chip = "INT33FF:01", -+ .index = 5, -+ .trigger = ACPI_EDGE_SENSITIVE, -+ .polarity = ACPI_ACTIVE_LOW, -+ }, -+ }, { -+ /* bq27500 fuel-gauge for the round li-ion cells in the hinge */ -+ .board_info = { -+ .type = "bq27500", -+ .addr = 0x55, -+ .dev_name = "bq27500_1", -+ .swnode = &fg_bq25890_1_supply_node, -+ }, -+ .adapter_path = "\\_SB_.PCI0.I2C2", -+ } -+}; -+ -+static int __init lenovo_yt3_init(void) -+{ -+ struct gpio_desc *gpiod; -+ int ret; -+ -+ /* -+ * The "bq25892_0" charger IC has its /CE (Charge-Enable) and OTG pins -+ * connected to GPIOs, rather then having them hardwired to the correct -+ * values as is normally done. -+ * -+ * The bq25890_charger driver controls these through I2C, but this only -+ * works if not overridden by the pins. Set these pins here: -+ * 1. Set /CE to 0 to allow charging. -+ * 2. Set OTG to 0 disable V5 boost output since the 5V boost output of -+ * the main "bq25892_1" charger is used when necessary. -+ */ -+ -+ /* /CE pin */ -+ ret = x86_android_tablet_get_gpiod("INT33FF:02", 22, &gpiod); -+ if (ret < 0) -+ return ret; -+ -+ /* -+ * The gpio_desc returned by x86_android_tablet_get_gpiod() is a "raw" -+ * gpio_desc, that is there is no way to pass lookup-flags like -+ * GPIO_ACTIVE_LOW. Set the GPIO to 0 here to enable charging since -+ * the /CE pin is active-low, but not marked as such in the gpio_desc. -+ */ -+ gpiod_set_value(gpiod, 0); -+ -+ /* OTG pin */ -+ ret = x86_android_tablet_get_gpiod("INT33FF:03", 19, &gpiod); -+ if (ret < 0) -+ return ret; -+ -+ gpiod_set_value(gpiod, 0); -+ -+ return 0; -+} -+ -+static const struct x86_dev_info lenovo_yt3_info __initconst = { -+ .i2c_client_info = lenovo_yt3_i2c_clients, -+ .i2c_client_count = ARRAY_SIZE(lenovo_yt3_i2c_clients), -+ .init = lenovo_yt3_init, -+}; -+ -+/* Medion Lifetab S10346 tablets have an Android factory img with everything hardcoded */ -+static const char * const medion_lifetab_s10346_accel_mount_matrix[] = { -+ "0", "1", "0", -+ "1", "0", "0", -+ "0", "0", "1" -+}; -+ -+static const struct property_entry medion_lifetab_s10346_accel_props[] = { -+ PROPERTY_ENTRY_STRING_ARRAY("mount-matrix", medion_lifetab_s10346_accel_mount_matrix), -+ { } -+}; -+ -+static const struct software_node medion_lifetab_s10346_accel_node = { -+ .properties = medion_lifetab_s10346_accel_props, -+}; -+ -+/* Note the LCD panel is mounted upside down, this is correctly indicated in the VBT */ -+static const struct property_entry medion_lifetab_s10346_touchscreen_props[] = { -+ PROPERTY_ENTRY_BOOL("touchscreen-inverted-x"), -+ PROPERTY_ENTRY_BOOL("touchscreen-swapped-x-y"), -+ { } -+}; -+ -+static const struct software_node medion_lifetab_s10346_touchscreen_node = { -+ .properties = medion_lifetab_s10346_touchscreen_props, -+}; -+ -+static const struct x86_i2c_client_info medion_lifetab_s10346_i2c_clients[] __initconst = { -+ { -+ /* kxtj21009 accel */ -+ .board_info = { -+ .type = "kxtj21009", -+ .addr = 0x0f, -+ .dev_name = "kxtj21009", -+ .swnode = &medion_lifetab_s10346_accel_node, -+ }, -+ .adapter_path = "\\_SB_.I2C3", -+ .irq_data = { -+ .type = X86_ACPI_IRQ_TYPE_GPIOINT, -+ .chip = "INT33FC:02", -+ .index = 23, -+ .trigger = ACPI_EDGE_SENSITIVE, -+ .polarity = ACPI_ACTIVE_HIGH, -+ }, -+ }, { -+ /* goodix touchscreen */ -+ .board_info = { -+ .type = "GDIX1001:00", -+ .addr = 0x14, -+ .dev_name = "goodix_ts", -+ .swnode = &medion_lifetab_s10346_touchscreen_node, -+ }, -+ .adapter_path = "\\_SB_.I2C4", -+ .irq_data = { -+ .type = X86_ACPI_IRQ_TYPE_APIC, -+ .index = 0x44, -+ .trigger = ACPI_EDGE_SENSITIVE, -+ .polarity = ACPI_ACTIVE_LOW, -+ }, -+ }, -+}; -+ -+static struct gpiod_lookup_table medion_lifetab_s10346_goodix_gpios = { -+ .dev_id = "i2c-goodix_ts", -+ .table = { -+ GPIO_LOOKUP("INT33FC:01", 26, "reset", GPIO_ACTIVE_HIGH), -+ GPIO_LOOKUP("INT33FC:02", 3, "irq", GPIO_ACTIVE_HIGH), -+ { } -+ }, -+}; -+ -+static struct gpiod_lookup_table * const medion_lifetab_s10346_gpios[] = { -+ &medion_lifetab_s10346_goodix_gpios, -+ NULL -+}; -+ -+static const struct x86_dev_info medion_lifetab_s10346_info __initconst = { -+ .i2c_client_info = medion_lifetab_s10346_i2c_clients, -+ .i2c_client_count = ARRAY_SIZE(medion_lifetab_s10346_i2c_clients), -+ .gpiod_lookup_tables = medion_lifetab_s10346_gpios, -+}; -+ - /* Nextbook Ares 8 tablets have an Android factory img with everything hardcoded */ - static const char * const nextbook_ares8_accel_mount_matrix[] = { - "0", "-1", "0", -@@ -1179,6 +1435,14 @@ static const struct x86_dev_info xiaomi_mipad2_info __initconst = { - }; - - static const struct dmi_system_id x86_android_tablet_ids[] __initconst = { -+ { -+ /* Advantech MICA-071 */ -+ .matches = { -+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Advantech"), -+ DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "MICA-071"), -+ }, -+ .driver_data = (void *)&advantech_mica_071_info, -+ }, - { - /* Asus MeMO Pad 7 ME176C */ - .matches = { -@@ -1245,6 +1509,25 @@ static const struct dmi_system_id x86_android_tablet_ids[] __initconst = { - }, - .driver_data = (void *)&lenovo_yoga_tab2_830_1050_info, - }, -+ { -+ /* Lenovo Yoga Tab 3 Pro YT3-X90F */ -+ .matches = { -+ DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), -+ DMI_MATCH(DMI_PRODUCT_NAME, "CHERRYVIEW D1 PLATFORM"), -+ DMI_MATCH(DMI_PRODUCT_VERSION, "Blade3-10A-001"), -+ }, -+ .driver_data = (void *)&lenovo_yt3_info, -+ }, -+ { -+ /* Medion Lifetab S10346 */ -+ .matches = { -+ DMI_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"), -+ DMI_MATCH(DMI_BOARD_NAME, "Aptio CRB"), -+ /* Above strings are much too generic, also match on BIOS date */ -+ DMI_MATCH(DMI_BIOS_DATE, "10/22/2015"), -+ }, -+ .driver_data = (void *)&medion_lifetab_s10346_info, -+ }, - { - /* Nextbook Ares 8 */ - .matches = { -diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c -index 899aa8dd12f07..95da1cbefacf0 100644 ---- a/drivers/remoteproc/imx_dsp_rproc.c -+++ b/drivers/remoteproc/imx_dsp_rproc.c -@@ -347,9 +347,6 @@ static int imx_dsp_rproc_stop(struct rproc *rproc) - struct device *dev = rproc->dev.parent; - int ret = 0; - -- /* Make sure work is finished */ -- flush_work(&priv->rproc_work); -- - if (rproc->state == RPROC_CRASHED) { - priv->flags &= ~REMOTE_IS_READY; - return 0; -@@ -432,9 +429,18 @@ static void imx_dsp_rproc_vq_work(struct work_struct *work) - { - struct imx_dsp_rproc *priv = container_of(work, struct imx_dsp_rproc, - rproc_work); -+ struct rproc *rproc = priv->rproc; -+ -+ mutex_lock(&rproc->lock); -+ -+ if (rproc->state != RPROC_RUNNING) -+ goto unlock_mutex; - - rproc_vq_interrupt(priv->rproc, 0); - rproc_vq_interrupt(priv->rproc, 1); -+ -+unlock_mutex: -+ mutex_unlock(&rproc->lock); - } - - /** -diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c -index 7cc4fd207e2d8..596e1440cca56 100644 ---- a/drivers/remoteproc/imx_rproc.c -+++ b/drivers/remoteproc/imx_rproc.c -@@ -113,8 +113,8 @@ static const struct imx_rproc_att imx_rproc_att_imx93[] = { - { 0x80000000, 0x80000000, 0x10000000, 0 }, - { 0x90000000, 0x80000000, 0x10000000, 0 }, - -- { 0xC0000000, 0xa0000000, 0x10000000, 0 }, -- { 0xD0000000, 0xa0000000, 0x10000000, 0 }, -+ { 0xC0000000, 0xC0000000, 0x10000000, 0 }, -+ { 0xD0000000, 0xC0000000, 0x10000000, 0 }, - }; - - static const struct imx_rproc_att imx_rproc_att_imx8mn[] = { -diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c -index cb1d414a23896..c3f194d9384da 100644 ---- a/drivers/remoteproc/remoteproc_core.c -+++ b/drivers/remoteproc/remoteproc_core.c -@@ -1868,12 +1868,18 @@ static void rproc_crash_handler_work(struct work_struct *work) - - mutex_lock(&rproc->lock); - -- if (rproc->state == RPROC_CRASHED || rproc->state == RPROC_OFFLINE) { -+ if (rproc->state == RPROC_CRASHED) { - /* handle only the first crash detected */ - mutex_unlock(&rproc->lock); - return; - } - -+ if (rproc->state == RPROC_OFFLINE) { -+ /* Don't recover if the remote processor was stopped */ -+ mutex_unlock(&rproc->lock); -+ goto out; -+ } -+ - rproc->state = RPROC_CRASHED; - dev_err(dev, "handling crash #%u in %s\n", ++rproc->crash_cnt, - rproc->name); -@@ -1883,6 +1889,7 @@ static void rproc_crash_handler_work(struct work_struct *work) - if (!rproc->recovery_disabled) - rproc_trigger_recovery(rproc); - -+out: - pm_relax(rproc->dev.parent); - } - -diff --git a/drivers/rtc/rtc-ds1347.c b/drivers/rtc/rtc-ds1347.c -index 157bf5209ac40..a40c1a52df659 100644 ---- a/drivers/rtc/rtc-ds1347.c -+++ b/drivers/rtc/rtc-ds1347.c -@@ -112,7 +112,7 @@ static int ds1347_set_time(struct device *dev, struct rtc_time *dt) - return err; - - century = (dt->tm_year / 100) + 19; -- err = regmap_write(map, DS1347_CENTURY_REG, century); -+ err = regmap_write(map, DS1347_CENTURY_REG, bin2bcd(century)); - if (err) - return err; - -diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig -index 024e420f1bb77..ae504c43d9e74 100644 ---- a/drivers/soc/qcom/Kconfig -+++ b/drivers/soc/qcom/Kconfig -@@ -63,6 +63,7 @@ config QCOM_GSBI - config QCOM_LLCC - tristate "Qualcomm Technologies, Inc. LLCC driver" - depends on ARCH_QCOM || COMPILE_TEST -+ select REGMAP_MMIO - help - Qualcomm Technologies, Inc. platform specific - Last Level Cache Controller(LLCC) driver for platforms such as, -@@ -236,6 +237,7 @@ config QCOM_ICC_BWMON - tristate "QCOM Interconnect Bandwidth Monitor driver" - depends on ARCH_QCOM || COMPILE_TEST - select PM_OPP -+ select REGMAP_MMIO - help - Sets up driver monitoring bandwidth on various interconnects and - based on that voting for interconnect bandwidth, adjusting their -diff --git a/drivers/soc/ux500/ux500-soc-id.c b/drivers/soc/ux500/ux500-soc-id.c -index a9472e0e5d61c..27d6e25a01153 100644 ---- a/drivers/soc/ux500/ux500-soc-id.c -+++ b/drivers/soc/ux500/ux500-soc-id.c -@@ -167,20 +167,18 @@ ATTRIBUTE_GROUPS(ux500_soc); - static const char *db8500_read_soc_id(struct device_node *backupram) - { - void __iomem *base; -- void __iomem *uid; - const char *retstr; -+ u32 uid[5]; - - base = of_iomap(backupram, 0); - if (!base) - return NULL; -- uid = base + 0x1fc0; -+ memcpy_fromio(uid, base + 0x1fc0, sizeof(uid)); - - /* Throw these device-specific numbers into the entropy pool */ -- add_device_randomness(uid, 0x14); -+ add_device_randomness(uid, sizeof(uid)); - retstr = kasprintf(GFP_KERNEL, "%08x%08x%08x%08x%08x", -- readl((u32 *)uid+0), -- readl((u32 *)uid+1), readl((u32 *)uid+2), -- readl((u32 *)uid+3), readl((u32 *)uid+4)); -+ uid[0], uid[1], uid[2], uid[3], uid[4]); - iounmap(base); - return retstr; - } -diff --git a/drivers/staging/media/ipu3/ipu3-v4l2.c b/drivers/staging/media/ipu3/ipu3-v4l2.c -index ce13e746c15f3..e530767e80a5d 100644 ---- a/drivers/staging/media/ipu3/ipu3-v4l2.c -+++ b/drivers/staging/media/ipu3/ipu3-v4l2.c -@@ -188,6 +188,28 @@ static int imgu_subdev_set_fmt(struct v4l2_subdev *sd, - return 0; - } - -+static struct v4l2_rect * -+imgu_subdev_get_crop(struct imgu_v4l2_subdev *sd, -+ struct v4l2_subdev_state *sd_state, unsigned int pad, -+ enum v4l2_subdev_format_whence which) -+{ -+ if (which == V4L2_SUBDEV_FORMAT_TRY) -+ return v4l2_subdev_get_try_crop(&sd->subdev, sd_state, pad); -+ else -+ return &sd->rect.eff; -+} -+ -+static struct v4l2_rect * -+imgu_subdev_get_compose(struct imgu_v4l2_subdev *sd, -+ struct v4l2_subdev_state *sd_state, unsigned int pad, -+ enum v4l2_subdev_format_whence which) -+{ -+ if (which == V4L2_SUBDEV_FORMAT_TRY) -+ return v4l2_subdev_get_try_compose(&sd->subdev, sd_state, pad); -+ else -+ return &sd->rect.bds; -+} -+ - static int imgu_subdev_get_selection(struct v4l2_subdev *sd, - struct v4l2_subdev_state *sd_state, - struct v4l2_subdev_selection *sel) -@@ -200,18 +222,12 @@ static int imgu_subdev_get_selection(struct v4l2_subdev *sd, - - switch (sel->target) { - case V4L2_SEL_TGT_CROP: -- if (sel->which == V4L2_SUBDEV_FORMAT_TRY) -- sel->r = *v4l2_subdev_get_try_crop(sd, sd_state, -- sel->pad); -- else -- sel->r = imgu_sd->rect.eff; -+ sel->r = *imgu_subdev_get_crop(imgu_sd, sd_state, sel->pad, -+ sel->which); - return 0; - case V4L2_SEL_TGT_COMPOSE: -- if (sel->which == V4L2_SUBDEV_FORMAT_TRY) -- sel->r = *v4l2_subdev_get_try_compose(sd, sd_state, -- sel->pad); -- else -- sel->r = imgu_sd->rect.bds; -+ sel->r = *imgu_subdev_get_compose(imgu_sd, sd_state, sel->pad, -+ sel->which); - return 0; - default: - return -EINVAL; -@@ -223,10 +239,9 @@ static int imgu_subdev_set_selection(struct v4l2_subdev *sd, - struct v4l2_subdev_selection *sel) - { - struct imgu_device *imgu = v4l2_get_subdevdata(sd); -- struct imgu_v4l2_subdev *imgu_sd = container_of(sd, -- struct imgu_v4l2_subdev, -- subdev); -- struct v4l2_rect *rect, *try_sel; -+ struct imgu_v4l2_subdev *imgu_sd = -+ container_of(sd, struct imgu_v4l2_subdev, subdev); -+ struct v4l2_rect *rect; - - dev_dbg(&imgu->pci_dev->dev, - "set subdev %u sel which %u target 0x%4x rect [%ux%u]", -@@ -238,22 +253,18 @@ static int imgu_subdev_set_selection(struct v4l2_subdev *sd, - - switch (sel->target) { - case V4L2_SEL_TGT_CROP: -- try_sel = v4l2_subdev_get_try_crop(sd, sd_state, sel->pad); -- rect = &imgu_sd->rect.eff; -+ rect = imgu_subdev_get_crop(imgu_sd, sd_state, sel->pad, -+ sel->which); - break; - case V4L2_SEL_TGT_COMPOSE: -- try_sel = v4l2_subdev_get_try_compose(sd, sd_state, sel->pad); -- rect = &imgu_sd->rect.bds; -+ rect = imgu_subdev_get_compose(imgu_sd, sd_state, sel->pad, -+ sel->which); - break; - default: - return -EINVAL; - } - -- if (sel->which == V4L2_SUBDEV_FORMAT_TRY) -- *try_sel = sel->r; -- else -- *rect = sel->r; -- -+ *rect = sel->r; - return 0; - } - -diff --git a/drivers/staging/media/tegra-video/csi.c b/drivers/staging/media/tegra-video/csi.c -index b26e44adb2be7..426e653bd55d5 100644 ---- a/drivers/staging/media/tegra-video/csi.c -+++ b/drivers/staging/media/tegra-video/csi.c -@@ -433,7 +433,7 @@ static int tegra_csi_channel_alloc(struct tegra_csi *csi, - for (i = 0; i < chan->numgangports; i++) - chan->csi_port_nums[i] = port_num + i * CSI_PORTS_PER_BRICK; - -- chan->of_node = node; -+ chan->of_node = of_node_get(node); - chan->numpads = num_pads; - if (num_pads & 0x2) { - chan->pads[0].flags = MEDIA_PAD_FL_SINK; -@@ -448,6 +448,7 @@ static int tegra_csi_channel_alloc(struct tegra_csi *csi, - chan->mipi = tegra_mipi_request(csi->dev, node); - if (IS_ERR(chan->mipi)) { - ret = PTR_ERR(chan->mipi); -+ chan->mipi = NULL; - dev_err(csi->dev, "failed to get mipi device: %d\n", ret); - } - -@@ -640,6 +641,7 @@ static void tegra_csi_channels_cleanup(struct tegra_csi *csi) - media_entity_cleanup(&subdev->entity); - } - -+ of_node_put(chan->of_node); - list_del(&chan->list); - kfree(chan); - } -diff --git a/drivers/staging/media/tegra-video/csi.h b/drivers/staging/media/tegra-video/csi.h -index 4ee05a1785cfa..6960ea2e3d360 100644 ---- a/drivers/staging/media/tegra-video/csi.h -+++ b/drivers/staging/media/tegra-video/csi.h -@@ -56,7 +56,7 @@ struct tegra_csi; - * @framerate: active framerate for TPG - * @h_blank: horizontal blanking for TPG active format - * @v_blank: vertical blanking for TPG active format -- * @mipi: mipi device for corresponding csi channel pads -+ * @mipi: mipi device for corresponding csi channel pads, or NULL if not applicable (TPG, error) - * @pixel_rate: active pixel rate from the sensor on this channel - */ - struct tegra_csi_channel { -diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c -index 18374a6d05bdf..18cf801ab5908 100644 ---- a/fs/btrfs/backref.c -+++ b/fs/btrfs/backref.c -@@ -433,6 +433,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - u64 wanted_disk_byte = ref->wanted_disk_byte; - u64 count = 0; - u64 data_offset; -+ u8 type; - - if (level != 0) { - eb = path->nodes[level]; -@@ -487,6 +488,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - continue; - } - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); -+ type = btrfs_file_extent_type(eb, fi); -+ if (type == BTRFS_FILE_EXTENT_INLINE) -+ goto next; - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - data_offset = btrfs_file_extent_offset(eb, fi); - -diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c -index 3676580c2d97e..7b93719a486c5 100644 ---- a/fs/btrfs/extent-io-tree.c -+++ b/fs/btrfs/extent-io-tree.c -@@ -397,7 +397,7 @@ static int insert_state(struct extent_io_tree *tree, - u32 bits, struct extent_changeset *changeset) - { - struct rb_node **node; -- struct rb_node *parent; -+ struct rb_node *parent = NULL; - const u64 end = state->end; - - set_state_bits(tree, state, bits, changeset); -diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c -index 635f45f1a2ef8..dba087ad40ea2 100644 ---- a/fs/btrfs/volumes.c -+++ b/fs/btrfs/volumes.c -@@ -7241,8 +7241,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, - map->stripes[i].dev = handle_missing_device(fs_info, - devid, uuid); - if (IS_ERR(map->stripes[i].dev)) { -+ ret = PTR_ERR(map->stripes[i].dev); - free_extent_map(em); -- return PTR_ERR(map->stripes[i].dev); -+ return ret; - } - } - -diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c -index 712a431614480..6094cb2ff099b 100644 ---- a/fs/cifs/cifsfs.c -+++ b/fs/cifs/cifsfs.c -@@ -678,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root) - seq_printf(s, ",echo_interval=%lu", - tcon->ses->server->echo_interval / HZ); - -- /* Only display max_credits if it was overridden on mount */ -+ /* Only display the following if overridden on mount */ - if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE) - seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits); -+ if (tcon->ses->server->tcp_nodelay) -+ seq_puts(s, ",tcpnodelay"); -+ if (tcon->ses->server->noautotune) -+ seq_puts(s, ",noautotune"); -+ if (tcon->ses->server->noblocksnd) -+ seq_puts(s, ",noblocksend"); - - if (tcon->snapshot_time) - seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); -diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c -index 9db9527c61cfc..7e7f712f97fd8 100644 ---- a/fs/cifs/connect.c -+++ b/fs/cifs/connect.c -@@ -279,8 +279,10 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, - tcon->need_reconnect = true; - tcon->status = TID_NEED_RECON; - } -- if (ses->tcon_ipc) -+ if (ses->tcon_ipc) { - ses->tcon_ipc->need_reconnect = true; -+ ses->tcon_ipc->status = TID_NEED_RECON; -+ } - - next_session: - spin_unlock(&ses->chan_lock); -@@ -1871,6 +1873,9 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) - - cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid); - -+ spin_lock(&tcon->tc_lock); -+ tcon->status = TID_GOOD; -+ spin_unlock(&tcon->tc_lock); - ses->tcon_ipc = tcon; - out: - return rc; -@@ -2157,7 +2162,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx __attribute__((unused)), - struct cifs_ses * - cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) - { -- int rc = -ENOMEM; -+ int rc = 0; - unsigned int xid; - struct cifs_ses *ses; - struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; -@@ -2206,6 +2211,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) - return ses; - } - -+ rc = -ENOMEM; -+ - cifs_dbg(FYI, "Existing smb sess not found\n"); - ses = sesInfoAlloc(); - if (ses == NULL) -@@ -2278,10 +2285,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) - list_add(&ses->smb_ses_list, &server->smb_ses_list); - spin_unlock(&cifs_tcp_ses_lock); - -- free_xid(xid); -- - cifs_setup_ipc(ses, ctx); - -+ free_xid(xid); -+ - return ses; - - get_ses_fail: -@@ -2600,6 +2607,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) - tcon->nodelete = ctx->nodelete; - tcon->local_lease = ctx->local_lease; - INIT_LIST_HEAD(&tcon->pending_opens); -+ tcon->status = TID_GOOD; - - /* schedule query interfaces poll */ - INIT_DELAYED_WORK(&tcon->query_interfaces, -diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c -index 59f64c596233b..871d4e9f49fb6 100644 ---- a/fs/dlm/lowcomms.c -+++ b/fs/dlm/lowcomms.c -@@ -1543,7 +1543,11 @@ static void process_recv_sockets(struct work_struct *work) - - static void process_listen_recv_socket(struct work_struct *work) - { -- accept_from_sock(&listen_con); -+ int ret; -+ -+ do { -+ ret = accept_from_sock(&listen_con); -+ } while (!ret); - } - - static void dlm_connect(struct connection *con) -@@ -1820,7 +1824,7 @@ static int dlm_listen_for_all(void) - result = sock->ops->listen(sock, 5); - if (result < 0) { - dlm_close_sock(&listen_con.sock); -- goto out; -+ return result; - } - - return 0; -@@ -2023,7 +2027,6 @@ fail_listen: - dlm_proto_ops = NULL; - fail_proto_ops: - dlm_allow_conn = 0; -- dlm_close_sock(&listen_con.sock); - work_stop(); - fail_local: - deinit_local(); -diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c -index 8f597753ac129..5202eddfc3c0a 100644 ---- a/fs/ext2/dir.c -+++ b/fs/ext2/dir.c -@@ -679,7 +679,7 @@ int ext2_empty_dir (struct inode * inode) - page = ext2_get_page(inode, i, 0, &page_addr); - - if (IS_ERR(page)) -- goto not_empty; -+ return 0; - - kaddr = page_addr; - de = (ext2_dirent *)kaddr; -diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h -index 8d5453852f98e..4e739902dc03a 100644 ---- a/fs/ext4/ext4.h -+++ b/fs/ext4/ext4.h -@@ -558,7 +558,7 @@ enum { - * - * It's not paranoia if the Murphy's Law really *is* out to get you. :-) - */ --#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) - #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) - - static inline void ext4_check_flag_values(void) -@@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, - typedef enum { - EXT4_IGET_NORMAL = 0, - EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ -- EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ -+ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ -+ EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ - } ext4_iget_flags; - - extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, -@@ -3619,8 +3620,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, - unsigned int blocksize); - extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, - struct buffer_head *bh); --extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, -- struct inode *inode); -+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, -+ struct inode *inode, struct dentry *dentry); - extern int __ext4_link(struct inode *dir, struct inode *inode, - struct dentry *dentry); - -diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c -index 6c399a8b22b35..36225ef56b0cd 100644 ---- a/fs/ext4/extents.c -+++ b/fs/ext4/extents.c -@@ -5799,6 +5799,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) - struct ext4_extent *extent; - ext4_lblk_t first_lblk, first_lclu, last_lclu; - -+ /* -+ * if data can be stored inline, the logical cluster isn't -+ * mapped - no physical clusters have been allocated, and the -+ * file has no extents -+ */ -+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) -+ return 0; -+ - /* search for the extent closest to the first block in the cluster */ - path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); - if (IS_ERR(path)) { -diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c -index cd0a861853e3f..7ada374ff27d7 100644 ---- a/fs/ext4/extents_status.c -+++ b/fs/ext4/extents_status.c -@@ -1371,7 +1371,7 @@ retry: - if (count_reserved) - count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, - &orig_es, &rc); -- goto out; -+ goto out_get_reserved; - } - - if (len1 > 0) { -@@ -1413,6 +1413,7 @@ retry: - } - } - -+out_get_reserved: - if (count_reserved) - *reserved = get_rsvd(inode, end, es, &rc); - out: -diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c -index 0f6d0a80467d7..7ed71c652f67f 100644 ---- a/fs/ext4/fast_commit.c -+++ b/fs/ext4/fast_commit.c -@@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) - struct __track_dentry_update_args *dentry_update = - (struct __track_dentry_update_args *)arg; - struct dentry *dentry = dentry_update->dentry; -- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); -+ struct inode *dir = dentry->d_parent->d_inode; -+ struct super_block *sb = inode->i_sb; -+ struct ext4_sb_info *sbi = EXT4_SB(sb); - - mutex_unlock(&ei->i_fc_lock); -+ -+ if (IS_ENCRYPTED(dir)) { -+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, -+ NULL); -+ mutex_lock(&ei->i_fc_lock); -+ return -EOPNOTSUPP; -+ } -+ - node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); - if (!node) { -- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); -+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } - - node->fcd_op = dentry_update->op; -- node->fcd_parent = dentry->d_parent->d_inode->i_ino; -+ node->fcd_parent = dir->i_ino; - node->fcd_ino = inode->i_ino; - if (dentry->d_name.len > DNAME_INLINE_LEN) { - node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); - if (!node->fcd_name.name) { - kmem_cache_free(ext4_fc_dentry_cachep, node); -- ext4_fc_mark_ineligible(inode->i_sb, -- EXT4_FC_REASON_NOMEM, NULL); -+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); - mutex_lock(&ei->i_fc_lock); - return -ENOMEM; - } -@@ -666,6 +675,15 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) - - /* Ext4 commit path routines */ - -+/* memcpy to fc reserved space and update CRC */ -+static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, -+ int len, u32 *crc) -+{ -+ if (crc) -+ *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); -+ return memcpy(dst, src, len); -+} -+ - /* memzero and update CRC */ - static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - u32 *crc) -@@ -691,62 +709,59 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - */ - static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) - { -- struct ext4_fc_tl *tl; -+ struct ext4_fc_tl tl; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct buffer_head *bh; - int bsize = sbi->s_journal->j_blocksize; - int ret, off = sbi->s_fc_bytes % bsize; -- int pad_len; -+ int remaining; -+ u8 *dst; - - /* -- * After allocating len, we should have space at least for a 0 byte -- * padding. -+ * If 'len' is too long to fit in any block alongside a PAD tlv, then we -+ * cannot fulfill the request. - */ -- if (len + EXT4_FC_TAG_BASE_LEN > bsize) -+ if (len > bsize - EXT4_FC_TAG_BASE_LEN) - return NULL; - -- if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) { -- /* -- * Only allocate from current buffer if we have enough space for -- * this request AND we have space to add a zero byte padding. -- */ -- if (!sbi->s_fc_bh) { -- ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); -- if (ret) -- return NULL; -- sbi->s_fc_bh = bh; -- } -+ if (!sbi->s_fc_bh) { -+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); -+ if (ret) -+ return NULL; -+ sbi->s_fc_bh = bh; -+ } -+ dst = sbi->s_fc_bh->b_data + off; -+ -+ /* -+ * Allocate the bytes in the current block if we can do so while still -+ * leaving enough space for a PAD tlv. -+ */ -+ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; -+ if (len <= remaining) { - sbi->s_fc_bytes += len; -- return sbi->s_fc_bh->b_data + off; -+ return dst; - } -- /* Need to add PAD tag */ -- tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); -- tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); -- pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; -- tl->fc_len = cpu_to_le16(pad_len); -- if (crc) -- *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); -- if (pad_len > 0) -- ext4_fc_memzero(sb, tl + 1, pad_len, crc); -+ -+ /* -+ * Else, terminate the current block with a PAD tlv, then allocate a new -+ * block and allocate the bytes at the start of that new block. -+ */ -+ -+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); -+ tl.fc_len = cpu_to_le16(remaining); -+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); -+ ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc); -+ - ext4_fc_submit_bh(sb, false); - - ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); - if (ret) - return NULL; - sbi->s_fc_bh = bh; -- sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; -+ sbi->s_fc_bytes += bsize - off + len; - return sbi->s_fc_bh->b_data; - } - --/* memcpy to fc reserved space and update CRC */ --static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, -- int len, u32 *crc) --{ -- if (crc) -- *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); -- return memcpy(dst, src, len); --} -- - /* - * Complete a fast commit by writing tail tag. - * -@@ -774,7 +789,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) - off = sbi->s_fc_bytes % bsize; - - tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); -- tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); -+ tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); - sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); -@@ -784,6 +799,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) - dst += sizeof(tail.fc_tid); - tail.fc_crc = cpu_to_le32(crc); - ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); -+ dst += sizeof(tail.fc_crc); -+ memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ - - ext4_fc_submit_bh(sb, true); - -@@ -1388,7 +1405,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, - return 0; - } - -- ret = __ext4_unlink(NULL, old_parent, &entry, inode); -+ ret = __ext4_unlink(old_parent, &entry, inode, NULL); - /* -ENOENT ok coz it might not exist anymore. */ - if (ret == -ENOENT) - ret = 0; -@@ -1977,32 +1994,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb) - kfree(sbi->s_fc_replay_state.fc_modified_inodes); - } - --static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl, -- u8 *val, u8 *end) -+static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, -+ int tag, int len) - { -- if (val + tl->fc_len > end) -- return false; -- -- /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do -- * journal rescan before do CRC check. Other tags length check will -- * rely on CRC check. -- */ -- switch (tl->fc_tag) { -+ switch (tag) { - case EXT4_FC_TAG_ADD_RANGE: -- return (sizeof(struct ext4_fc_add_range) == tl->fc_len); -- case EXT4_FC_TAG_TAIL: -- return (sizeof(struct ext4_fc_tail) <= tl->fc_len); -- case EXT4_FC_TAG_HEAD: -- return (sizeof(struct ext4_fc_head) == tl->fc_len); -+ return len == sizeof(struct ext4_fc_add_range); - case EXT4_FC_TAG_DEL_RANGE: -+ return len == sizeof(struct ext4_fc_del_range); -+ case EXT4_FC_TAG_CREAT: - case EXT4_FC_TAG_LINK: - case EXT4_FC_TAG_UNLINK: -- case EXT4_FC_TAG_CREAT: -+ len -= sizeof(struct ext4_fc_dentry_info); -+ return len >= 1 && len <= EXT4_NAME_LEN; - case EXT4_FC_TAG_INODE: -+ len -= sizeof(struct ext4_fc_inode); -+ return len >= EXT4_GOOD_OLD_INODE_SIZE && -+ len <= sbi->s_inode_size; - case EXT4_FC_TAG_PAD: -- default: -- return true; -+ return true; /* padding can have any length */ -+ case EXT4_FC_TAG_TAIL: -+ return len >= sizeof(struct ext4_fc_tail); -+ case EXT4_FC_TAG_HEAD: -+ return len == sizeof(struct ext4_fc_head); - } -+ return false; - } - - /* -@@ -2040,7 +2056,7 @@ static int ext4_fc_replay_scan(journal_t *journal, - state = &sbi->s_fc_replay_state; - - start = (u8 *)bh->b_data; -- end = (__u8 *)bh->b_data + journal->j_blocksize - 1; -+ end = start + journal->j_blocksize; - - if (state->fc_replay_expected_off == 0) { - state->fc_cur_tag = 0; -@@ -2061,11 +2077,12 @@ static int ext4_fc_replay_scan(journal_t *journal, - } - - state->fc_replay_expected_off++; -- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; -+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; - cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { - ext4_fc_get_tl(&tl, cur); - val = cur + EXT4_FC_TAG_BASE_LEN; -- if (!ext4_fc_tag_len_isvalid(&tl, val, end)) { -+ if (tl.fc_len > end - val || -+ !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { - ret = state->fc_replay_num_tags ? - JBD2_FC_REPLAY_STOP : -ECANCELED; - goto out_err; -@@ -2178,9 +2195,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, - #endif - - start = (u8 *)bh->b_data; -- end = (__u8 *)bh->b_data + journal->j_blocksize - 1; -+ end = start + journal->j_blocksize; - -- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; -+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; - cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { - ext4_fc_get_tl(&tl, cur); - val = cur + EXT4_FC_TAG_BASE_LEN; -@@ -2249,17 +2266,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) - journal->j_fc_cleanup_callback = ext4_fc_cleanup; - } - --static const char *fc_ineligible_reasons[] = { -- "Extended attributes changed", -- "Cross rename", -- "Journal flag changed", -- "Insufficient memory", -- "Swap boot", -- "Resize", -- "Dir renamed", -- "Falloc range op", -- "Data journalling", -- "FC Commit Failed" -+static const char * const fc_ineligible_reasons[] = { -+ [EXT4_FC_REASON_XATTR] = "Extended attributes changed", -+ [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", -+ [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", -+ [EXT4_FC_REASON_NOMEM] = "Insufficient memory", -+ [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", -+ [EXT4_FC_REASON_RESIZE] = "Resize", -+ [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", -+ [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", -+ [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", -+ [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", - }; - - int ext4_fc_info_show(struct seq_file *seq, void *v) -diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h -index a6154c3ed1357..2fadb2c4780c8 100644 ---- a/fs/ext4/fast_commit.h -+++ b/fs/ext4/fast_commit.h -@@ -58,7 +58,7 @@ struct ext4_fc_dentry_info { - __u8 fc_dname[]; - }; - --/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ -+/* Value structure for EXT4_FC_TAG_INODE. */ - struct ext4_fc_inode { - __le32 fc_ino; - __u8 fc_raw_inode[]; -@@ -96,6 +96,7 @@ enum { - EXT4_FC_REASON_RENAME_DIR, - EXT4_FC_REASON_FALLOC_RANGE, - EXT4_FC_REASON_INODE_JOURNAL_DATA, -+ EXT4_FC_REASON_ENCRYPTED_FILENAME, - EXT4_FC_REASON_MAX - }; - -diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c -index 860fc51190098..c68bebe7ff4b6 100644 ---- a/fs/ext4/indirect.c -+++ b/fs/ext4/indirect.c -@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; -+ unsigned int key; - int ret = -EIO; - - *err = 0; -@@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, - if (!p->key) - goto no_block; - while (--depth) { -- bh = sb_getblk(sb, le32_to_cpu(p->key)); -+ key = le32_to_cpu(p->key); -+ if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { -+ /* the block was out of range */ -+ ret = -EFSCORRUPTED; -+ goto failure; -+ } -+ bh = sb_getblk(sb, key); - if (unlikely(!bh)) { - ret = -ENOMEM; - goto failure; -diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c -index 2b5ef1b642499..283afda26d9cb 100644 ---- a/fs/ext4/inode.c -+++ b/fs/ext4/inode.c -@@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode) - - /* - * For inodes with journalled data, transaction commit could have -- * dirtied the inode. Flush worker is ignoring it because of I_FREEING -- * flag but we still need to remove the inode from the writeback lists. -+ * dirtied the inode. And for inodes with dioread_nolock, unwritten -+ * extents converting worker could merge extents and also have dirtied -+ * the inode. Flush worker is ignoring it because of I_FREEING flag but -+ * we still need to remove the inode from the writeback lists. - */ -- if (!list_empty_careful(&inode->i_io_list)) { -- WARN_ON_ONCE(!ext4_should_journal_data(inode)); -+ if (!list_empty_careful(&inode->i_io_list)) - inode_io_list_del(inode); -- } - - /* - * Protect us against freezing - iput() caller didn't have to have any -@@ -335,6 +335,12 @@ stop_handle: - ext4_xattr_inode_array_free(ea_inode_array); - return; - no_delete: -+ /* -+ * Check out some where else accidentally dirty the evicting inode, -+ * which may probably cause inode use-after-free issues later. -+ */ -+ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); -+ - if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); - ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ -@@ -1309,7 +1315,8 @@ static int ext4_write_end(struct file *file, - - trace_ext4_write_end(inode, pos, len, copied); - -- if (ext4_has_inline_data(inode)) -+ if (ext4_has_inline_data(inode) && -+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); -@@ -4225,7 +4232,8 @@ int ext4_truncate(struct inode *inode) - - /* If we zero-out tail of the page, we have to create jinode for jbd2 */ - if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { -- if (ext4_inode_attach_jinode(inode) < 0) -+ err = ext4_inode_attach_jinode(inode); -+ if (err) - goto out_trace; - } - -@@ -4473,9 +4481,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, - inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - inode_offset = ((ino - 1) % - EXT4_INODES_PER_GROUP(sb)); -- block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); - iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); - -+ block = ext4_inode_table(sb, gdp); -+ if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || -+ (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { -+ ext4_error(sb, "Invalid inode table block %llu in " -+ "block_group %u", block, iloc->block_group); -+ return -EFSCORRUPTED; -+ } -+ block += (inode_offset / inodes_per_block); -+ - bh = sb_getblk(sb, block); - if (unlikely(!bh)) - return -ENOMEM; -@@ -5044,8 +5060,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) - ext4_error_inode(inode, function, line, 0, - "casefold flag without casefold feature"); -- brelse(iloc.bh); -+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { -+ ext4_error_inode(inode, function, line, 0, -+ "bad inode without EXT4_IGET_BAD flag"); -+ ret = -EUCLEAN; -+ goto bad_inode; -+ } - -+ brelse(iloc.bh); - unlock_new_inode(inode); - return inode; - -@@ -5853,6 +5875,14 @@ static int __ext4_expand_extra_isize(struct inode *inode, - return 0; - } - -+ /* -+ * We may need to allocate external xattr block so we need quotas -+ * initialized. Here we can be called with various locks held so we -+ * cannot affort to initialize quotas ourselves. So just bail. -+ */ -+ if (dquot_initialize_needed(inode)) -+ return -EAGAIN; -+ - /* try to expand with EAs present */ - error = ext4_expand_extra_isize_ea(inode, new_extra_isize, - raw_inode, handle); -diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c -index 95dfea28bf4e9..8067ccda34e45 100644 ---- a/fs/ext4/ioctl.c -+++ b/fs/ext4/ioctl.c -@@ -374,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb, - blkcnt_t blocks; - unsigned short bytes; - -- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); -+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, -+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD); - if (IS_ERR(inode_bl)) - return PTR_ERR(inode_bl); - ei_bl = EXT4_I(inode_bl); -@@ -424,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb, - /* Protect extent tree against block allocations via delalloc */ - ext4_double_down_write_data_sem(inode, inode_bl); - -- if (inode_bl->i_nlink == 0) { -+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { - /* this inode has never been used as a BOOT_LOADER */ - set_nlink(inode_bl, 1); - i_uid_write(inode_bl, 0); -@@ -731,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) - if (ext4_is_quota_file(inode)) - return err; - -+ err = dquot_initialize(inode); -+ if (err) -+ return err; -+ - err = ext4_get_inode_loc(inode, &iloc); - if (err) - return err; -@@ -746,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) - brelse(iloc.bh); - } - -- err = dquot_initialize(inode); -- if (err) -- return err; -- - handle = ext4_journal_start(inode, EXT4_HT_QUOTA, - EXT4_QUOTA_INIT_BLOCKS(sb) + - EXT4_QUOTA_DEL_BLOCKS(sb) + 3); -@@ -1153,19 +1154,22 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, - - if (fsuuid.fsu_len == 0) { - fsuuid.fsu_len = UUID_SIZE; -- if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len))) -+ if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, -+ sizeof(fsuuid.fsu_len))) - return -EFAULT; -- return -EINVAL; -+ return 0; - } - -- if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) -+ if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) - return -EINVAL; - - lock_buffer(sbi->s_sbh); - memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); - unlock_buffer(sbi->s_sbh); - -- if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) -+ fsuuid.fsu_len = UUID_SIZE; -+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || -+ copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) - return -EFAULT; - return 0; - } -diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c -index c08c0aba18836..1c5518a4bdf91 100644 ---- a/fs/ext4/namei.c -+++ b/fs/ext4/namei.c -@@ -3204,14 +3204,20 @@ end_rmdir: - return retval; - } - --int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, -- struct inode *inode) -+int __ext4_unlink(struct inode *dir, const struct qstr *d_name, -+ struct inode *inode, -+ struct dentry *dentry /* NULL during fast_commit recovery */) - { - int retval = -ENOENT; - struct buffer_head *bh; - struct ext4_dir_entry_2 *de; -+ handle_t *handle; - int skip_remove_dentry = 0; - -+ /* -+ * Keep this outside the transaction; it may have to set up the -+ * directory's encryption key, which isn't GFP_NOFS-safe. -+ */ - bh = ext4_find_entry(dir, d_name, &de, NULL); - if (IS_ERR(bh)) - return PTR_ERR(bh); -@@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name - if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - skip_remove_dentry = 1; - else -- goto out; -+ goto out_bh; -+ } -+ -+ handle = ext4_journal_start(dir, EXT4_HT_DIR, -+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) { -+ retval = PTR_ERR(handle); -+ goto out_bh; - } - - if (IS_DIRSYNC(dir)) -@@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name - if (!skip_remove_dentry) { - retval = ext4_delete_entry(handle, dir, de, bh); - if (retval) -- goto out; -+ goto out_handle; - dir->i_ctime = dir->i_mtime = current_time(dir); - ext4_update_dx_flag(dir); - retval = ext4_mark_inode_dirty(handle, dir); - if (retval) -- goto out; -+ goto out_handle; - } else { - retval = 0; - } -@@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name - ext4_orphan_add(handle, inode); - inode->i_ctime = current_time(inode); - retval = ext4_mark_inode_dirty(handle, inode); -- --out: -+ if (dentry && !retval) -+ ext4_fc_track_unlink(handle, dentry); -+out_handle: -+ ext4_journal_stop(handle); -+out_bh: - brelse(bh); - return retval; - } - - static int ext4_unlink(struct inode *dir, struct dentry *dentry) - { -- handle_t *handle; - int retval; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) -@@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) - if (retval) - goto out_trace; - -- handle = ext4_journal_start(dir, EXT4_HT_DIR, -- EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); -- if (IS_ERR(handle)) { -- retval = PTR_ERR(handle); -- goto out_trace; -- } -- -- retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); -- if (!retval) -- ext4_fc_track_unlink(handle, dentry); -+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); - #if IS_ENABLED(CONFIG_UNICODE) - /* VFS negative dentries are incompatible with Encoding and - * Case-insensitiveness. Eventually we'll want avoid -@@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) - if (IS_CASEFOLDED(dir)) - d_invalidate(dentry); - #endif -- if (handle) -- ext4_journal_stop(handle); - - out_trace: - trace_ext4_unlink_exit(dentry, retval); -@@ -3792,6 +3796,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, - return -EXDEV; - - retval = dquot_initialize(old.dir); -+ if (retval) -+ return retval; -+ retval = dquot_initialize(old.inode); - if (retval) - return retval; - retval = dquot_initialize(new.dir); -diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c -index 69a9cf9137a61..e5b47dda33175 100644 ---- a/fs/ext4/orphan.c -+++ b/fs/ext4/orphan.c -@@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) - /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { - ext4_msg(sb, KERN_INFO, "Errors on filesystem, " -- "clearing orphan list.\n"); -+ "clearing orphan list."); - es->s_last_orphan = 0; - } - ext4_debug("Skipping orphan recovery on fs with errors.\n"); -diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c -index 46b87ffeb3045..b493233750ab2 100644 ---- a/fs/ext4/resize.c -+++ b/fs/ext4/resize.c -@@ -1110,6 +1110,16 @@ exit_free: - return err; - } - -+static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, -+ ext4_group_t group) -+{ -+ struct ext4_super_block *es = (struct ext4_super_block *) data; -+ -+ es->s_block_group_nr = cpu_to_le16(group); -+ if (ext4_has_metadata_csum(sb)) -+ es->s_checksum = ext4_superblock_csum(sb, es); -+} -+ - /* - * Update the backup copies of the ext4 metadata. These don't need to be part - * of the main resize transaction, because e2fsck will re-write them if there -@@ -1158,7 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, - while (group < sbi->s_groups_count) { - struct buffer_head *bh; - ext4_fsblk_t backup_block; -- struct ext4_super_block *es; -+ int has_super = ext4_bg_has_super(sb, group); -+ ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); - - /* Out of journal space, and can't get more - abort - so sad */ - err = ext4_resize_ensure_credits_batch(handle, 1); -@@ -1168,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, - if (meta_bg == 0) - backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; - else -- backup_block = (ext4_group_first_block_no(sb, group) + -- ext4_bg_has_super(sb, group)); -+ backup_block = first_block + has_super; - - bh = sb_getblk(sb, backup_block); - if (unlikely(!bh)) { -@@ -1187,10 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, - memcpy(bh->b_data, data, size); - if (rest) - memset(bh->b_data + size, 0, rest); -- es = (struct ext4_super_block *) bh->b_data; -- es->s_block_group_nr = cpu_to_le16(group); -- if (ext4_has_metadata_csum(sb)) -- es->s_checksum = ext4_superblock_csum(sb, es); -+ if (has_super && (backup_block == first_block)) -+ ext4_set_block_group_nr(sb, bh->b_data, group); - set_buffer_uptodate(bh); - unlock_buffer(bh); - err = ext4_handle_dirty_metadata(handle, NULL, bh); -@@ -1476,8 +1484,6 @@ static void ext4_update_super(struct super_block *sb, - * active. */ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - reserved_blocks); -- ext4_superblock_csum_set(sb); -- unlock_buffer(sbi->s_sbh); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, -@@ -1513,6 +1519,8 @@ static void ext4_update_super(struct super_block *sb, - ext4_calculate_overhead(sb); - es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); - -+ ext4_superblock_csum_set(sb); -+ unlock_buffer(sbi->s_sbh); - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: added group %u:" - "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, -@@ -1596,8 +1604,8 @@ exit_journal: - int meta_bg = ext4_has_feature_meta_bg(sb); - sector_t old_gdb = 0; - -- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, -- sizeof(struct ext4_super_block), 0); -+ update_backups(sb, ext4_group_first_block_no(sb, 0), -+ (char *)es, sizeof(struct ext4_super_block), 0); - for (; gdb_num <= gdb_num_end; gdb_num++) { - struct buffer_head *gdb_bh; - -@@ -1808,7 +1816,7 @@ errout: - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu " - "blocks\n", ext4_blocks_count(es)); -- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, -+ update_backups(sb, ext4_group_first_block_no(sb, 0), - (char *)es, sizeof(struct ext4_super_block), 0); - } - return err; -diff --git a/fs/ext4/super.c b/fs/ext4/super.c -index 7cdd2138c8972..aa4f65663fad8 100644 ---- a/fs/ext4/super.c -+++ b/fs/ext4/super.c -@@ -1323,6 +1323,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) - return NULL; - - inode_set_iversion(&ei->vfs_inode, 1); -+ ei->i_flags = 0; - spin_lock_init(&ei->i_raw_lock); - INIT_LIST_HEAD(&ei->i_prealloc_list); - atomic_set(&ei->i_prealloc_active, 0); -@@ -2247,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) - return -EINVAL; - } - -- error = fs_lookup_param(fc, param, 1, &path); -+ error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); - if (error) { - ext4_msg(NULL, KERN_ERR, "error: could not find " - "journal device path"); -@@ -5287,14 +5288,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) - goto failed_mount3a; - } else { - /* Nojournal mode, all journal mount options are illegal */ -- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { -+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " -- "journal_checksum, fs mounted w/o journal"); -+ "journal_async_commit, fs mounted w/o journal"); - goto failed_mount3a; - } -- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { -+ -+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " -- "journal_async_commit, fs mounted w/o journal"); -+ "journal_checksum, fs mounted w/o journal"); - goto failed_mount3a; - } - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { -@@ -5723,7 +5725,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); -- if (!S_ISREG(journal_inode->i_mode)) { -+ if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { - ext4_msg(sb, KERN_ERR, "invalid journal inode"); - iput(journal_inode); - return NULL; -@@ -6886,6 +6888,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - return err; - } - -+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) -+{ -+ switch (type) { -+ case USRQUOTA: -+ return qf_inum == EXT4_USR_QUOTA_INO; -+ case GRPQUOTA: -+ return qf_inum == EXT4_GRP_QUOTA_INO; -+ case PRJQUOTA: -+ return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; -+ default: -+ BUG(); -+ } -+} -+ - static int ext4_quota_enable(struct super_block *sb, int type, int format_id, - unsigned int flags) - { -@@ -6902,9 +6918,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, - if (!qf_inums[type]) - return -EPERM; - -+ if (!ext4_check_quota_inum(type, qf_inums[type])) { -+ ext4_error(sb, "Bad quota inum: %lu, type: %d", -+ qf_inums[type], type); -+ return -EUCLEAN; -+ } -+ - qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); - if (IS_ERR(qf_inode)) { -- ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); -+ ext4_error(sb, "Bad quota inode: %lu, type: %d", -+ qf_inums[type], type); - return PTR_ERR(qf_inode); - } - -@@ -6943,8 +6966,9 @@ int ext4_enable_quotas(struct super_block *sb) - if (err) { - ext4_warning(sb, - "Failed to enable quota tracking " -- "(type=%d, err=%d). Please run " -- "e2fsck to fix.", type, err); -+ "(type=%d, err=%d, ino=%lu). " -+ "Please run e2fsck to fix.", type, -+ err, qf_inums[type]); - for (type--; type >= 0; type--) { - struct inode *inode; - -diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c -index 3c640bd7ecaeb..30e3b65798b50 100644 ---- a/fs/ext4/verity.c -+++ b/fs/ext4/verity.c -@@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, - size_t n = min_t(size_t, count, - PAGE_SIZE - offset_in_page(pos)); - struct page *page; -- void *fsdata; -+ void *fsdata = NULL; - int res; - - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); -diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c -index 36d6ba7190b6d..866772a2e068f 100644 ---- a/fs/ext4/xattr.c -+++ b/fs/ext4/xattr.c -@@ -1281,7 +1281,7 @@ retry_ref: - ce = mb_cache_entry_get(ea_block_cache, hash, - bh->b_blocknr); - if (ce) { -- ce->e_reusable = 1; -+ set_bit(MBE_REUSABLE_B, &ce->e_flags); - mb_cache_entry_put(ea_block_cache, ce); - } - } -@@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, - if (!err) - err = ext4_inode_attach_jinode(ea_inode); - if (err) { -+ if (ext4_xattr_inode_dec_ref(handle, ea_inode)) -+ ext4_warning_inode(ea_inode, -+ "cleanup dec ref error %d", err); - iput(ea_inode); - return ERR_PTR(err); - } -@@ -2042,7 +2045,7 @@ inserted: - } - BHDR(new_bh)->h_refcount = cpu_to_le32(ref); - if (ref == EXT4_XATTR_REFCOUNT_MAX) -- ce->e_reusable = 0; -+ clear_bit(MBE_REUSABLE_B, &ce->e_flags); - ea_bdebug(new_bh, "reusing; refcount now=%d", - ref); - ext4_xattr_block_csum_set(inode, new_bh); -@@ -2070,19 +2073,11 @@ inserted: - - goal = ext4_group_first_block_no(sb, - EXT4_I(inode)->i_block_group); -- -- /* non-extent files can't have physical blocks past 2^32 */ -- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; -- - block = ext4_new_meta_blocks(handle, inode, goal, 0, - NULL, &error); - if (error) - goto cleanup; - -- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -- BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); -- - ea_idebug(inode, "creating block %llu", - (unsigned long long)block); - -@@ -2555,7 +2550,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, - - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); - bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); -- buffer = kmalloc(value_size, GFP_NOFS); -+ buffer = kvmalloc(value_size, GFP_NOFS); - b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!is || !bs || !buffer || !b_entry_name) { - error = -ENOMEM; -@@ -2607,7 +2602,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, - error = 0; - out: - kfree(b_entry_name); -- kfree(buffer); -+ kvfree(buffer); - if (is) - brelse(is->iloc.bh); - if (bs) -diff --git a/fs/fs_parser.c b/fs/fs_parser.c -index ed40ce5742fda..edb3712dcfa58 100644 ---- a/fs/fs_parser.c -+++ b/fs/fs_parser.c -@@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse); - * @fc: The filesystem context to log errors through. - * @param: The parameter. - * @want_bdev: T if want a blockdev -+ * @flags: Pathwalk flags passed to filename_lookup() - * @_path: The result of the lookup - */ - int fs_lookup_param(struct fs_context *fc, - struct fs_parameter *param, - bool want_bdev, -+ unsigned int flags, - struct path *_path) - { - struct filename *f; -- unsigned int flags = 0; - bool put_f; - int ret; - -diff --git a/fs/mbcache.c b/fs/mbcache.c -index e272ad738faff..2a4b8b549e934 100644 ---- a/fs/mbcache.c -+++ b/fs/mbcache.c -@@ -100,8 +100,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - atomic_set(&entry->e_refcnt, 2); - entry->e_key = key; - entry->e_value = value; -- entry->e_reusable = reusable; -- entry->e_referenced = 0; -+ entry->e_flags = 0; -+ if (reusable) -+ set_bit(MBE_REUSABLE_B, &entry->e_flags); - head = mb_cache_entry_head(cache, key); - hlist_bl_lock(head); - hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { -@@ -165,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, - while (node) { - entry = hlist_bl_entry(node, struct mb_cache_entry, - e_hash_list); -- if (entry->e_key == key && entry->e_reusable && -+ if (entry->e_key == key && -+ test_bit(MBE_REUSABLE_B, &entry->e_flags) && - atomic_inc_not_zero(&entry->e_refcnt)) - goto out; - node = node->next; -@@ -284,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get); - void mb_cache_entry_touch(struct mb_cache *cache, - struct mb_cache_entry *entry) - { -- entry->e_referenced = 1; -+ set_bit(MBE_REFERENCED_B, &entry->e_flags); - } - EXPORT_SYMBOL(mb_cache_entry_touch); - -@@ -309,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, - entry = list_first_entry(&cache->c_list, - struct mb_cache_entry, e_list); - /* Drop initial hash reference if there is no user */ -- if (entry->e_referenced || -+ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || - atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { -- entry->e_referenced = 0; -+ clear_bit(MBE_REFERENCED_B, &entry->e_flags); - list_move_tail(&entry->e_list, &cache->c_list); - continue; - } -diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c -index 0427b44bfee54..f27faf5db5544 100644 ---- a/fs/quota/dquot.c -+++ b/fs/quota/dquot.c -@@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) - struct super_block *sb = inode->i_sb; - struct quota_info *dqopt = sb_dqopt(sb); - -+ if (is_bad_inode(inode)) -+ return -EUCLEAN; - if (!S_ISREG(inode->i_mode)) - return -EACCES; - if (IS_RDONLY(inode)) -diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h -index 9e1e6965f4074..0eb8f035b3d9f 100644 ---- a/include/linux/bpf_verifier.h -+++ b/include/linux/bpf_verifier.h -@@ -642,7 +642,7 @@ static inline u32 type_flag(u32 type) - } - - /* only use after check_attach_btf_id() */ --static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) -+static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) - { - return prog->type == BPF_PROG_TYPE_EXT ? - prog->aux->dst_prog->type : prog->type; -diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h -index 34aab4dd336c8..4dc7cda4fd469 100644 ---- a/include/linux/devfreq.h -+++ b/include/linux/devfreq.h -@@ -152,8 +152,8 @@ struct devfreq_stats { - * @max_state: count of entry present in the frequency table. - * @previous_freq: previously configured frequency value. - * @last_status: devfreq user device info, performance statistics -- * @data: Private data of the governor. The devfreq framework does not -- * touch this. -+ * @data: devfreq driver pass to governors, governor should not change it. -+ * @governor_data: private data for governors, devfreq core doesn't touch it. - * @user_min_freq_req: PM QoS minimum frequency request from user (via sysfs) - * @user_max_freq_req: PM QoS maximum frequency request from user (via sysfs) - * @scaling_min_freq: Limit minimum frequency requested by OPP interface -@@ -193,7 +193,8 @@ struct devfreq { - unsigned long previous_freq; - struct devfreq_dev_status last_status; - -- void *data; /* private data for governors */ -+ void *data; -+ void *governor_data; - - struct dev_pm_qos_request user_min_freq_req; - struct dev_pm_qos_request user_max_freq_req; -diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h -index f103c91139d4a..01542c4b87a2b 100644 ---- a/include/linux/fs_parser.h -+++ b/include/linux/fs_parser.h -@@ -76,6 +76,7 @@ static inline int fs_parse(struct fs_context *fc, - extern int fs_lookup_param(struct fs_context *fc, - struct fs_parameter *param, - bool want_bdev, -+ unsigned int flags, - struct path *_path); - - extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); -diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h -index 2da63fd7b98f4..97e64184767de 100644 ---- a/include/linux/mbcache.h -+++ b/include/linux/mbcache.h -@@ -10,6 +10,12 @@ - - struct mb_cache; - -+/* Cache entry flags */ -+enum { -+ MBE_REFERENCED_B = 0, -+ MBE_REUSABLE_B -+}; -+ - struct mb_cache_entry { - /* List of entries in cache - protected by cache->c_list_lock */ - struct list_head e_list; -@@ -26,8 +32,7 @@ struct mb_cache_entry { - atomic_t e_refcnt; - /* Key in hash - stable during lifetime of the entry */ - u32 e_key; -- u32 e_referenced:1; -- u32 e_reusable:1; -+ unsigned long e_flags; - /* User provided value - stable during lifetime of the entry */ - u64 e_value; - }; -diff --git a/include/linux/prandom.h b/include/linux/prandom.h -index e0a0759dd09c0..1f4a0de7b019e 100644 ---- a/include/linux/prandom.h -+++ b/include/linux/prandom.h -@@ -23,24 +23,10 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); - #define prandom_init_once(pcpu_state) \ - DO_ONCE(prandom_seed_full_state, (pcpu_state)) - --/** -- * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) -- * @ep_ro: right open interval endpoint -- * -- * Returns a pseudo-random number that is in interval [0, ep_ro). This is -- * useful when requesting a random index of an array containing ep_ro elements, -- * for example. The result is somewhat biased when ep_ro is not a power of 2, -- * so do not use this for cryptographic purposes. -- * -- * Returns: pseudo-random number in interval [0, ep_ro) -- */ -+/* Deprecated: use get_random_u32_below() instead. */ - static inline u32 prandom_u32_max(u32 ep_ro) - { -- if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8) -- return (get_random_u8() * ep_ro) >> 8; -- if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16) -- return (get_random_u16() * ep_ro) >> 16; -- return ((u64)get_random_u32() * ep_ro) >> 32; -+ return get_random_u32_below(ep_ro); - } - - /* -diff --git a/include/linux/random.h b/include/linux/random.h -index 147a5e0d0b8ed..bd954ecbef901 100644 ---- a/include/linux/random.h -+++ b/include/linux/random.h -@@ -51,6 +51,71 @@ static inline unsigned long get_random_long(void) - #endif - } - -+u32 __get_random_u32_below(u32 ceil); -+ -+/* -+ * Returns a random integer in the interval [0, ceil), with uniform -+ * distribution, suitable for all uses. Fastest when ceil is a constant, but -+ * still fast for variable ceil as well. -+ */ -+static inline u32 get_random_u32_below(u32 ceil) -+{ -+ if (!__builtin_constant_p(ceil)) -+ return __get_random_u32_below(ceil); -+ -+ /* -+ * For the fast path, below, all operations on ceil are precomputed by -+ * the compiler, so this incurs no overhead for checking pow2, doing -+ * divisions, or branching based on integer size. The resultant -+ * algorithm does traditional reciprocal multiplication (typically -+ * optimized by the compiler into shifts and adds), rejecting samples -+ * whose lower half would indicate a range indivisible by ceil. -+ */ -+ BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0"); -+ if (ceil <= 1) -+ return 0; -+ for (;;) { -+ if (ceil <= 1U << 8) { -+ u32 mult = ceil * get_random_u8(); -+ if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) -+ return mult >> 8; -+ } else if (ceil <= 1U << 16) { -+ u32 mult = ceil * get_random_u16(); -+ if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) -+ return mult >> 16; -+ } else { -+ u64 mult = (u64)ceil * get_random_u32(); -+ if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) -+ return mult >> 32; -+ } -+ } -+} -+ -+/* -+ * Returns a random integer in the interval (floor, U32_MAX], with uniform -+ * distribution, suitable for all uses. Fastest when floor is a constant, but -+ * still fast for variable floor as well. -+ */ -+static inline u32 get_random_u32_above(u32 floor) -+{ -+ BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && floor == U32_MAX, -+ "get_random_u32_above() must take floor < U32_MAX"); -+ return floor + 1 + get_random_u32_below(U32_MAX - floor); -+} -+ -+/* -+ * Returns a random integer in the interval [floor, ceil], with uniform -+ * distribution, suitable for all uses. Fastest when floor and ceil are -+ * constant, but still fast for variable floor and ceil as well. -+ */ -+static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil) -+{ -+ BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && __builtin_constant_p(ceil) && -+ (floor > ceil || ceil - floor == U32_MAX), -+ "get_random_u32_inclusive() must take floor <= ceil"); -+ return floor + get_random_u32_below(ceil - floor + 1); -+} -+ - /* - * On 64-bit architectures, protect against non-terminated C string overflows - * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. -diff --git a/include/net/mptcp.h b/include/net/mptcp.h -index 412479ebf5ad3..3c5c68618fcc5 100644 ---- a/include/net/mptcp.h -+++ b/include/net/mptcp.h -@@ -97,8 +97,6 @@ struct mptcp_out_options { - }; - - #ifdef CONFIG_MPTCP --extern struct request_sock_ops mptcp_subflow_request_sock_ops; -- - void mptcp_init(void); - - static inline bool sk_is_mptcp(const struct sock *sk) -@@ -188,6 +186,9 @@ void mptcp_seq_show(struct seq_file *seq); - int mptcp_subflow_init_cookie_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb); -+struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, -+ struct sock *sk_listener, -+ bool attach_listener); - - __be32 mptcp_get_reset_option(const struct sk_buff *skb); - -@@ -274,6 +275,13 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, - return 0; /* TCP fallback */ - } - -+static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, -+ struct sock *sk_listener, -+ bool attach_listener) -+{ -+ return NULL; -+} -+ - static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } - #endif /* CONFIG_MPTCP */ - -diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h -index 229e8fae66a34..ced95fec3367d 100644 ---- a/include/trace/events/ext4.h -+++ b/include/trace/events/ext4.h -@@ -104,6 +104,7 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE); - TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); - TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); - TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); -+TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); - TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); - - #define show_fc_reason(reason) \ -@@ -116,7 +117,8 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); - { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ - { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ - { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ -- { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}) -+ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ -+ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) - - TRACE_EVENT(ext4_other_inode_update_time, - TP_PROTO(struct inode *inode, ino_t orig_ino), -@@ -2764,7 +2766,7 @@ TRACE_EVENT(ext4_fc_stats, - ), - - TP_printk("dev %d,%d fc ineligible reasons:\n" -- "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u " -+ "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u" - "num_commits:%lu, ineligible: %lu, numblks: %lu", - MAJOR(__entry->dev), MINOR(__entry->dev), - FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), -@@ -2776,6 +2778,7 @@ TRACE_EVENT(ext4_fc_stats, - FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), - FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), -+ FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME), - __entry->fc_commits, __entry->fc_ineligible_commits, - __entry->fc_numblks) - ); -diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h -index 99f783c384bb4..8f5ee380d3093 100644 ---- a/include/trace/events/jbd2.h -+++ b/include/trace/events/jbd2.h -@@ -40,7 +40,7 @@ DECLARE_EVENT_CLASS(jbd2_commit, - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( char, sync_commit ) -- __field( int, transaction ) -+ __field( tid_t, transaction ) - ), - - TP_fast_assign( -@@ -49,7 +49,7 @@ DECLARE_EVENT_CLASS(jbd2_commit, - __entry->transaction = commit_transaction->t_tid; - ), - -- TP_printk("dev %d,%d transaction %d sync %d", -+ TP_printk("dev %d,%d transaction %u sync %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->transaction, __entry->sync_commit) - ); -@@ -97,8 +97,8 @@ TRACE_EVENT(jbd2_end_commit, - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( char, sync_commit ) -- __field( int, transaction ) -- __field( int, head ) -+ __field( tid_t, transaction ) -+ __field( tid_t, head ) - ), - - TP_fast_assign( -@@ -108,7 +108,7 @@ TRACE_EVENT(jbd2_end_commit, - __entry->head = journal->j_tail_sequence; - ), - -- TP_printk("dev %d,%d transaction %d sync %d head %d", -+ TP_printk("dev %d,%d transaction %u sync %d head %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->transaction, __entry->sync_commit, __entry->head) - ); -@@ -134,14 +134,14 @@ TRACE_EVENT(jbd2_submit_inode_data, - ); - - DECLARE_EVENT_CLASS(jbd2_handle_start_class, -- TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, -+ TP_PROTO(dev_t dev, tid_t tid, unsigned int type, - unsigned int line_no, int requested_blocks), - - TP_ARGS(dev, tid, type, line_no, requested_blocks), - - TP_STRUCT__entry( - __field( dev_t, dev ) -- __field( unsigned long, tid ) -+ __field( tid_t, tid ) - __field( unsigned int, type ) - __field( unsigned int, line_no ) - __field( int, requested_blocks) -@@ -155,28 +155,28 @@ DECLARE_EVENT_CLASS(jbd2_handle_start_class, - __entry->requested_blocks = requested_blocks; - ), - -- TP_printk("dev %d,%d tid %lu type %u line_no %u " -+ TP_printk("dev %d,%d tid %u type %u line_no %u " - "requested_blocks %d", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, - __entry->type, __entry->line_no, __entry->requested_blocks) - ); - - DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start, -- TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, -+ TP_PROTO(dev_t dev, tid_t tid, unsigned int type, - unsigned int line_no, int requested_blocks), - - TP_ARGS(dev, tid, type, line_no, requested_blocks) - ); - - DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart, -- TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, -+ TP_PROTO(dev_t dev, tid_t tid, unsigned int type, - unsigned int line_no, int requested_blocks), - - TP_ARGS(dev, tid, type, line_no, requested_blocks) - ); - - TRACE_EVENT(jbd2_handle_extend, -- TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, -+ TP_PROTO(dev_t dev, tid_t tid, unsigned int type, - unsigned int line_no, int buffer_credits, - int requested_blocks), - -@@ -184,7 +184,7 @@ TRACE_EVENT(jbd2_handle_extend, - - TP_STRUCT__entry( - __field( dev_t, dev ) -- __field( unsigned long, tid ) -+ __field( tid_t, tid ) - __field( unsigned int, type ) - __field( unsigned int, line_no ) - __field( int, buffer_credits ) -@@ -200,7 +200,7 @@ TRACE_EVENT(jbd2_handle_extend, - __entry->requested_blocks = requested_blocks; - ), - -- TP_printk("dev %d,%d tid %lu type %u line_no %u " -+ TP_printk("dev %d,%d tid %u type %u line_no %u " - "buffer_credits %d requested_blocks %d", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, - __entry->type, __entry->line_no, __entry->buffer_credits, -@@ -208,7 +208,7 @@ TRACE_EVENT(jbd2_handle_extend, - ); - - TRACE_EVENT(jbd2_handle_stats, -- TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, -+ TP_PROTO(dev_t dev, tid_t tid, unsigned int type, - unsigned int line_no, int interval, int sync, - int requested_blocks, int dirtied_blocks), - -@@ -217,7 +217,7 @@ TRACE_EVENT(jbd2_handle_stats, - - TP_STRUCT__entry( - __field( dev_t, dev ) -- __field( unsigned long, tid ) -+ __field( tid_t, tid ) - __field( unsigned int, type ) - __field( unsigned int, line_no ) - __field( int, interval ) -@@ -237,7 +237,7 @@ TRACE_EVENT(jbd2_handle_stats, - __entry->dirtied_blocks = dirtied_blocks; - ), - -- TP_printk("dev %d,%d tid %lu type %u line_no %u interval %d " -+ TP_printk("dev %d,%d tid %u type %u line_no %u interval %d " - "sync %d requested_blocks %d dirtied_blocks %d", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, - __entry->type, __entry->line_no, __entry->interval, -@@ -246,14 +246,14 @@ TRACE_EVENT(jbd2_handle_stats, - ); - - TRACE_EVENT(jbd2_run_stats, -- TP_PROTO(dev_t dev, unsigned long tid, -+ TP_PROTO(dev_t dev, tid_t tid, - struct transaction_run_stats_s *stats), - - TP_ARGS(dev, tid, stats), - - TP_STRUCT__entry( - __field( dev_t, dev ) -- __field( unsigned long, tid ) -+ __field( tid_t, tid ) - __field( unsigned long, wait ) - __field( unsigned long, request_delay ) - __field( unsigned long, running ) -@@ -279,7 +279,7 @@ TRACE_EVENT(jbd2_run_stats, - __entry->blocks_logged = stats->rs_blocks_logged; - ), - -- TP_printk("dev %d,%d tid %lu wait %u request_delay %u running %u " -+ TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u " - "locked %u flushing %u logging %u handle_count %u " - "blocks %u blocks_logged %u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, -@@ -294,14 +294,14 @@ TRACE_EVENT(jbd2_run_stats, - ); - - TRACE_EVENT(jbd2_checkpoint_stats, -- TP_PROTO(dev_t dev, unsigned long tid, -+ TP_PROTO(dev_t dev, tid_t tid, - struct transaction_chp_stats_s *stats), - - TP_ARGS(dev, tid, stats), - - TP_STRUCT__entry( - __field( dev_t, dev ) -- __field( unsigned long, tid ) -+ __field( tid_t, tid ) - __field( unsigned long, chp_time ) - __field( __u32, forced_to_close ) - __field( __u32, written ) -@@ -317,7 +317,7 @@ TRACE_EVENT(jbd2_checkpoint_stats, - __entry->dropped = stats->cs_dropped; - ), - -- TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " -+ TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u " - "written %u dropped %u", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, - jiffies_to_msecs(__entry->chp_time), -diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c -index 25a54e04560e5..17ab3e15ac25f 100644 ---- a/kernel/bpf/core.c -+++ b/kernel/bpf/core.c -@@ -2088,6 +2088,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, - bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) - { -+ enum bpf_prog_type prog_type = resolve_prog_type(fp); - bool ret; - - if (fp->kprobe_override) -@@ -2098,12 +2099,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map, - /* There's no owner yet where we could check for - * compatibility. - */ -- map->owner.type = fp->type; -+ map->owner.type = prog_type; - map->owner.jited = fp->jited; - map->owner.xdp_has_frags = fp->aux->xdp_has_frags; - ret = true; - } else { -- ret = map->owner.type == fp->type && -+ ret = map->owner.type == prog_type && - map->owner.jited == fp->jited && - map->owner.xdp_has_frags == fp->aux->xdp_has_frags; - } -diff --git a/kernel/events/core.c b/kernel/events/core.c -index 732b392fc5c63..3b9e86108f435 100644 ---- a/kernel/events/core.c -+++ b/kernel/events/core.c -@@ -12231,12 +12231,12 @@ SYSCALL_DEFINE5(perf_event_open, - if (flags & ~PERF_FLAG_ALL) - return -EINVAL; - -- /* Do we allow access to perf_event_open(2) ? */ -- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); -+ err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - -- err = perf_copy_attr(attr_uptr, &attr); -+ /* Do we allow access to perf_event_open(2) ? */ -+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); - if (err) - return err; - -diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig -index e9e95c790b8ee..93d7249962833 100644 ---- a/kernel/trace/Kconfig -+++ b/kernel/trace/Kconfig -@@ -375,6 +375,7 @@ config SCHED_TRACER - config HWLAT_TRACER - bool "Tracer to detect hardware latencies (like SMIs)" - select GENERIC_TRACER -+ select TRACER_MAX_TRACE - help - This tracer, when enabled will create one or more kernel threads, - depending on what the cpumask file is set to, which each thread -@@ -410,6 +411,7 @@ config HWLAT_TRACER - config OSNOISE_TRACER - bool "OS Noise tracer" - select GENERIC_TRACER -+ select TRACER_MAX_TRACE - help - In the context of high-performance computing (HPC), the Operating - System Noise (osnoise) refers to the interference experienced by an -diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c -index 5cfc95a52bc37..3076af8dbf32e 100644 ---- a/kernel/trace/trace.c -+++ b/kernel/trace/trace.c -@@ -1421,6 +1421,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) - return false; - } - EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); -+#define free_snapshot(tr) do { } while (0) - #endif /* CONFIG_TRACER_SNAPSHOT */ - - void tracer_tracing_off(struct trace_array *tr) -@@ -1692,6 +1693,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) - } - - unsigned long __read_mostly tracing_thresh; -+ -+#ifdef CONFIG_TRACER_MAX_TRACE - static const struct file_operations tracing_max_lat_fops; - - #ifdef LATENCY_FS_NOTIFY -@@ -1748,18 +1751,14 @@ void latency_fsnotify(struct trace_array *tr) - irq_work_queue(&tr->fsnotify_irqwork); - } - --#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ -- || defined(CONFIG_OSNOISE_TRACER) -+#else /* !LATENCY_FS_NOTIFY */ - - #define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ - d_tracer, &tr->max_latency, &tracing_max_lat_fops) - --#else --#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) - #endif - --#ifdef CONFIG_TRACER_MAX_TRACE - /* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, -@@ -1834,14 +1833,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, - ring_buffer_record_off(tr->max_buffer.buffer); - - #ifdef CONFIG_TRACER_SNAPSHOT -- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) -- goto out_unlock; -+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { -+ arch_spin_unlock(&tr->max_lock); -+ return; -+ } - #endif - swap(tr->array_buffer.buffer, tr->max_buffer.buffer); - - __update_max_tr(tr, tsk, cpu); - -- out_unlock: - arch_spin_unlock(&tr->max_lock); - } - -@@ -1888,6 +1888,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) - __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&tr->max_lock); - } -+ - #endif /* CONFIG_TRACER_MAX_TRACE */ - - static int wait_on_pipe(struct trace_iterator *iter, int full) -@@ -6572,7 +6573,7 @@ out: - return ret; - } - --#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) -+#ifdef CONFIG_TRACER_MAX_TRACE - - static ssize_t - tracing_max_lat_read(struct file *filp, char __user *ubuf, -@@ -6796,7 +6797,20 @@ waitagain: - - ret = print_trace_line(iter); - if (ret == TRACE_TYPE_PARTIAL_LINE) { -- /* don't print partial lines */ -+ /* -+ * If one print_trace_line() fills entire trace_seq in one shot, -+ * trace_seq_to_user() will returns -EBUSY because save_len == 0, -+ * In this case, we need to consume it, otherwise, loop will peek -+ * this event next time, resulting in an infinite loop. -+ */ -+ if (save_len == 0) { -+ iter->seq.full = 0; -+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); -+ trace_consume(iter); -+ break; -+ } -+ -+ /* In other cases, don't print partial lines */ - iter->seq.seq.len = save_len; - break; - } -@@ -7587,7 +7601,7 @@ static const struct file_operations tracing_thresh_fops = { - .llseek = generic_file_llseek, - }; - --#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) -+#ifdef CONFIG_TRACER_MAX_TRACE - static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, - .read = tracing_max_lat_read, -@@ -9601,7 +9615,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) - - create_trace_options_dir(tr); - -+#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_maxlat_file(tr, d_tracer); -+#endif - - if (ftrace_create_function_files(tr, d_tracer)) - MEM_FAIL(1, "Could not allocate function filter files"); -diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h -index d42e245071525..5581754d97628 100644 ---- a/kernel/trace/trace.h -+++ b/kernel/trace/trace.h -@@ -308,8 +308,7 @@ struct trace_array { - struct array_buffer max_buffer; - bool allocated_snapshot; - #endif --#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ -- || defined(CONFIG_OSNOISE_TRACER) -+#ifdef CONFIG_TRACER_MAX_TRACE - unsigned long max_latency; - #ifdef CONFIG_FSNOTIFY - struct dentry *d_max_latency; -@@ -688,12 +687,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, - void *cond_data); - void update_max_tr_single(struct trace_array *tr, - struct task_struct *tsk, int cpu); --#endif /* CONFIG_TRACER_MAX_TRACE */ - --#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ -- || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) -+#ifdef CONFIG_FSNOTIFY - #define LATENCY_FS_NOTIFY - #endif -+#endif /* CONFIG_TRACER_MAX_TRACE */ - - #ifdef LATENCY_FS_NOTIFY - void latency_fsnotify(struct trace_array *tr); -@@ -1956,17 +1954,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) - } - - /* Check the name is good for event/group/fields */ --static inline bool is_good_name(const char *name) -+static inline bool __is_good_name(const char *name, bool hash_ok) - { -- if (!isalpha(*name) && *name != '_') -+ if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) - return false; - while (*++name != '\0') { -- if (!isalpha(*name) && !isdigit(*name) && *name != '_') -+ if (!isalpha(*name) && !isdigit(*name) && *name != '_' && -+ (!hash_ok || *name != '-')) - return false; - } - return true; - } - -+/* Check the name is good for event/group/fields */ -+static inline bool is_good_name(const char *name) -+{ -+ return __is_good_name(name, false); -+} -+ -+/* Check the name is good for system */ -+static inline bool is_good_system_name(const char *name) -+{ -+ return __is_good_name(name, true); -+} -+ - /* Convert certain expected symbols into '_' when generating event names */ - static inline void sanitize_event_name(char *name) - { -diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c -index 352b65e2b9105..753fc536525d3 100644 ---- a/kernel/trace/trace_eprobe.c -+++ b/kernel/trace/trace_eprobe.c -@@ -564,6 +564,9 @@ static void eprobe_trigger_func(struct event_trigger_data *data, - { - struct eprobe_data *edata = data->private_data; - -+ if (unlikely(!rec)) -+ return; -+ - if (unlikely(!rec)) - return; - -diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c -index b6e5724a9ea35..c6e406995c112 100644 ---- a/kernel/trace/trace_events_hist.c -+++ b/kernel/trace/trace_events_hist.c -@@ -617,7 +617,7 @@ struct action_data { - * event param, and is passed to the synthetic event - * invocation. - */ -- unsigned int var_ref_idx[TRACING_MAP_VARS_MAX]; -+ unsigned int var_ref_idx[SYNTH_FIELDS_MAX]; - struct synth_event *synth_event; - bool use_trace_keyword; - char *synth_event_name; -@@ -2173,7 +2173,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, - return ref_field; - } - } -- -+ /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */ -+ if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX) -+ return NULL; - ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); - if (ref_field) { - if (init_var_ref(ref_field, var_field, system, event_name)) { -@@ -3586,6 +3588,7 @@ static int parse_action_params(struct trace_array *tr, char *params, - while (params) { - if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); -+ ret = -EINVAL; - goto out; - } - -@@ -3922,6 +3925,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data, - - lockdep_assert_held(&event_mutex); - -+ /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */ -+ if (data->n_params > SYNTH_FIELDS_MAX) -+ return -EINVAL; -+ - if (data->use_trace_keyword) - synth_event_name = data->synth_event_name; - else -diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c -index c3b582d19b620..67592eed0be8d 100644 ---- a/kernel/trace/trace_events_synth.c -+++ b/kernel/trace/trace_events_synth.c -@@ -1282,12 +1282,12 @@ static int __create_synth_event(const char *name, const char *raw_fields) - goto err_free_arg; - } - -- fields[n_fields++] = field; - if (n_fields == SYNTH_FIELDS_MAX) { - synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); - ret = -EINVAL; - goto err_free_arg; - } -+ fields[n_fields++] = field; - - n_fields_this_loop++; - } -diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c -index 36dff277de464..bb2f95d7175c2 100644 ---- a/kernel/trace/trace_probe.c -+++ b/kernel/trace/trace_probe.c -@@ -246,7 +246,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, - return -EINVAL; - } - strlcpy(buf, event, slash - event + 1); -- if (!is_good_name(buf)) { -+ if (!is_good_system_name(buf)) { - trace_probe_log_err(offset, BAD_GROUP_NAME); - return -EINVAL; - } -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 3638b3424be53..12dfe6691dd52 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -2092,6 +2092,7 @@ config TEST_MIN_HEAP - config TEST_SORT - tristate "Array-based sort test" if !KUNIT_ALL_TESTS - depends on KUNIT -+ select STACKTRACE if ARCH_CORRECT_STACKTRACE_ON_KRETPROBE - default KUNIT_ALL_TESTS - help - This option enables the self-test function of 'sort()' at boot, -diff --git a/mm/hugetlb.c b/mm/hugetlb.c -index e36ca75311a5c..9c251faeb6f59 100644 ---- a/mm/hugetlb.c -+++ b/mm/hugetlb.c -@@ -255,6 +255,152 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) - return subpool_inode(file_inode(vma->vm_file)); - } - -+/* -+ * hugetlb vma_lock helper routines -+ */ -+static bool __vma_shareable_lock(struct vm_area_struct *vma) -+{ -+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && -+ vma->vm_private_data; -+} -+ -+void hugetlb_vma_lock_read(struct vm_area_struct *vma) -+{ -+ if (__vma_shareable_lock(vma)) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ down_read(&vma_lock->rw_sema); -+ } -+} -+ -+void hugetlb_vma_unlock_read(struct vm_area_struct *vma) -+{ -+ if (__vma_shareable_lock(vma)) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ up_read(&vma_lock->rw_sema); -+ } -+} -+ -+void hugetlb_vma_lock_write(struct vm_area_struct *vma) -+{ -+ if (__vma_shareable_lock(vma)) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ down_write(&vma_lock->rw_sema); -+ } -+} -+ -+void hugetlb_vma_unlock_write(struct vm_area_struct *vma) -+{ -+ if (__vma_shareable_lock(vma)) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ up_write(&vma_lock->rw_sema); -+ } -+} -+ -+int hugetlb_vma_trylock_write(struct vm_area_struct *vma) -+{ -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ if (!__vma_shareable_lock(vma)) -+ return 1; -+ -+ return down_write_trylock(&vma_lock->rw_sema); -+} -+ -+void hugetlb_vma_assert_locked(struct vm_area_struct *vma) -+{ -+ if (__vma_shareable_lock(vma)) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ lockdep_assert_held(&vma_lock->rw_sema); -+ } -+} -+ -+void hugetlb_vma_lock_release(struct kref *kref) -+{ -+ struct hugetlb_vma_lock *vma_lock = container_of(kref, -+ struct hugetlb_vma_lock, refs); -+ -+ kfree(vma_lock); -+} -+ -+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) -+{ -+ struct vm_area_struct *vma = vma_lock->vma; -+ -+ /* -+ * vma_lock structure may or not be released as a result of put, -+ * it certainly will no longer be attached to vma so clear pointer. -+ * Semaphore synchronizes access to vma_lock->vma field. -+ */ -+ vma_lock->vma = NULL; -+ vma->vm_private_data = NULL; -+ up_write(&vma_lock->rw_sema); -+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release); -+} -+ -+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) -+{ -+ if (__vma_shareable_lock(vma)) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ __hugetlb_vma_unlock_write_put(vma_lock); -+ } -+} -+ -+static void hugetlb_vma_lock_free(struct vm_area_struct *vma) -+{ -+ /* -+ * Only present in sharable vmas. -+ */ -+ if (!vma || !__vma_shareable_lock(vma)) -+ return; -+ -+ if (vma->vm_private_data) { -+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -+ -+ down_write(&vma_lock->rw_sema); -+ __hugetlb_vma_unlock_write_put(vma_lock); -+ } -+} -+ -+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) -+{ -+ struct hugetlb_vma_lock *vma_lock; -+ -+ /* Only establish in (flags) sharable vmas */ -+ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) -+ return; -+ -+ /* Should never get here with non-NULL vm_private_data */ -+ if (vma->vm_private_data) -+ return; -+ -+ vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); -+ if (!vma_lock) { -+ /* -+ * If we can not allocate structure, then vma can not -+ * participate in pmd sharing. This is only a possible -+ * performance enhancement and memory saving issue. -+ * However, the lock is also used to synchronize page -+ * faults with truncation. If the lock is not present, -+ * unlikely races could leave pages in a file past i_size -+ * until the file is removed. Warn in the unlikely case of -+ * allocation failure. -+ */ -+ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); -+ return; -+ } -+ -+ kref_init(&vma_lock->refs); -+ init_rwsem(&vma_lock->rw_sema); -+ vma_lock->vma = vma; -+ vma->vm_private_data = vma_lock; -+} -+ - /* Helper that removes a struct file_region from the resv_map cache and returns - * it for use. - */ -@@ -6557,7 +6703,8 @@ bool hugetlb_reserve_pages(struct inode *inode, - } - - /* -- * vma specific semaphore used for pmd sharing synchronization -+ * vma specific semaphore used for pmd sharing and fault/truncation -+ * synchronization - */ - hugetlb_vma_lock_alloc(vma); - -@@ -6813,149 +6960,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, - *end = ALIGN(*end, PUD_SIZE); - } - --static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma) --{ -- return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && -- vma->vm_private_data; --} -- --void hugetlb_vma_lock_read(struct vm_area_struct *vma) --{ -- if (__vma_shareable_flags_pmd(vma)) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- down_read(&vma_lock->rw_sema); -- } --} -- --void hugetlb_vma_unlock_read(struct vm_area_struct *vma) --{ -- if (__vma_shareable_flags_pmd(vma)) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- up_read(&vma_lock->rw_sema); -- } --} -- --void hugetlb_vma_lock_write(struct vm_area_struct *vma) --{ -- if (__vma_shareable_flags_pmd(vma)) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- down_write(&vma_lock->rw_sema); -- } --} -- --void hugetlb_vma_unlock_write(struct vm_area_struct *vma) --{ -- if (__vma_shareable_flags_pmd(vma)) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- up_write(&vma_lock->rw_sema); -- } --} -- --int hugetlb_vma_trylock_write(struct vm_area_struct *vma) --{ -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- if (!__vma_shareable_flags_pmd(vma)) -- return 1; -- -- return down_write_trylock(&vma_lock->rw_sema); --} -- --void hugetlb_vma_assert_locked(struct vm_area_struct *vma) --{ -- if (__vma_shareable_flags_pmd(vma)) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- lockdep_assert_held(&vma_lock->rw_sema); -- } --} -- --void hugetlb_vma_lock_release(struct kref *kref) --{ -- struct hugetlb_vma_lock *vma_lock = container_of(kref, -- struct hugetlb_vma_lock, refs); -- -- kfree(vma_lock); --} -- --static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) --{ -- struct vm_area_struct *vma = vma_lock->vma; -- -- /* -- * vma_lock structure may or not be released as a result of put, -- * it certainly will no longer be attached to vma so clear pointer. -- * Semaphore synchronizes access to vma_lock->vma field. -- */ -- vma_lock->vma = NULL; -- vma->vm_private_data = NULL; -- up_write(&vma_lock->rw_sema); -- kref_put(&vma_lock->refs, hugetlb_vma_lock_release); --} -- --static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) --{ -- if (__vma_shareable_flags_pmd(vma)) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- __hugetlb_vma_unlock_write_put(vma_lock); -- } --} -- --static void hugetlb_vma_lock_free(struct vm_area_struct *vma) --{ -- /* -- * Only present in sharable vmas. -- */ -- if (!vma || !__vma_shareable_flags_pmd(vma)) -- return; -- -- if (vma->vm_private_data) { -- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; -- -- down_write(&vma_lock->rw_sema); -- __hugetlb_vma_unlock_write_put(vma_lock); -- } --} -- --static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) --{ -- struct hugetlb_vma_lock *vma_lock; -- -- /* Only establish in (flags) sharable vmas */ -- if (!vma || !(vma->vm_flags & VM_MAYSHARE)) -- return; -- -- /* Should never get here with non-NULL vm_private_data */ -- if (vma->vm_private_data) -- return; -- -- vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); -- if (!vma_lock) { -- /* -- * If we can not allocate structure, then vma can not -- * participate in pmd sharing. This is only a possible -- * performance enhancement and memory saving issue. -- * However, the lock is also used to synchronize page -- * faults with truncation. If the lock is not present, -- * unlikely races could leave pages in a file past i_size -- * until the file is removed. Warn in the unlikely case of -- * allocation failure. -- */ -- pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); -- return; -- } -- -- kref_init(&vma_lock->refs); -- init_rwsem(&vma_lock->rw_sema); -- vma_lock->vma = vma; -- vma->vm_private_data = vma_lock; --} -- - /* - * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() - * and returns the corresponding pte. While this is not necessary for the -@@ -7044,47 +7048,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - - #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ - --void hugetlb_vma_lock_read(struct vm_area_struct *vma) --{ --} -- --void hugetlb_vma_unlock_read(struct vm_area_struct *vma) --{ --} -- --void hugetlb_vma_lock_write(struct vm_area_struct *vma) --{ --} -- --void hugetlb_vma_unlock_write(struct vm_area_struct *vma) --{ --} -- --int hugetlb_vma_trylock_write(struct vm_area_struct *vma) --{ -- return 1; --} -- --void hugetlb_vma_assert_locked(struct vm_area_struct *vma) --{ --} -- --void hugetlb_vma_lock_release(struct kref *kref) --{ --} -- --static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) --{ --} -- --static void hugetlb_vma_lock_free(struct vm_area_struct *vma) --{ --} -- --static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) --{ --} -- - pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pud_t *pud) - { -diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c -index 942d2dfa11151..26fb97d1d4d9a 100644 ---- a/net/ipv4/syncookies.c -+++ b/net/ipv4/syncookies.c -@@ -288,12 +288,11 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - struct tcp_request_sock *treq; - struct request_sock *req; - --#ifdef CONFIG_MPTCP - if (sk_is_mptcp(sk)) -- ops = &mptcp_subflow_request_sock_ops; --#endif -+ req = mptcp_subflow_reqsk_alloc(ops, sk, false); -+ else -+ req = inet_reqsk_alloc(ops, sk, false); - -- req = inet_reqsk_alloc(ops, sk, false); - if (!req) - return NULL; - -diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c -index 9e82250cbb703..0430415357ba3 100644 ---- a/net/mptcp/pm_userspace.c -+++ b/net/mptcp/pm_userspace.c -@@ -156,6 +156,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) - - if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - GENL_SET_ERR_MSG(info, "invalid addr id or flags"); -+ err = -EINVAL; - goto announce_err; - } - -@@ -282,6 +283,7 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) - - if (addr_l.id == 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); -+ err = -EINVAL; - goto create_err; - } - -@@ -395,11 +397,13 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) - - if (addr_l.family != addr_r.family) { - GENL_SET_ERR_MSG(info, "address families do not match"); -+ err = -EINVAL; - goto destroy_err; - } - - if (!addr_l.port || !addr_r.port) { - GENL_SET_ERR_MSG(info, "missing local or remote port"); -+ err = -EINVAL; - goto destroy_err; - } - -diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c -index 2159b5f9988f8..613f515fedf0a 100644 ---- a/net/mptcp/subflow.c -+++ b/net/mptcp/subflow.c -@@ -45,7 +45,6 @@ static void subflow_req_destructor(struct request_sock *req) - sock_put((struct sock *)subflow_req->msk); - - mptcp_token_destroy_request(req); -- tcp_request_sock_ops.destructor(req); - } - - static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, -@@ -529,7 +528,7 @@ static int subflow_v6_rebuild_header(struct sock *sk) - } - #endif - --struct request_sock_ops mptcp_subflow_request_sock_ops; -+static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; - static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; - - static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) -@@ -542,7 +541,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) - if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - goto drop; - -- return tcp_conn_request(&mptcp_subflow_request_sock_ops, -+ return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, - &subflow_request_sock_ipv4_ops, - sk, skb); - drop: -@@ -550,7 +549,14 @@ drop: - return 0; - } - -+static void subflow_v4_req_destructor(struct request_sock *req) -+{ -+ subflow_req_destructor(req); -+ tcp_request_sock_ops.destructor(req); -+} -+ - #if IS_ENABLED(CONFIG_MPTCP_IPV6) -+static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; - static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; - static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; - static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; -@@ -573,15 +579,36 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) - return 0; - } - -- return tcp_conn_request(&mptcp_subflow_request_sock_ops, -+ return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, - &subflow_request_sock_ipv6_ops, sk, skb); - - drop: - tcp_listendrop(sk); - return 0; /* don't send reset */ - } -+ -+static void subflow_v6_req_destructor(struct request_sock *req) -+{ -+ subflow_req_destructor(req); -+ tcp6_request_sock_ops.destructor(req); -+} -+#endif -+ -+struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, -+ struct sock *sk_listener, -+ bool attach_listener) -+{ -+ if (ops->family == AF_INET) -+ ops = &mptcp_subflow_v4_request_sock_ops; -+#if IS_ENABLED(CONFIG_MPTCP_IPV6) -+ else if (ops->family == AF_INET6) -+ ops = &mptcp_subflow_v6_request_sock_ops; - #endif - -+ return inet_reqsk_alloc(ops, sk_listener, attach_listener); -+} -+EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); -+ - /* validate hmac received in third ACK */ - static bool subflow_hmac_valid(const struct request_sock *req, - const struct mptcp_options_received *mp_opt) -@@ -1904,7 +1931,6 @@ static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { - static int subflow_ops_init(struct request_sock_ops *subflow_ops) - { - subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); -- subflow_ops->slab_name = "request_sock_subflow"; - - subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, - subflow_ops->obj_size, 0, -@@ -1914,16 +1940,17 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) - if (!subflow_ops->slab) - return -ENOMEM; - -- subflow_ops->destructor = subflow_req_destructor; -- - return 0; - } - - void __init mptcp_subflow_init(void) - { -- mptcp_subflow_request_sock_ops = tcp_request_sock_ops; -- if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0) -- panic("MPTCP: failed to init subflow request sock ops\n"); -+ mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; -+ mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; -+ mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; -+ -+ if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) -+ panic("MPTCP: failed to init subflow v4 request sock ops\n"); - - subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; - subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; -@@ -1938,6 +1965,20 @@ void __init mptcp_subflow_init(void) - tcp_prot_override.release_cb = tcp_release_cb_override; - - #if IS_ENABLED(CONFIG_MPTCP_IPV6) -+ /* In struct mptcp_subflow_request_sock, we assume the TCP request sock -+ * structures for v4 and v6 have the same size. It should not changed in -+ * the future but better to make sure to be warned if it is no longer -+ * the case. -+ */ -+ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); -+ -+ mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; -+ mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; -+ mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; -+ -+ if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) -+ panic("MPTCP: failed to init subflow v6 request sock ops\n"); -+ - subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; - subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; - -diff --git a/security/device_cgroup.c b/security/device_cgroup.c -index a9f8c63a96d1a..bef2b9285fb34 100644 ---- a/security/device_cgroup.c -+++ b/security/device_cgroup.c -@@ -82,6 +82,17 @@ free_and_exit: - return -ENOMEM; - } - -+static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) -+{ -+ struct dev_exception_item *ex, *tmp; -+ -+ lockdep_assert_held(&devcgroup_mutex); -+ -+ list_for_each_entry_safe(ex, tmp, orig, list) { -+ list_move_tail(&ex->list, dest); -+ } -+} -+ - /* - * called under devcgroup_mutex - */ -@@ -604,11 +615,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, - int count, rc = 0; - struct dev_exception_item ex; - struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); -+ struct dev_cgroup tmp_devcgrp; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - memset(&ex, 0, sizeof(ex)); -+ memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); - b = buffer; - - switch (*b) { -@@ -620,15 +633,27 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, - - if (!may_allow_all(parent)) - return -EPERM; -- dev_exception_clean(devcgroup); -- devcgroup->behavior = DEVCG_DEFAULT_ALLOW; -- if (!parent) -+ if (!parent) { -+ devcgroup->behavior = DEVCG_DEFAULT_ALLOW; -+ dev_exception_clean(devcgroup); - break; -+ } - -+ INIT_LIST_HEAD(&tmp_devcgrp.exceptions); -+ rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, -+ &devcgroup->exceptions); -+ if (rc) -+ return rc; -+ dev_exception_clean(devcgroup); - rc = dev_exceptions_copy(&devcgroup->exceptions, - &parent->exceptions); -- if (rc) -+ if (rc) { -+ dev_exceptions_move(&devcgroup->exceptions, -+ &tmp_devcgrp.exceptions); - return rc; -+ } -+ devcgroup->behavior = DEVCG_DEFAULT_ALLOW; -+ dev_exception_clean(&tmp_devcgrp); - break; - case DEVCG_DENY: - if (css_has_online_children(&devcgroup->css)) -diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig -index 7249f16257c72..39caeca474449 100644 ---- a/security/integrity/ima/Kconfig -+++ b/security/integrity/ima/Kconfig -@@ -112,7 +112,7 @@ choice - - config IMA_DEFAULT_HASH_SM3 - bool "SM3" -- depends on CRYPTO_SM3=y -+ depends on CRYPTO_SM3_GENERIC=y - endchoice - - config IMA_DEFAULT_HASH -diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c -index 040b03ddc1c77..4a207a3ef7ef3 100644 ---- a/security/integrity/ima/ima_main.c -+++ b/security/integrity/ima/ima_main.c -@@ -542,8 +542,13 @@ static int __ima_inode_hash(struct inode *inode, struct file *file, char *buf, - - rc = ima_collect_measurement(&tmp_iint, file, NULL, 0, - ima_hash_algo, NULL); -- if (rc < 0) -+ if (rc < 0) { -+ /* ima_hash could be allocated in case of failure. */ -+ if (rc != -ENOMEM) -+ kfree(tmp_iint.ima_hash); -+ - return -EOPNOTSUPP; -+ } - - iint = &tmp_iint; - mutex_lock(&iint->mutex); -diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c -index 195ac18f09275..04c49f05cb74f 100644 ---- a/security/integrity/ima/ima_template.c -+++ b/security/integrity/ima/ima_template.c -@@ -340,8 +340,11 @@ static struct ima_template_desc *restore_template_fmt(char *template_name) - - template_desc->name = ""; - template_desc->fmt = kstrdup(template_name, GFP_KERNEL); -- if (!template_desc->fmt) -+ if (!template_desc->fmt) { -+ kfree(template_desc); -+ template_desc = NULL; - goto out; -+ } - - spin_lock(&template_list); - list_add_tail_rcu(&template_desc->list, &defined_templates); -diff --git a/security/integrity/platform_certs/load_uefi.c b/security/integrity/platform_certs/load_uefi.c -index b78753d27d8ea..d1fdd113450a6 100644 ---- a/security/integrity/platform_certs/load_uefi.c -+++ b/security/integrity/platform_certs/load_uefi.c -@@ -35,6 +35,7 @@ static const struct dmi_system_id uefi_skip_cert[] = { - { UEFI_QUIRK_SKIP_CERT("Apple Inc.", "MacPro7,1") }, - { UEFI_QUIRK_SKIP_CERT("Apple Inc.", "iMac20,1") }, - { UEFI_QUIRK_SKIP_CERT("Apple Inc.", "iMac20,2") }, -+ { UEFI_QUIRK_SKIP_CERT("Apple Inc.", "iMacPro1,1") }, - { } - }; - -diff --git a/sound/pci/hda/patch_cs8409.c b/sound/pci/hda/patch_cs8409.c -index 754aa8ddd2e4f..0ba1fbcbb21e4 100644 ---- a/sound/pci/hda/patch_cs8409.c -+++ b/sound/pci/hda/patch_cs8409.c -@@ -888,7 +888,7 @@ static void cs42l42_resume(struct sub_codec *cs42l42) - - /* Initialize CS42L42 companion codec */ - cs8409_i2c_bulk_write(cs42l42, cs42l42->init_seq, cs42l42->init_seq_num); -- usleep_range(20000, 25000); -+ usleep_range(30000, 35000); - - /* Clear interrupts, by reading interrupt status registers */ - cs8409_i2c_bulk_read(cs42l42, irq_regs, ARRAY_SIZE(irq_regs)); -diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c -index f5f640851fdcb..3794b522c2222 100644 ---- a/sound/pci/hda/patch_realtek.c -+++ b/sound/pci/hda/patch_realtek.c -@@ -6903,6 +6903,34 @@ static void alc287_fixup_yoga9_14iap7_bass_spk_pin(struct hda_codec *codec, - } - } - -+static void alc295_fixup_dell_inspiron_top_speakers(struct hda_codec *codec, -+ const struct hda_fixup *fix, int action) -+{ -+ static const struct hda_pintbl pincfgs[] = { -+ { 0x14, 0x90170151 }, -+ { 0x17, 0x90170150 }, -+ { } -+ }; -+ static const hda_nid_t conn[] = { 0x02, 0x03 }; -+ static const hda_nid_t preferred_pairs[] = { -+ 0x14, 0x02, -+ 0x17, 0x03, -+ 0x21, 0x02, -+ 0 -+ }; -+ struct alc_spec *spec = codec->spec; -+ -+ alc_fixup_no_shutup(codec, fix, action); -+ -+ switch (action) { -+ case HDA_FIXUP_ACT_PRE_PROBE: -+ snd_hda_apply_pincfgs(codec, pincfgs); -+ snd_hda_override_conn_list(codec, 0x17, ARRAY_SIZE(conn), conn); -+ spec->gen.preferred_dacs = preferred_pairs; -+ break; -+ } -+} -+ - enum { - ALC269_FIXUP_GPIO2, - ALC269_FIXUP_SONY_VAIO, -@@ -7146,6 +7174,8 @@ enum { - ALC287_FIXUP_LEGION_16ITHG6, - ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, - ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN, -+ ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS, -+ ALC236_FIXUP_DELL_DUAL_CODECS, - }; - - /* A special fixup for Lenovo C940 and Yoga Duet 7; -@@ -9095,6 +9125,18 @@ static const struct hda_fixup alc269_fixups[] = { - .chained = true, - .chain_id = ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, - }, -+ [ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS] = { -+ .type = HDA_FIXUP_FUNC, -+ .v.func = alc295_fixup_dell_inspiron_top_speakers, -+ .chained = true, -+ .chain_id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, -+ }, -+ [ALC236_FIXUP_DELL_DUAL_CODECS] = { -+ .type = HDA_FIXUP_PINS, -+ .v.func = alc1220_fixup_gb_dual_codecs, -+ .chained = true, -+ .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, -+ }, - }; - - static const struct snd_pci_quirk alc269_fixup_tbl[] = { -@@ -9195,6 +9237,14 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { - SND_PCI_QUIRK(0x1028, 0x0a9e, "Dell Latitude 5430", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x1028, 0x0b19, "Dell XPS 15 9520", ALC289_FIXUP_DUAL_SPK), - SND_PCI_QUIRK(0x1028, 0x0b1a, "Dell Precision 5570", ALC289_FIXUP_DUAL_SPK), -+ SND_PCI_QUIRK(0x1028, 0x0b37, "Dell Inspiron 16 Plus 7620 2-in-1", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), -+ SND_PCI_QUIRK(0x1028, 0x0b71, "Dell Inspiron 16 Plus 7620", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), -+ SND_PCI_QUIRK(0x1028, 0x0c19, "Dell Precision 3340", ALC236_FIXUP_DELL_DUAL_CODECS), -+ SND_PCI_QUIRK(0x1028, 0x0c1a, "Dell Precision 3340", ALC236_FIXUP_DELL_DUAL_CODECS), -+ SND_PCI_QUIRK(0x1028, 0x0c1b, "Dell Precision 3440", ALC236_FIXUP_DELL_DUAL_CODECS), -+ SND_PCI_QUIRK(0x1028, 0x0c1c, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), -+ SND_PCI_QUIRK(0x1028, 0x0c1d, "Dell Precision 3440", ALC236_FIXUP_DELL_DUAL_CODECS), -+ SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), - SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), -diff --git a/sound/soc/jz4740/jz4740-i2s.c b/sound/soc/jz4740/jz4740-i2s.c -index c4c1e89b47c1b..83cb81999c6fc 100644 ---- a/sound/soc/jz4740/jz4740-i2s.c -+++ b/sound/soc/jz4740/jz4740-i2s.c -@@ -55,7 +55,8 @@ - #define JZ_AIC_CTRL_MONO_TO_STEREO BIT(11) - #define JZ_AIC_CTRL_SWITCH_ENDIANNESS BIT(10) - #define JZ_AIC_CTRL_SIGNED_TO_UNSIGNED BIT(9) --#define JZ_AIC_CTRL_FLUSH BIT(8) -+#define JZ_AIC_CTRL_TFLUSH BIT(8) -+#define JZ_AIC_CTRL_RFLUSH BIT(7) - #define JZ_AIC_CTRL_ENABLE_ROR_INT BIT(6) - #define JZ_AIC_CTRL_ENABLE_TUR_INT BIT(5) - #define JZ_AIC_CTRL_ENABLE_RFS_INT BIT(4) -@@ -90,6 +91,8 @@ enum jz47xx_i2s_version { - struct i2s_soc_info { - enum jz47xx_i2s_version version; - struct snd_soc_dai_driver *dai; -+ -+ bool shared_fifo_flush; - }; - - struct jz4740_i2s { -@@ -116,19 +119,44 @@ static inline void jz4740_i2s_write(const struct jz4740_i2s *i2s, - writel(value, i2s->base + reg); - } - -+static inline void jz4740_i2s_set_bits(const struct jz4740_i2s *i2s, -+ unsigned int reg, uint32_t bits) -+{ -+ uint32_t value = jz4740_i2s_read(i2s, reg); -+ value |= bits; -+ jz4740_i2s_write(i2s, reg, value); -+} -+ - static int jz4740_i2s_startup(struct snd_pcm_substream *substream, - struct snd_soc_dai *dai) - { - struct jz4740_i2s *i2s = snd_soc_dai_get_drvdata(dai); -- uint32_t conf, ctrl; -+ uint32_t conf; - int ret; - -+ /* -+ * When we can flush FIFOs independently, only flush the FIFO -+ * that is starting up. We can do this when the DAI is active -+ * because it does not disturb other active substreams. -+ */ -+ if (!i2s->soc_info->shared_fifo_flush) { -+ if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) -+ jz4740_i2s_set_bits(i2s, JZ_REG_AIC_CTRL, JZ_AIC_CTRL_TFLUSH); -+ else -+ jz4740_i2s_set_bits(i2s, JZ_REG_AIC_CTRL, JZ_AIC_CTRL_RFLUSH); -+ } -+ - if (snd_soc_dai_active(dai)) - return 0; - -- ctrl = jz4740_i2s_read(i2s, JZ_REG_AIC_CTRL); -- ctrl |= JZ_AIC_CTRL_FLUSH; -- jz4740_i2s_write(i2s, JZ_REG_AIC_CTRL, ctrl); -+ /* -+ * When there is a shared flush bit for both FIFOs, the TFLUSH -+ * bit flushes both FIFOs. Flushing while the DAI is active would -+ * cause FIFO underruns in other active substreams so we have to -+ * guard this behind the snd_soc_dai_active() check. -+ */ -+ if (i2s->soc_info->shared_fifo_flush) -+ jz4740_i2s_set_bits(i2s, JZ_REG_AIC_CTRL, JZ_AIC_CTRL_TFLUSH); - - ret = clk_prepare_enable(i2s->clk_i2s); - if (ret) -@@ -443,6 +471,7 @@ static struct snd_soc_dai_driver jz4740_i2s_dai = { - static const struct i2s_soc_info jz4740_i2s_soc_info = { - .version = JZ_I2S_JZ4740, - .dai = &jz4740_i2s_dai, -+ .shared_fifo_flush = true, - }; - - static const struct i2s_soc_info jz4760_i2s_soc_info = { -diff --git a/sound/usb/card.h b/sound/usb/card.h -index 40061550105ac..6ec95b2edf863 100644 ---- a/sound/usb/card.h -+++ b/sound/usb/card.h -@@ -131,6 +131,7 @@ struct snd_usb_endpoint { - bool lowlatency_playback; /* low-latency playback mode */ - bool need_setup; /* (re-)need for hw_params? */ - bool need_prepare; /* (re-)need for prepare? */ -+ bool fixed_rate; /* skip rate setup */ - - /* for hw constraints */ - const struct audioformat *cur_audiofmt; -diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c -index 4aaf0784940b5..419302e2057e8 100644 ---- a/sound/usb/endpoint.c -+++ b/sound/usb/endpoint.c -@@ -769,7 +769,8 @@ struct snd_usb_endpoint * - snd_usb_endpoint_open(struct snd_usb_audio *chip, - const struct audioformat *fp, - const struct snd_pcm_hw_params *params, -- bool is_sync_ep) -+ bool is_sync_ep, -+ bool fixed_rate) - { - struct snd_usb_endpoint *ep; - int ep_num = is_sync_ep ? fp->sync_ep : fp->endpoint; -@@ -825,6 +826,7 @@ snd_usb_endpoint_open(struct snd_usb_audio *chip, - ep->implicit_fb_sync = fp->implicit_fb; - ep->need_setup = true; - ep->need_prepare = true; -+ ep->fixed_rate = fixed_rate; - - usb_audio_dbg(chip, " channels=%d, rate=%d, format=%s, period_bytes=%d, periods=%d, implicit_fb=%d\n", - ep->cur_channels, ep->cur_rate, -@@ -1413,11 +1415,13 @@ static int init_sample_rate(struct snd_usb_audio *chip, - if (clock && !clock->need_setup) - return 0; - -- err = snd_usb_init_sample_rate(chip, ep->cur_audiofmt, rate); -- if (err < 0) { -- if (clock) -- clock->rate = 0; /* reset rate */ -- return err; -+ if (!ep->fixed_rate) { -+ err = snd_usb_init_sample_rate(chip, ep->cur_audiofmt, rate); -+ if (err < 0) { -+ if (clock) -+ clock->rate = 0; /* reset rate */ -+ return err; -+ } - } - - if (clock) -diff --git a/sound/usb/endpoint.h b/sound/usb/endpoint.h -index e67ea28faa54f..924f4351588ce 100644 ---- a/sound/usb/endpoint.h -+++ b/sound/usb/endpoint.h -@@ -14,7 +14,8 @@ struct snd_usb_endpoint * - snd_usb_endpoint_open(struct snd_usb_audio *chip, - const struct audioformat *fp, - const struct snd_pcm_hw_params *params, -- bool is_sync_ep); -+ bool is_sync_ep, -+ bool fixed_rate); - void snd_usb_endpoint_close(struct snd_usb_audio *chip, - struct snd_usb_endpoint *ep); - int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, -diff --git a/sound/usb/implicit.c b/sound/usb/implicit.c -index f3e8484b3d9cb..41ac7185b42b6 100644 ---- a/sound/usb/implicit.c -+++ b/sound/usb/implicit.c -@@ -15,6 +15,7 @@ - #include "usbaudio.h" - #include "card.h" - #include "helper.h" -+#include "pcm.h" - #include "implicit.h" - - enum { -@@ -455,7 +456,8 @@ const struct audioformat * - snd_usb_find_implicit_fb_sync_format(struct snd_usb_audio *chip, - const struct audioformat *target, - const struct snd_pcm_hw_params *params, -- int stream) -+ int stream, -+ bool *fixed_rate) - { - struct snd_usb_substream *subs; - const struct audioformat *fp, *sync_fmt = NULL; -@@ -483,6 +485,8 @@ snd_usb_find_implicit_fb_sync_format(struct snd_usb_audio *chip, - } - } - -+ if (fixed_rate) -+ *fixed_rate = snd_usb_pcm_has_fixed_rate(subs); - return sync_fmt; - } - -diff --git a/sound/usb/implicit.h b/sound/usb/implicit.h -index ccb415a0ea860..7f1577b6c4d38 100644 ---- a/sound/usb/implicit.h -+++ b/sound/usb/implicit.h -@@ -9,6 +9,6 @@ const struct audioformat * - snd_usb_find_implicit_fb_sync_format(struct snd_usb_audio *chip, - const struct audioformat *target, - const struct snd_pcm_hw_params *params, -- int stream); -+ int stream, bool *fixed_rate); - - #endif /* __USBAUDIO_IMPLICIT_H */ -diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c -index 9557bd4d1bbca..99a66d0ef5b26 100644 ---- a/sound/usb/pcm.c -+++ b/sound/usb/pcm.c -@@ -157,6 +157,31 @@ find_substream_format(struct snd_usb_substream *subs, - true, subs); - } - -+bool snd_usb_pcm_has_fixed_rate(struct snd_usb_substream *subs) -+{ -+ const struct audioformat *fp; -+ struct snd_usb_audio *chip = subs->stream->chip; -+ int rate = -1; -+ -+ if (!(chip->quirk_flags & QUIRK_FLAG_FIXED_RATE)) -+ return false; -+ list_for_each_entry(fp, &subs->fmt_list, list) { -+ if (fp->rates & SNDRV_PCM_RATE_CONTINUOUS) -+ return false; -+ if (fp->nr_rates < 1) -+ continue; -+ if (fp->nr_rates > 1) -+ return false; -+ if (rate < 0) { -+ rate = fp->rate_table[0]; -+ continue; -+ } -+ if (rate != fp->rate_table[0]) -+ return false; -+ } -+ return true; -+} -+ - static int init_pitch_v1(struct snd_usb_audio *chip, int ep) - { - struct usb_device *dev = chip->dev; -@@ -450,12 +475,14 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, - struct snd_usb_audio *chip = subs->stream->chip; - const struct audioformat *fmt; - const struct audioformat *sync_fmt; -+ bool fixed_rate, sync_fixed_rate; - int ret; - - ret = snd_media_start_pipeline(subs); - if (ret) - return ret; - -+ fixed_rate = snd_usb_pcm_has_fixed_rate(subs); - fmt = find_substream_format(subs, hw_params); - if (!fmt) { - usb_audio_dbg(chip, -@@ -469,7 +496,8 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, - if (fmt->implicit_fb) { - sync_fmt = snd_usb_find_implicit_fb_sync_format(chip, fmt, - hw_params, -- !substream->stream); -+ !substream->stream, -+ &sync_fixed_rate); - if (!sync_fmt) { - usb_audio_dbg(chip, - "cannot find sync format: ep=0x%x, iface=%d:%d, format=%s, rate=%d, channels=%d\n", -@@ -482,6 +510,7 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, - } - } else { - sync_fmt = fmt; -+ sync_fixed_rate = fixed_rate; - } - - ret = snd_usb_lock_shutdown(chip); -@@ -499,7 +528,7 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, - close_endpoints(chip, subs); - } - -- subs->data_endpoint = snd_usb_endpoint_open(chip, fmt, hw_params, false); -+ subs->data_endpoint = snd_usb_endpoint_open(chip, fmt, hw_params, false, fixed_rate); - if (!subs->data_endpoint) { - ret = -EINVAL; - goto unlock; -@@ -508,7 +537,8 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, - if (fmt->sync_ep) { - subs->sync_endpoint = snd_usb_endpoint_open(chip, sync_fmt, - hw_params, -- fmt == sync_fmt); -+ fmt == sync_fmt, -+ sync_fixed_rate); - if (!subs->sync_endpoint) { - ret = -EINVAL; - goto unlock; -diff --git a/sound/usb/pcm.h b/sound/usb/pcm.h -index 493a4e34d78dc..388fe2ba346d6 100644 ---- a/sound/usb/pcm.h -+++ b/sound/usb/pcm.h -@@ -6,6 +6,8 @@ void snd_usb_set_pcm_ops(struct snd_pcm *pcm, int stream); - int snd_usb_pcm_suspend(struct snd_usb_stream *as); - int snd_usb_pcm_resume(struct snd_usb_stream *as); - -+bool snd_usb_pcm_has_fixed_rate(struct snd_usb_substream *as); -+ - int snd_usb_init_pitch(struct snd_usb_audio *chip, - const struct audioformat *fmt); - void snd_usb_preallocate_buffer(struct snd_usb_substream *subs); -diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c -index 58b37bfc885cb..3d13fdf7590cd 100644 ---- a/sound/usb/quirks.c -+++ b/sound/usb/quirks.c -@@ -2152,6 +2152,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { - QUIRK_FLAG_GENERIC_IMPLICIT_FB), - DEVICE_FLG(0x0525, 0xa4ad, /* Hamedal C20 usb camero */ - QUIRK_FLAG_IFACE_SKIP_CLOSE), -+ DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */ -+ QUIRK_FLAG_FIXED_RATE), - - /* Vendor matches */ - VENDOR_FLG(0x045e, /* MS Lifecam */ -diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h -index 2aba508a48312..f5a8dca66457f 100644 ---- a/sound/usb/usbaudio.h -+++ b/sound/usb/usbaudio.h -@@ -175,6 +175,9 @@ extern bool snd_usb_skip_validation; - * QUIRK_FLAG_FORCE_IFACE_RESET - * Force an interface reset whenever stopping & restarting a stream - * (e.g. after xrun) -+ * QUIRK_FLAG_FIXED_RATE -+ * Do not set PCM rate (frequency) when only one rate is available -+ * for the given endpoint. - */ - - #define QUIRK_FLAG_GET_SAMPLE_RATE (1U << 0) -@@ -198,5 +201,6 @@ extern bool snd_usb_skip_validation; - #define QUIRK_FLAG_SKIP_IMPLICIT_FB (1U << 18) - #define QUIRK_FLAG_IFACE_SKIP_CLOSE (1U << 19) - #define QUIRK_FLAG_FORCE_IFACE_RESET (1U << 20) -+#define QUIRK_FLAG_FIXED_RATE (1U << 21) - - #endif /* __USBAUDIO_H */ -diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl -index 09d1578f9d66f..1737c59e4ff67 100755 ---- a/tools/testing/ktest/ktest.pl -+++ b/tools/testing/ktest/ktest.pl -@@ -1963,7 +1963,7 @@ sub run_scp_mod { - - sub _get_grub_index { - -- my ($command, $target, $skip) = @_; -+ my ($command, $target, $skip, $submenu) = @_; - - return if (defined($grub_number) && defined($last_grub_menu) && - $last_grub_menu eq $grub_menu && defined($last_machine) && -@@ -1980,11 +1980,16 @@ sub _get_grub_index { - - my $found = 0; - -+ my $submenu_number = 0; -+ - while () { - if (/$target/) { - $grub_number++; - $found = 1; - last; -+ } elsif (defined($submenu) && /$submenu/) { -+ $submenu_number++; -+ $grub_number = -1; - } elsif (/$skip/) { - $grub_number++; - } -@@ -1993,6 +1998,9 @@ sub _get_grub_index { - - dodie "Could not find '$grub_menu' through $command on $machine" - if (!$found); -+ if ($submenu_number > 0) { -+ $grub_number = "$submenu_number>$grub_number"; -+ } - doprint "$grub_number\n"; - $last_grub_menu = $grub_menu; - $last_machine = $machine; -@@ -2003,6 +2011,7 @@ sub get_grub_index { - my $command; - my $target; - my $skip; -+ my $submenu; - my $grub_menu_qt; - - if ($reboot_type !~ /^grub/) { -@@ -2017,8 +2026,9 @@ sub get_grub_index { - $skip = '^\s*title\s'; - } elsif ($reboot_type eq "grub2") { - $command = "cat $grub_file"; -- $target = '^menuentry.*' . $grub_menu_qt; -- $skip = '^menuentry\s|^submenu\s'; -+ $target = '^\s*menuentry.*' . $grub_menu_qt; -+ $skip = '^\s*menuentry'; -+ $submenu = '^\s*submenu\s'; - } elsif ($reboot_type eq "grub2bls") { - $command = $grub_bls_get; - $target = '^title=.*' . $grub_menu_qt; -@@ -2027,7 +2037,7 @@ sub get_grub_index { - return; - } - -- _get_grub_index($command, $target, $skip); -+ _get_grub_index($command, $target, $skip, $submenu); - } - - sub wait_for_input { -@@ -2090,7 +2100,7 @@ sub reboot_to { - if ($reboot_type eq "grub") { - run_ssh "'(echo \"savedefault --default=$grub_number --once\" | grub --batch)'"; - } elsif (($reboot_type eq "grub2") or ($reboot_type eq "grub2bls")) { -- run_ssh "$grub_reboot $grub_number"; -+ run_ssh "$grub_reboot \"'$grub_number'\""; - } elsif ($reboot_type eq "syslinux") { - run_ssh "$syslinux --once \\\"$syslinux_label\\\" $syslinux_path"; - } elsif (defined $reboot_script) { -@@ -3768,9 +3778,10 @@ sub test_this_config { - # .config to make sure it is missing the config that - # we had before - my %configs = %min_configs; -- delete $configs{$config}; -+ $configs{$config} = "# $config is not set"; - make_new_config ((values %configs), (values %keep_configs)); - make_oldconfig; -+ delete $configs{$config}; - undef %configs; - assign_configs \%configs, $output_config; - -diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk -index a3ea3d4a206d0..291144c284fbc 100644 ---- a/tools/testing/selftests/lib.mk -+++ b/tools/testing/selftests/lib.mk -@@ -123,6 +123,11 @@ endef - clean: - $(CLEAN) - -+# Enables to extend CFLAGS and LDFLAGS from command line, e.g. -+# make USERCFLAGS=-Werror USERLDFLAGS=-static -+CFLAGS += $(USERCFLAGS) -+LDFLAGS += $(USERLDFLAGS) -+ - # When make O= with kselftest target from main level - # the following aren't defined. - # diff --git a/sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch b/sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch new file mode 100644 index 0000000..e8c3015 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch @@ -0,0 +1,17 @@ +--- a/fs/namei.c 2022-01-23 13:02:27.876558299 -0500 ++++ b/fs/namei.c 2022-03-06 12:47:39.375719693 -0500 +@@ -1020,10 +1020,10 @@ static inline void put_link(struct namei + path_put(&last->link); + } + +-static int sysctl_protected_symlinks __read_mostly; +-static int sysctl_protected_hardlinks __read_mostly; +-static int sysctl_protected_fifos __read_mostly; +-static int sysctl_protected_regular __read_mostly; ++static int sysctl_protected_symlinks __read_mostly = 1; ++static int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 1; ++int sysctl_protected_regular __read_mostly = 1; + + #ifdef CONFIG_SYSCTL + static struct ctl_table namei_sysctls[] = { diff --git a/sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v6.1-r0.patch b/sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v6.1-r0.patch deleted file mode 100644 index 783f3bc..0000000 --- a/sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v6.1-r0.patch +++ /dev/null @@ -1,10076 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 42af9ca0127e..31747ec54f9d 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -5406,6 +5406,12 @@ - sa1100ir [NET] - See drivers/net/irda/sa1100_ir.c. - -+ sched_timeslice= -+ [KNL] Time slice in ms for Project C BMQ/PDS scheduler. -+ Format: integer 2, 4 -+ Default: 4 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_verbose [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 98d1b198b2b4..d7c78a107f93 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1552,3 +1552,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ/PDS CPU scheduler only. This determines what type of yield calls -+to sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. -+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. -diff --git a/fs/proc/base.c b/fs/proc/base.c -index 9e479d7d202b..2a8530021b23 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/sched.h b/include/linux/sched.h -index ffb6eb55cd13..2e730a59caa2 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -762,8 +762,14 @@ struct task_struct { - unsigned int ptrace; - - #ifdef CONFIG_SMP -- int on_cpu; - struct __call_single_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) -+ int on_cpu; -+#endif -+ -+#ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -777,6 +783,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ - #endif - int on_rq; - -@@ -785,6 +792,20 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+ int sq_idx; -+ struct list_head sq_node; -+#ifdef CONFIG_SCHED_BMQ -+ int boost_prio; -+#endif /* CONFIG_SCHED_BMQ */ -+#ifdef CONFIG_SCHED_PDS -+ u64 deadline; -+#endif /* CONFIG_SCHED_PDS */ -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ - struct sched_entity se; - struct sched_rt_entity rt; - struct sched_dl_entity dl; -@@ -795,6 +816,7 @@ struct task_struct { - unsigned long core_cookie; - unsigned int core_occupation; - #endif -+#endif /* !CONFIG_SCHED_ALT */ - - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; -@@ -1545,6 +1567,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_ALT -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 7c83d4d5a971..fa30f98cb2be 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,24 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_ALT -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define __tsk_deadline(p) ((((u64) ((p)->prio))<<56) | (p)->deadline) -+#endif -+ -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -21,6 +40,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_ALT */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index ab83d85e1183..6af9ae681116 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -18,6 +18,32 @@ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+#ifdef CONFIG_SCHED_ALT -+ -+/* Undefine MAX_PRIO and DEFAULT_PRIO */ -+#undef MAX_PRIO -+#undef DEFAULT_PRIO -+ -+/* +/- priority levels from the base priority */ -+#ifdef CONFIG_SCHED_BMQ -+#define MAX_PRIORITY_ADJ (7) -+ -+#define MIN_NORMAL_PRIO (MAX_RT_PRIO) -+#define MAX_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH) -+#define DEFAULT_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH / 2) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define MAX_PRIORITY_ADJ (0) -+ -+#define MIN_NORMAL_PRIO (128) -+#define NORMAL_PRIO_NUM (64) -+#define MAX_PRIO (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM) -+#define DEFAULT_PRIO (MAX_PRIO - NICE_WIDTH / 2) -+#endif -+ -+#endif /* CONFIG_SCHED_ALT */ -+ - /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index 994c25640e15..8c050a59ece1 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_ALT - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h -index 816df6cc444e..c8da08e18c91 100644 ---- a/include/linux/sched/topology.h -+++ b/include/linux/sched/topology.h -@@ -234,7 +234,8 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) - - #endif /* !CONFIG_SMP */ - --#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && \ -+ !defined(CONFIG_SCHED_ALT) - extern void rebuild_sched_domains_energy(void); - #else - static inline void rebuild_sched_domains_energy(void) -diff --git a/init/Kconfig b/init/Kconfig -index 94125d3b6893..c87ba766d354 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -819,6 +819,7 @@ menu "Scheduler features" - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_ALT - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -865,6 +866,35 @@ config UCLAMP_BUCKETS_COUNT - - If in doubt, use the default value. - -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default n -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+config SCHED_PDS -+ bool "PDS CPU scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler. -+ -+endchoice -+ -+endif -+ - endmenu - - # -@@ -918,6 +948,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION && !PREEMPT_RT -+ depends on !SCHED_ALT - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -1015,6 +1046,7 @@ config FAIR_GROUP_SCHED - depends on CGROUP_SCHED - default CGROUP_SCHED - -+if !SCHED_ALT - config CFS_BANDWIDTH - bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" - depends on FAIR_GROUP_SCHED -@@ -1037,6 +1069,7 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. - -+endif #!SCHED_ALT - endif #CGROUP_SCHED - - config UCLAMP_TASK_GROUP -@@ -1281,6 +1314,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_ALT - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b..19e9c662d1a1 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -75,9 +75,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .user_cpus_ptr = NULL, -@@ -88,6 +94,17 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+ .sq_node = LIST_HEAD_INIT(init_task.sq_node), -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .sq_idx = 15, -+#endif -+#ifdef CONFIG_SCHED_PDS -+ .deadline = 0, -+#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -95,6 +112,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..41654679b1b2 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -117,7 +117,7 @@ config PREEMPT_DYNAMIC - - config SCHED_CORE - bool "Core Scheduling for SMT" -- depends on SCHED_SMT -+ depends on SCHED_SMT && !SCHED_ALT - help - This option permits Core Scheduling, a means of coordinated task - selection across SMT siblings. When enabled -- see -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index b474289c15b8..a23224b45b03 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -787,7 +787,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1183,7 +1183,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index e39cb696cfbd..463423572e09 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -150,7 +150,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 35e0a31a0315..64e368441cf4 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -125,7 +125,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -146,7 +146,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index 7779ee8abc2a..5b9893cdfb1b 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -300,21 +300,25 @@ static __always_inline void - waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) - { - waiter->prio = __waiter_prio(task); -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - } - - /* - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = __tsk_deadline(p) } - - static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline < right->deadline); -+#else - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -323,16 +327,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; -+#endif - } - - static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline == right->deadline); -+#else - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -341,8 +351,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; -+#endif - } - - static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 976092b7bd45..31d587c16ec1 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -28,7 +28,12 @@ endif - # These compilation units have roughly the same size and complexity - so their - # build parallelizes well and finishes roughly at once: - # -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o -+obj-$(CONFIG_SCHED_DEBUG) += alt_debug.o -+else - obj-y += core.o - obj-y += fair.o -+endif - obj-y += build_policy.o - obj-y += build_utility.o -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c -new file mode 100644 -index 000000000000..4bea0c025475 ---- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,7912 @@ -+/* -+ * kernel/sched/alt_core.c -+ * -+ * Core alternative kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+ -+#define CREATE_TRACE_POINTS -+#include -+#undef CREATE_TRACE_POINTS -+ -+#include "sched.h" -+ -+#include "pelt.h" -+ -+#include "../../io_uring/io-wq.h" -+#include "../smpboot.h" -+ -+/* -+ * Export tracepoints that act as a bare tracehook (ie: have no trace event -+ * associated with them) to allow external modules to probe them. -+ */ -+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); -+ -+#ifdef CONFIG_SCHED_DEBUG -+#define sched_feat(x) (1) -+/* -+ * Print a warning if need_resched is set for the given duration (if -+ * LATENCY_WARN is enabled). -+ * -+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown -+ * per boot. -+ */ -+__read_mostly int sysctl_resched_latency_warn_ms = 100; -+__read_mostly int sysctl_resched_latency_warn_once = 1; -+#else -+#define sched_feat(x) (0) -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+#define ALT_SCHED_VERSION "v6.1-r0" -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 << 20); -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq, int idx); -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds.h" -+#endif -+ -+static int __init sched_timeslice(char *str) -+{ -+ int timeslice_ms; -+ -+ get_option(&str, ×lice_ms); -+ if (2 != timeslice_ms) -+ timeslice_ms = 4; -+ sched_timeslice_ns = timeslice_ms << 20; -+ sched_timeslice_imp(timeslice_ms); -+ -+ return 0; -+} -+early_param("sched_timeslice", sched_timeslice); -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 << 10) -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task. (default) -+ * 2: Set rq skip task. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_topo_end_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+#endif -+static cpumask_t sched_rq_watermark[SCHED_QUEUE_BITS] ____cacheline_aligned_in_smp; -+ -+/* sched_queue related functions */ -+static inline void sched_queue_init(struct sched_queue *q) -+{ -+ int i; -+ -+ bitmap_zero(q->bitmap, SCHED_QUEUE_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+/* -+ * Init idle task and put into queue structure of rq -+ * IMPORTANT: may be called multiple times for a single cpu -+ */ -+static inline void sched_queue_init_idle(struct sched_queue *q, -+ struct task_struct *idle) -+{ -+ idle->sq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->sq_idx]); -+ list_add(&idle->sq_node, &q->heads[idle->sq_idx]); -+} -+ -+/* water mark related functions */ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = last_wm; i > watermark; i--) -+ cpumask_clear_cpu(cpu, sched_rq_watermark + SCHED_QUEUE_BITS - i); -+#ifdef CONFIG_SCHED_SMT -+ if (static_branch_likely(&sched_smt_present) && -+ IDLE_TASK_SCHED_PRIO == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = watermark; i > last_wm; i--) -+ cpumask_set_cpu(cpu, sched_rq_watermark + SCHED_QUEUE_BITS - i); -+#ifdef CONFIG_SCHED_SMT -+ if (static_branch_likely(&sched_smt_present) && -+ IDLE_TASK_SCHED_PRIO == watermark) { -+ cpumask_t tmp; -+ -+ cpumask_and(&tmp, cpu_smt_mask(cpu), sched_rq_watermark); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+ } -+#endif -+} -+ -+/* -+ * This routine assume that the idle task always in queue -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS); -+ const struct list_head *head = &rq->queue.heads[sched_prio2idx(idx, rq)]; -+ -+ return list_first_entry(head, struct task_struct, sq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->sq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->sq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_QUEUE_BITS, -+ sched_idx2prio(idx, rq) + 1); -+ head = &rq->queue.heads[sched_prio2idx(idx, rq)]; -+ -+ return list_first_entry(head, struct task_struct, sq_node); -+ } -+ -+ return list_next_entry(p, sq_node); -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Serialization rules: -+ * -+ * Lock order: -+ * -+ * p->pi_lock -+ * rq->lock -+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) -+ * -+ * rq1->lock -+ * rq2->lock where: rq1 < rq2 -+ * -+ * Regular state: -+ * -+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the -+ * local CPU's rq->lock, it optionally removes the task from the runqueue and -+ * always looks at the local rq data structures to find the most eligible task -+ * to run next. -+ * -+ * Task enqueue is also under rq->lock, possibly taken from another CPU. -+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to -+ * the local CPU to avoid bouncing the runqueue state around [ see -+ * ttwu_queue_wakelist() ] -+ * -+ * Task wakeup, specifically wakeups that involve migration, are horribly -+ * complicated to avoid having to take two rq->locks. -+ * -+ * Special state: -+ * -+ * System-calls and anything external will use task_rq_lock() which acquires -+ * both p->pi_lock and rq->lock. As a consequence the state they change is -+ * stable while holding either lock: -+ * -+ * - sched_setaffinity()/ -+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed -+ * - set_user_nice(): p->se.load, p->*prio -+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, -+ * p->se.load, p->rt_priority, -+ * p->dl.dl_{runtime, deadline, period, flags, bw, density} -+ * - sched_setnuma(): p->numa_preferred_nid -+ * - sched_move_task(): p->sched_task_group -+ * - uclamp_update_active() p->uclamp* -+ * -+ * p->state <- TASK_*: -+ * -+ * is changed locklessly using set_current_state(), __set_current_state() or -+ * set_special_state(), see their respective comments, or by -+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against -+ * concurrent self. -+ * -+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: -+ * -+ * is set by activate_task() and cleared by deactivate_task(), under -+ * rq->lock. Non-zero indicates the task is runnable, the special -+ * ON_RQ_MIGRATING state is used for migration without holding both -+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). -+ * -+ * p->on_cpu <- { 0, 1 }: -+ * -+ * is set by prepare_task() and cleared by finish_task() such that it will be -+ * set before p is scheduled-in and cleared after p is scheduled-out, both -+ * under rq->lock. Non-zero indicates the task is running on its CPU. -+ * -+ * [ The astute reader will observe that it is possible for two tasks on one -+ * CPU to have ->on_cpu = 1 at the same time. ] -+ * -+ * task_cpu(p): is changed by set_task_cpu(), the rules are: -+ * -+ * - Don't call set_task_cpu() on a blocked task: -+ * -+ * We don't care what CPU we're not running on, this simplifies hotplug, -+ * the CPU assignment of blocked tasks isn't required to be valid. -+ * -+ * - for try_to_wake_up(), called under p->pi_lock: -+ * -+ * This allows try_to_wake_up() to only take one rq->lock, see its comment. -+ * -+ * - for migration called under rq->lock: -+ * [ see task_on_rq_migrating() in task_rq_lock() ] -+ * -+ * o move_queued_task() -+ * o detach_task() -+ * -+ * - for migration called under double_rq_lock(): -+ * -+ * o __migrate_swap_task() -+ * o push_rt_task() / pull_rt_task() -+ * o push_dl_task() / pull_dl_task() -+ * o dl_task_offline_migration() -+ * -+ */ -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. -+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+void raw_spin_rq_lock_nested(struct rq *rq, int subclass) -+{ -+ raw_spinlock_t *lock; -+ -+ /* Matches synchronize_rcu() in __sched_core_enable() */ -+ preempt_disable(); -+ -+ for (;;) { -+ lock = __rq_lockp(rq); -+ raw_spin_lock_nested(lock, subclass); -+ if (likely(lock == __rq_lockp(rq))) { -+ /* preempt_count *MUST* be > 1 */ -+ preempt_enable_no_resched(); -+ return; -+ } -+ raw_spin_unlock(lock); -+ } -+} -+ -+void raw_spin_rq_unlock(struct rq *rq) -+{ -+ raw_spin_unlock(rq_lockp(rq)); -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. -+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+ psi_account_irqtime(rq->curr, irq_delta); -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_time_edge(rq); -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * RQ Load update routine -+ */ -+#define RQ_LOAD_HISTORY_BITS (sizeof(s32) * 8ULL) -+#define RQ_UTIL_SHIFT (8) -+#define RQ_LOAD_HISTORY_TO_UTIL(l) (((l) >> (RQ_LOAD_HISTORY_BITS - 1 - RQ_UTIL_SHIFT)) & 0xff) -+ -+#define LOAD_BLOCK(t) ((t) >> 17) -+#define LOAD_HALF_BLOCK(t) ((t) >> 16) -+#define BLOCK_MASK(t) ((t) & ((0x01 << 18) - 1)) -+#define LOAD_BLOCK_BIT(b) (1UL << (RQ_LOAD_HISTORY_BITS - 1 - (b))) -+#define CURRENT_LOAD_BIT LOAD_BLOCK_BIT(0) -+ -+static inline void rq_load_update(struct rq *rq) -+{ -+ u64 time = rq->clock; -+ u64 delta = min(LOAD_BLOCK(time) - LOAD_BLOCK(rq->load_stamp), -+ RQ_LOAD_HISTORY_BITS - 1); -+ u64 prev = !!(rq->load_history & CURRENT_LOAD_BIT); -+ u64 curr = !!rq->nr_running; -+ -+ if (delta) { -+ rq->load_history = rq->load_history >> delta; -+ -+ if (delta < RQ_UTIL_SHIFT) { -+ rq->load_block += (~BLOCK_MASK(rq->load_stamp)) * prev; -+ if (!!LOAD_HALF_BLOCK(rq->load_block) ^ curr) -+ rq->load_history ^= LOAD_BLOCK_BIT(delta); -+ } -+ -+ rq->load_block = BLOCK_MASK(time) * prev; -+ } else { -+ rq->load_block += (time - rq->load_stamp) * prev; -+ } -+ if (prev ^ curr) -+ rq->load_history ^= CURRENT_LOAD_BIT; -+ rq->load_stamp = time; -+} -+ -+unsigned long rq_load_util(struct rq *rq, unsigned long max) -+{ -+ return RQ_LOAD_HISTORY_TO_UTIL(rq->load_history) * (max >> RQ_UTIL_SHIFT); -+} -+ -+#ifdef CONFIG_SMP -+unsigned long sched_cpu_util(int cpu) -+{ -+ return rq_load_util(cpu_rq(cpu), arch_scale_cpu_capacity(cpu)); -+} -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_FREQ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. -+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. -+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+#ifdef CONFIG_SMP -+ rq_load_update(rq); -+#endif -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+#ifdef CONFIG_SMP -+ rq_load_update(rq); -+#endif -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. -+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+bool sched_task_on_rq(struct task_struct *p) -+{ -+ return task_on_rq_queued(p); -+} -+ -+unsigned long get_wchan(struct task_struct *p) -+{ -+ unsigned long ip = 0; -+ unsigned int state; -+ -+ if (!p || p == current) -+ return 0; -+ -+ /* Only get wchan if task is blocked and we can keep it that way. */ -+ raw_spin_lock_irq(&p->pi_lock); -+ state = READ_ONCE(p->__state); -+ smp_rmb(); /* see try_to_wake_up() */ -+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) -+ ip = __get_wchan(p); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ return ip; -+} -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeue(rq, p); \ -+ \ -+ list_del(&p->sq_node); \ -+ if (list_empty(&rq->queue.heads[p->sq_idx])) \ -+ clear_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_enqueue(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->sq_idx = task_sched_prio_idx(p, rq); \ -+ list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \ -+ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); -+ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_DEQUEUE_TASK(p, rq, flags); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq, int idx) -+{ -+ lockdep_assert_held(&rq->lock); -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ list_del(&p->sq_node); -+ list_add_tail(&p->sq_node, &rq->queue.heads[idx]); -+ if (idx != p->sq_idx) { -+ if (list_empty(&rq->queue.heads[p->sq_idx])) -+ clear_bit(sched_idx2prio(p->sq_idx, rq), -+ rq->queue.bitmap); -+ p->sq_idx = idx; -+ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); -+ update_sched_rq_watermark(rq); -+ } -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _val = *_ptr; \ -+ \ -+ do { \ -+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \ -+ _val; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static inline bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) -+ break; -+ } -+ return true; -+} -+ -+#else -+static inline bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static inline bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * it's already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) {} -+ -+void select_nohz_load_balancer(int stop_tick) {} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). -+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ const struct cpumask *hk_mask; -+ -+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); -+ -+ for (mask = per_cpu(sched_cpu_topo_masks, cpu) + 1; -+ mask < per_cpu(sched_cpu_topo_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, hk_mask) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+static inline void wake_up_idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(rq->idle)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+ -+static void nohz_csd_func(void *info) -+{ -+ struct rq *rq = info; -+ int cpu = cpu_of(rq); -+ unsigned int flags; -+ -+ /* -+ * Release the rq::nohz_csd. -+ */ -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ WARN_ON(!(flags & NOHZ_KICK_MASK)); -+ -+ rq->idle_balance = idle_cpu(cpu); -+ if (rq->idle_balance && !need_resched()) { -+ rq->nohz_idle_balance = flags; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ } -+} -+ -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. -+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * Alt schedule FW doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time = rq->hrtick_time; -+ -+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ -+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int __normal_prio(int policy, int rt_prio, int static_prio) -+{ -+ return rt_policy(policy) ? (MAX_RT_PRIO - 1 - rt_prio) : -+ static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the expected normal priority: i.e. priority -+ * without taking RT-inheritance into account. Might be -+ * boosted by interactivity modifiers. Changes upon fork, -+ * setprio syscalls, and whenever the interactivity -+ * estimator recalculates. -+ */ -+static inline int normal_prio(struct task_struct *p) -+{ -+ return __normal_prio(p->policy, p->rt_priority, p->static_prio); -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. -+ */ -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT * p->in_iowait); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+} -+ -+static inline bool is_migration_disabled(struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return p->migration_disabled; -+#else -+ return false; -+#endif -+} -+ -+#define SCA_CHECK 0x01 -+#define SCA_USER 0x08 -+ -+#ifdef CONFIG_SMP -+ -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ unsigned int state = READ_ONCE(p->__state); -+ -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. -+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+ -+ WARN_ON_ONCE(is_migration_disabled(p)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+#define MDF_FORCE_ENABLED 0x80 -+ -+static void -+__do_set_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ /* -+ * This here violates the locking rules for affinity, since we're only -+ * supposed to change these variables while holding both rq->lock and -+ * p->pi_lock. -+ * -+ * HOWEVER, it magically works, because ttwu() is the only code that -+ * accesses these variables under p->pi_lock and only does so after -+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() -+ * before finish_task(). -+ * -+ * XXX do further audits, this smells like something putrid. -+ */ -+ SCHED_WARN_ON(!p->on_cpu); -+ p->cpus_ptr = new_mask; -+} -+ -+void migrate_disable(void) -+{ -+ struct task_struct *p = current; -+ int cpu; -+ -+ if (p->migration_disabled) { -+ p->migration_disabled++; -+ return; -+ } -+ -+ preempt_disable(); -+ cpu = smp_processor_id(); -+ if (cpumask_test_cpu(cpu, &p->cpus_mask)) { -+ cpu_rq(cpu)->nr_pinned++; -+ p->migration_disabled = 1; -+ p->migration_flags &= ~MDF_FORCE_ENABLED; -+ -+ /* -+ * Violates locking rules! see comment in __do_set_cpus_ptr(). -+ */ -+ if (p->cpus_ptr == &p->cpus_mask) -+ __do_set_cpus_ptr(p, cpumask_of(cpu)); -+ } -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(migrate_disable); -+ -+void migrate_enable(void) -+{ -+ struct task_struct *p = current; -+ -+ if (0 == p->migration_disabled) -+ return; -+ -+ if (p->migration_disabled > 1) { -+ p->migration_disabled--; -+ return; -+ } -+ -+ if (WARN_ON_ONCE(!p->migration_disabled)) -+ return; -+ -+ /* -+ * Ensure stop_task runs either before or after this, and that -+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). -+ */ -+ preempt_disable(); -+ /* -+ * Assumption: current should be running on allowed cpu -+ */ -+ WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &p->cpus_mask)); -+ if (p->cpus_ptr != &p->cpus_mask) -+ __do_set_cpus_ptr(p, &p->cpus_mask); -+ /* -+ * Mustn't clear migration_disabled() until cpus_ptr points back at the -+ * regular cpus_mask, otherwise things that race (eg. -+ * select_fallback_rq) get confused. -+ */ -+ barrier(); -+ p->migration_disabled = 0; -+ this_rq()->nr_pinned--; -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(migrate_enable); -+ -+static inline bool rq_has_pinned_tasks(struct rq *rq) -+{ -+ return rq->nr_pinned; -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ /* When not in the task's cpumask, no point in looking further. */ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ /* migrate_disabled() must be allowed to finish. */ -+ if (is_migration_disabled(p)) -+ return cpu_online(cpu); -+ -+ /* Non kernel threads are not allowed during either online or offline. */ -+ if (!(p->flags & PF_KTHREAD)) -+ return cpu_active(cpu) && task_cpu_possible(cpu, p); -+ -+ /* KTHREAD_IS_PER_CPU is always allowed. */ -+ if (kthread_is_per_cpu(p)) -+ return cpu_online(cpu); -+ -+ /* Regular kernel threads don't get to stay during offline. */ -+ if (cpu_dying(cpu)) -+ return false; -+ -+ /* But are allowed during online. */ -+ return cpu_online(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. -+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ update_sched_rq_watermark(rq); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ WARN_ON_ONCE(task_cpu(p) != new_cpu); -+ sched_task_sanity_check(p, rq); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ unsigned long flags; -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_save(flags); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ flush_smp_call_function_queue(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. -+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+static void -+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ lockdep_assert_held(&p->pi_lock); -+ set_cpus_allowed_common(p, new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+} -+ -+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, -+ int node) -+{ -+ if (!src->user_cpus_ptr) -+ return 0; -+ -+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); -+ if (!dst->user_cpus_ptr) -+ return -ENOMEM; -+ -+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); -+ return 0; -+} -+ -+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) -+{ -+ struct cpumask *user_mask = NULL; -+ -+ swap(p->user_cpus_ptr, user_mask); -+ -+ return user_mask; -+} -+ -+void release_user_cpus_ptr(struct task_struct *p) -+{ -+ kfree(clear_user_cpus_ptr(p)); -+} -+ -+#endif -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * Wait for the thread to block in any of the states set in @match_state. -+ * If it changes, i.e. @p might have woken up, then return zero. When we -+ * succeed in waiting for @p to be off its CPU, we return a positive number -+ * (its total switch count). If a second call a short while later returns the -+ * same number, the caller can be sure that @p has remained unscheduled the -+ * whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. -+ */ -+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_on_cpu(p) && p == rq->curr) { -+ if (!(READ_ONCE(p->__state) & match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_on_cpu(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (READ_ONCE(p->__state) & match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. -+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (is_cpu_allowed(p, dest_cpu)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (cpuset_cpus_allowed_fallback(p)) { -+ state = possible; -+ break; -+ } -+ fallthrough; -+ case possible: -+ /* -+ * XXX When called from select_task_rq() we only -+ * hold p->pi_lock and again violate locking order. -+ * -+ * More yuck to audit. -+ */ -+ do_set_cpus_allowed(p, task_cpu_possible_mask(p)); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. -+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_active_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, sched_rq_watermark) || -+ cpumask_and(&tmp, &chk_mask, -+ sched_rq_watermark + SCHED_QUEUE_BITS - 1 - task_sched_prio(p))) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ static struct lock_class_key stop_pi_lock; -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ -+ /* -+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to -+ * adjust the effective priority of a task. As a result, -+ * rt_mutex_setprio() can trigger (RT) balancing operations, -+ * which can then trigger wakeups of the stop thread to push -+ * around the current task. -+ * -+ * The stop task itself will never be part of the PI-chain, it -+ * never blocks, therefore that ->pi_lock recursion is safe. -+ * Tell lockdep about this by placing the stop->pi_lock in its -+ * own class. -+ */ -+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+static int affine_move_task(struct rq *rq, struct task_struct *p, int dest_cpu, -+ raw_spinlock_t *lock, unsigned long irq_flags) -+{ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { -+ if (p->migration_disabled) { -+ if (likely(p->cpus_ptr != &p->cpus_mask)) -+ __do_set_cpus_ptr(p, &p->cpus_mask); -+ p->migration_disabled = 0; -+ p->migration_flags |= MDF_FORCE_ENABLED; -+ /* When p is migrate_disabled, rq->lock should be held */ -+ rq->nr_pinned--; -+ } -+ -+ if (task_on_cpu(p) || READ_ONCE(p->__state) == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. -+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ } -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ return 0; -+} -+ -+static int __set_cpus_allowed_ptr_locked(struct task_struct *p, -+ const struct cpumask *new_mask, -+ u32 flags, -+ struct rq *rq, -+ raw_spinlock_t *lock, -+ unsigned long irq_flags) -+{ -+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ bool kthread = p->flags & PF_KTHREAD; -+ struct cpumask *user_mask = NULL; -+ int dest_cpu; -+ int ret = 0; -+ -+ if (kthread || is_migration_disabled(p)) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs, -+ * however, during cpu-hot-unplug, even these might get pushed -+ * away if not KTHREAD_IS_PER_CPU. -+ * -+ * Specifically, migration_disabled() tasks must not fail the -+ * cpumask_any_and_distribute() pick below, esp. so on -+ * SCA_MIGRATE_ENABLE, otherwise we'll not call -+ * set_cpus_allowed_common() and actually reset p->cpus_ptr. -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (flags & SCA_USER) -+ user_mask = clear_user_cpus_ptr(p); -+ -+ ret = affine_move_task(rq, p, dest_cpu, lock, irq_flags); -+ -+ kfree(user_mask); -+ -+ return ret; -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ -+ return ret; -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, u32 flags) -+{ -+ unsigned long irq_flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags); -+ rq = __task_access_lock(p, &lock); -+ -+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, lock, irq_flags); -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, 0); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+/* -+ * Change a given task's CPU affinity to the intersection of its current -+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask -+ * and pointing @p->user_cpus_ptr to a copy of the old mask. -+ * If the resulting mask is empty, leave the affinity unchanged and return -+ * -EINVAL. -+ */ -+static int restrict_cpus_allowed_ptr(struct task_struct *p, -+ struct cpumask *new_mask, -+ const struct cpumask *subset_mask) -+{ -+ struct cpumask *user_mask = NULL; -+ unsigned long irq_flags; -+ raw_spinlock_t *lock; -+ struct rq *rq; -+ int err; -+ -+ if (!p->user_cpus_ptr) { -+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL); -+ if (!user_mask) -+ return -ENOMEM; -+ } -+ -+ raw_spin_lock_irqsave(&p->pi_lock, irq_flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { -+ err = -EINVAL; -+ goto err_unlock; -+ } -+ -+ /* -+ * We're about to butcher the task affinity, so keep track of what -+ * the user asked for in case we're able to restore it later on. -+ */ -+ if (user_mask) { -+ cpumask_copy(user_mask, p->cpus_ptr); -+ p->user_cpus_ptr = user_mask; -+ } -+ -+ /*return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);*/ -+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, lock, irq_flags); -+ -+err_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); -+ kfree(user_mask); -+ return err; -+} -+ -+/* -+ * Restrict the CPU affinity of task @p so that it is a subset of -+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the -+ * old affinity mask. If the resulting mask is empty, we warn and walk -+ * up the cpuset hierarchy until we find a suitable mask. -+ */ -+void force_compatible_cpus_allowed_ptr(struct task_struct *p) -+{ -+ cpumask_var_t new_mask; -+ const struct cpumask *override_mask = task_cpu_possible_mask(p); -+ -+ alloc_cpumask_var(&new_mask, GFP_KERNEL); -+ -+ /* -+ * __migrate_task() can fail silently in the face of concurrent -+ * offlining of the chosen destination CPU, so take the hotplug -+ * lock to ensure that the migration succeeds. -+ */ -+ cpus_read_lock(); -+ if (!cpumask_available(new_mask)) -+ goto out_set_mask; -+ -+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) -+ goto out_free_mask; -+ -+ /* -+ * We failed to find a valid subset of the affinity mask for the -+ * task, so override it based on its cpuset hierarchy. -+ */ -+ cpuset_cpus_allowed(p, new_mask); -+ override_mask = new_mask; -+ -+out_set_mask: -+ if (printk_ratelimit()) { -+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", -+ task_pid_nr(p), p->comm, -+ cpumask_pr_args(override_mask)); -+ } -+ -+ WARN_ON(set_cpus_allowed_ptr(p, override_mask)); -+out_free_mask: -+ cpus_read_unlock(); -+ free_cpumask_var(new_mask); -+} -+ -+static int -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); -+ -+/* -+ * Restore the affinity of a task @p which was previously restricted by a -+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) -+ * @p->user_cpus_ptr. -+ * -+ * It is the caller's responsibility to serialise this with any calls to -+ * force_compatible_cpus_allowed_ptr(@p). -+ */ -+void relax_compatible_cpus_allowed_ptr(struct task_struct *p) -+{ -+ struct cpumask *user_mask = p->user_cpus_ptr; -+ unsigned long flags; -+ -+ /* -+ * Try to restore the old affinity mask. If this fails, then -+ * we free the mask explicitly to avoid it being inherited across -+ * a subsequent fork(). -+ */ -+ if (!user_mask || !__sched_setaffinity(p, user_mask)) -+ return; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ user_mask = clear_user_cpus_ptr(p); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ kfree(user_mask); -+} -+ -+#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, u32 flags) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ -+static inline bool rq_has_pinned_tasks(struct rq *rq) -+{ -+ return false; -+} -+ -+#endif /* !CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) { -+ __schedstat_inc(rq->ttwu_local); -+ __schedstat_inc(p->stats.nr_wakeups_local); -+ } else { -+ /** Alt schedule FW ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+ __schedstat_inc(p->stats.nr_wakeups); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ check_preempt_curr(rq); -+ WRITE_ONCE(p->__state, TASK_RUNNING); -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ if ( -+#ifdef CONFIG_SMP -+ !(wake_flags & WF_MIGRATED) && -+#endif -+ p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+/* -+ * Consider @p being inside a wait loop: -+ * -+ * for (;;) { -+ * set_current_state(TASK_UNINTERRUPTIBLE); -+ * -+ * if (CONDITION) -+ * break; -+ * -+ * schedule(); -+ * } -+ * __set_current_state(TASK_RUNNING); -+ * -+ * between set_current_state() and schedule(). In this case @p is still -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in -+ * an atomic manner. -+ * -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq -+ * then schedule() must still happen and p->state can be changed to -+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we -+ * need to do a full wakeup with enqueue. -+ * -+ * Returns: %true when the wakeup is done, -+ * %false otherwise. -+ */ -+static int ttwu_runnable(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ /* check_preempt_curr() may use rq clock */ -+ update_rq_clock(rq); -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) -+{ -+ /* -+ * Do not complicate things with the async wake_list while the CPU is -+ * in hotplug state. -+ */ -+ if (!cpu_active(cpu)) -+ return false; -+ -+ /* Ensure the task will still be allowed to run on the CPU. */ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. -+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ if (cpu == smp_processor_id()) -+ return false; -+ -+ /* -+ * If the wakee cpu is idle, or the task is descheduling and the -+ * only running task on the CPU, then use the wakelist to offload -+ * the task activation to the idle (or soon-to-be-idle) CPU as -+ * the current CPU is likely busy. nr_running is checked to -+ * avoid unnecessary task stacking. -+ * -+ * Note that we can only get here with (wakee) p->on_rq=0, -+ * p->on_cpu can be whatever, we've done the dequeue, so -+ * the wakee has been accounted out of ->nr_running. -+ */ -+ if (!cpu_rq(cpu)->nr_running) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ resched_curr(rq); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ if (this_cpu == that_cpu) -+ return true; -+ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#else /* !CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Invoked from try_to_wake_up() to check whether the task can be woken up. -+ * -+ * The caller holds p::pi_lock if p != current or has preemption -+ * disabled when p == current. -+ * -+ * The rules of PREEMPT_RT saved_state: -+ * -+ * The related locking code always holds p::pi_lock when updating -+ * p::saved_state, which means the code is fully serialized in both cases. -+ * -+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other -+ * bits set. This allows to distinguish all wakeup scenarios. -+ */ -+static __always_inline -+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) -+{ -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { -+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && -+ state != TASK_RTLOCK_WAIT); -+ } -+ -+ if (READ_ONCE(p->__state) & state) { -+ *success = 1; -+ return true; -+ } -+ -+#ifdef CONFIG_PREEMPT_RT -+ /* -+ * Saved state preserves the task state across blocking on -+ * an RT lock. If the state matches, set p::saved_state to -+ * TASK_RUNNING, but do not wake the task because it waits -+ * for a lock wakeup. Also indicate success because from -+ * the regular waker's point of view this has succeeded. -+ * -+ * After acquiring the lock the task will restore p::__state -+ * from p::saved_state which ensures that the regular -+ * wakeup is not lost. The restore will also set -+ * p::saved_state to TASK_RUNNING so any further tests will -+ * not result in false positives vs. @success -+ */ -+ if (p->saved_state & state) { -+ p->saved_state = TASK_RUNNING; -+ *success = 1; -+ } -+#endif -+ return false; -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task() -+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Conceptually does: -+ * -+ * If (@state & @p->state) @p->state = TASK_RUNNING. -+ * -+ * If the task was not queued/runnable, also place it back on a runqueue. -+ * -+ * This function is atomic against schedule() which would dequeue the task. -+ * -+ * It issues a full memory barrier before accessing @p->state, see the comment -+ * with set_current_state(). -+ * -+ * Uses p->pi_lock to serialize against concurrent wake-ups. -+ * -+ * Relies on p->pi_lock stabilizing: -+ * - p->sched_class -+ * - p->cpus_ptr -+ * - p->sched_task_group -+ * in order to do migration, see its use of select_task_rq()/set_task_cpu(). -+ * -+ * Tries really hard to only take one task_rq(p)->lock for performance. -+ * Takes rq->lock in: -+ * - ttwu_runnable() -- old rq, unavoidable, see comment there; -+ * - ttwu_queue() -- new rq, for enqueue of the task; -+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. -+ * -+ * As a consequence we race really badly with just about everything. See the -+ * many memory barriers and their comments for details. -+ * -+ * Return: %true if @p->state changes (an actual wakeup was done), -+ * %false otherwise. -+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!ttwu_state_match(p, state, &success)) -+ goto out; -+ -+ trace_sched_waking(p); -+ WRITE_ONCE(p->__state, TASK_RUNNING); -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with smp_store_mb() -+ * in set_current_state() that the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!ttwu_state_match(p, state, &success)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). -+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ WRITE_ONCE(p->__state, TASK_WAKING); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. -+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until it's done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ sched_task_ttwu(p); -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+static bool __task_needs_rq_lock(struct task_struct *p) -+{ -+ unsigned int state = READ_ONCE(p->__state); -+ -+ /* -+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when -+ * the task is blocked. Make sure to check @state since ttwu() can drop -+ * locks at the end, see ttwu_queue_wakelist(). -+ */ -+ if (state == TASK_RUNNING || state == TASK_WAKING) -+ return true; -+ -+ /* -+ * Ensure we load p->on_rq after p->__state, otherwise it would be -+ * possible to, falsely, observe p->on_rq == 0. -+ * -+ * See try_to_wake_up() for a longer comment. -+ */ -+ smp_rmb(); -+ if (p->on_rq) -+ return true; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure the task has finished __schedule() and will not be referenced -+ * anymore. Again, see try_to_wake_up() for a longer comment. -+ */ -+ smp_rmb(); -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+#endif -+ -+ return false; -+} -+ -+/** -+ * task_call_func - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked, can be @current. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * Fix the task in it's current state by avoiding wakeups and or rq operations -+ * and call @func(@arg) on it. This function can use ->on_rq and task_curr() -+ * to work out what the state is, if required. Given that @func can be invoked -+ * with a runqueue lock held, it had better be quite lightweight. -+ * -+ * Returns: -+ * Whatever @func returns -+ */ -+int task_call_func(struct task_struct *p, task_call_f func, void *arg) -+{ -+ struct rq *rq = NULL; -+ struct rq_flags rf; -+ int ret; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags); -+ -+ if (__task_needs_rq_lock(p)) -+ rq = __task_rq_lock(p, &rf); -+ -+ /* -+ * At this point the task is pinned; either: -+ * - blocked and we're holding off wakeups (pi->lock) -+ * - woken, and we're holding off enqueue (rq->lock) -+ * - queued, and we're holding off schedule (rq->lock) -+ * - running, and we're holding off de-schedule (rq->lock) -+ * -+ * The called function (@func) can use: task_curr(), p->on_rq and -+ * p->__state to differentiate between these states. -+ */ -+ ret = func(p, arg); -+ -+ if (rq) -+ __task_rq_unlock(rq, &rf); -+ -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); -+ return ret; -+} -+ -+/** -+ * cpu_curr_snapshot - Return a snapshot of the currently running task -+ * @cpu: The CPU on which to snapshot the task. -+ * -+ * Returns the task_struct pointer of the task "currently" running on -+ * the specified CPU. If the same task is running on that CPU throughout, -+ * the return value will be a pointer to that task's task_struct structure. -+ * If the CPU did any context switches even vaguely concurrently with the -+ * execution of this function, the return value will be a pointer to the -+ * task_struct structure of a randomly chosen task that was running on -+ * that CPU somewhere around the time that this function was executing. -+ * -+ * If the specified CPU was offline, the return value is whatever it -+ * is, perhaps a pointer to the task_struct structure of that CPU's idle -+ * task, but there is no guarantee. Callers wishing a useful return -+ * value must take some action to ensure that the specified CPU remains -+ * online throughout. -+ * -+ * This function executes full memory barriers before and after fetching -+ * the pointer, which permits the caller to confine this function's fetch -+ * with respect to the caller's accesses to other shared variables. -+ */ -+struct task_struct *cpu_curr_snapshot(int cpu) -+{ -+ struct task_struct *t; -+ -+ smp_mb(); /* Pairing determined by caller's synchronization design. */ -+ t = rcu_dereference(cpu_curr(cpu)); -+ smp_mb(); /* Pairing determined by caller's synchronization design. */ -+ return t; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ * -+ * __sched_fork() is basic setup used by init_idle() too: -+ */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; -+ -+#ifdef CONFIG_SCHEDSTATS -+ /* Even if schedstat is disabled, there should not be garbage */ -+ memset(&p->stats, 0, sizeof(p->stats)); -+#endif -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+} -+ -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ __sched_fork(clone_flags, p); -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->__state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = p->static_prio; -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ /* -+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly -+ * required yet, but lockdep gets upset if rules are violated. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. -+ */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ resched_curr(rq); -+ } -+ sched_task_fork(p, rq); -+ raw_spin_unlock(&rq->lock); -+ -+ rseq_migrate(p); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, smp_processor_id()); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+void sched_post_fork(struct task_struct *p) -+{ -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ if (!strcmp(str, "enable")) { -+ set_schedstats(true); -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ set_schedstats(false); -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+#ifdef CONFIG_PROC_SYSCTL -+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, -+ size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+ -+static struct ctl_table sched_core_sysctls[] = { -+ { -+ .procname = "sched_schedstats", -+ .data = NULL, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = sysctl_schedstats, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+ {} -+}; -+static int __init sched_core_sysctl_init(void) -+{ -+ register_sysctl_init("kernel", sched_core_sysctls); -+ return 0; -+} -+late_initcall(sched_core_sysctl_init); -+#endif /* CONFIG_PROC_SYSCTL */ -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ WRITE_ONCE(p->__state, TASK_RUNNING); -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ rseq_migrate(p); -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. -+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. -+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ * -+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and -+ * its ordering comment. -+ */ -+ WRITE_ONCE(next->on_cpu, 1); -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * This must be the very last reference to @prev from this CPU. After -+ * p->on_cpu is cleared, the task can be moved to a different CPU. We -+ * must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) -+{ -+ void (*func)(struct rq *rq); -+ struct balance_callback *next; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ while (head) { -+ func = (void (*)(struct rq *))head->func; -+ next = head->next; -+ head->next = NULL; -+ head = next; -+ -+ func(rq); -+ } -+} -+ -+static void balance_push(struct rq *rq); -+ -+/* -+ * balance_push_callback is a right abuse of the callback interface and plays -+ * by significantly different rules. -+ * -+ * Where the normal balance_callback's purpose is to be ran in the same context -+ * that queued it (only later, when it's safe to drop rq->lock again), -+ * balance_push_callback is specifically targeted at __schedule(). -+ * -+ * This abuse is tolerated because it places all the unlikely/odd cases behind -+ * a single test, namely: rq->balance_callback == NULL. -+ */ -+struct balance_callback balance_push_callback = { -+ .next = NULL, -+ .func = balance_push, -+}; -+ -+static inline struct balance_callback * -+__splice_balance_callbacks(struct rq *rq, bool split) -+{ -+ struct balance_callback *head = rq->balance_callback; -+ -+ if (likely(!head)) -+ return NULL; -+ -+ lockdep_assert_rq_held(rq); -+ /* -+ * Must not take balance_push_callback off the list when -+ * splice_balance_callbacks() and balance_callbacks() are not -+ * in the same rq->lock section. -+ * -+ * In that case it would be possible for __schedule() to interleave -+ * and observe the list empty. -+ */ -+ if (split && head == &balance_push_callback) -+ head = NULL; -+ else -+ rq->balance_callback = NULL; -+ -+ return head; -+} -+ -+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -+{ -+ return __splice_balance_callbacks(rq, true); -+} -+ -+static void __balance_callbacks(struct rq *rq) -+{ -+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); -+} -+ -+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -+{ -+ unsigned long flags; -+ -+ if (unlikely(head)) { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ do_balance_callbacks(rq, head); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+} -+ -+#else -+ -+static inline void __balance_callbacks(struct rq *rq) -+{ -+} -+ -+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -+{ -+ return NULL; -+} -+ -+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -+{ -+} -+ -+#endif -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ __balance_callbacks(rq); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/* -+ * NOP if the arch has not defined these: -+ */ -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+ -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+static inline void kmap_local_sched_out(void) -+{ -+#ifdef CONFIG_KMAP_LOCAL -+ if (unlikely(current->kmap_ctrl.idx)) -+ __kmap_local_sched_out(); -+#endif -+} -+ -+static inline void kmap_local_sched_in(void) -+{ -+#ifdef CONFIG_KMAP_LOCAL -+ if (unlikely(current->kmap_ctrl.idx)) -+ __kmap_local_sched_in(); -+#endif -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ kmap_local_sched_out(); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ unsigned int prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. -+ */ -+ prev_state = READ_ONCE(prev->__state); -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ tick_nohz_task_switch(); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ /* -+ * kmap_local_sched_out() is invoked with rq::lock held and -+ * interrupts disabled. There is no requirement for that, but the -+ * sched out code does not have an interrupt enabled section. -+ * Restoring the maps on sched in does not require interrupts being -+ * disabled either. -+ */ -+ kmap_local_sched_in(); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop_sched(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ lru_gen_use_mm(next->mm); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned int nr_running(void) -+{ -+ unsigned int i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned int nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how it's mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. -+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned int nr_iowait(void) -+{ -+ unsigned int i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. -+ */ -+void sched_exec(void) -+{ -+} -+ -+#endif -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ cgroup_account_cputime(p, ns); -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. -+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+static u64 cpu_resched_latency(struct rq *rq) -+{ -+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); -+ u64 resched_latency, now = rq_clock(rq); -+ static bool warned_once; -+ -+ if (sysctl_resched_latency_warn_once && warned_once) -+ return 0; -+ -+ if (!need_resched() || !latency_warn_ms) -+ return 0; -+ -+ if (system_state == SYSTEM_BOOTING) -+ return 0; -+ -+ if (!rq->last_seen_need_resched_ns) { -+ rq->last_seen_need_resched_ns = now; -+ rq->ticks_without_resched = 0; -+ return 0; -+ } -+ -+ rq->ticks_without_resched++; -+ resched_latency = now - rq->last_seen_need_resched_ns; -+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) -+ return 0; -+ -+ warned_once = true; -+ -+ return resched_latency; -+} -+ -+static int __init setup_resched_latency_warn_ms(char *str) -+{ -+ long val; -+ -+ if ((kstrtol(str, 0, &val))) { -+ pr_warn("Unable to set resched_latency_warn_ms\n"); -+ return 1; -+ } -+ -+ sysctl_resched_latency_warn_ms = val; -+ return 1; -+} -+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -+#else -+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ u64 resched_latency; -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ if (sched_feat(LATENCY_WARN)) -+ resched_latency = cpu_resched_latency(rq); -+ calc_global_load_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ if (sched_feat(LATENCY_WARN) && resched_latency) -+ resched_latency_warn(cpu, resched_latency); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int sg_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask) && -+ !is_migration_disabled(p)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(&tmp, per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq= cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ !is_migration_disabled(curr) && (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, sg_balance_cpu_stop, curr, -+ &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance - slibing group balance check for run queue @rq -+ */ -+static inline void sg_balance(struct rq *rq) -+{ -+ cpumask_t chk; -+ int cpu = cpu_of(rq); -+ -+ /* exit when cpu is offline */ -+ if (unlikely(!rq->online)) -+ return; -+ -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, sched_rq_watermark) && -+ cpumask_andnot(&chk, &chk, &sched_rq_pending_mask)) { -+ int i; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk) && -+ sg_balance_trigger(i)) -+ return; -+ } -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ scheduler_task_tick(rq); -+ -+ calc_load_nohz_remote(rq); -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. -+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ SCHED_WARN_ON(ct_state() == CONTEXT_USER); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * Compile time debug macro -+ * #define ALT_SCHED_DEBUG -+ */ -+ -+#ifdef ALT_SCHED_DEBUG -+void alt_sched_debug(void) -+{ -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", -+ sched_rq_pending_mask.bits[0], -+ sched_rq_watermark[0].bits[0], -+ sched_sg_idle_mask.bits[0]); -+} -+#else -+inline void alt_sched_debug(void) {} -+#endif -+ -+#ifdef CONFIG_SMP -+ -+#ifdef CONFIG_PREEMPT_RT -+#define SCHED_NR_MIGRATE_BREAK 8 -+#else -+#define SCHED_NR_MIGRATE_BREAK 32 -+#endif -+ -+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; -+ -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, sysctl_sched_nr_migrate); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0); -+ set_task_cpu(p, dest_cpu); -+ sched_task_sanity_check(p, dest_rq); -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *topo_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ topo_mask = per_cpu(sched_cpu_topo_masks, cpu) + 1; -+ end_mask = per_cpu(sched_cpu_topo_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, topo_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+ -+ rq->nr_running += nr_migrated; -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+ -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; -+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++topo_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. -+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) -+ time_slice_expired(p, rq); -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } -+ -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, -+ * next);*/ -+ return next; -+} -+ -+/* -+ * Constants for the sched_mode argument of __schedule(). -+ * -+ * The mode argument allows RT enabled kernels to differentiate a -+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that -+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to -+ * optimize the AND operation out and just check for zero. -+ */ -+#define SM_NONE 0x0 -+#define SM_PREEMPT 0x1 -+#define SM_RTLOCK_WAIT 0x2 -+ -+#ifndef CONFIG_PREEMPT_RT -+# define SM_MASK_PREEMPT (~0U) -+#else -+# define SM_MASK_PREEMPT SM_PREEMPT -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(unsigned int sched_mode) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ struct rq *rq; -+ int cpu; -+ int deactivated = 0; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, !!sched_mode); -+ -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(!!sched_mode); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that we form a control dependency vs deactivate_task() below. -+ */ -+ prev_state = READ_ONCE(prev->__state); -+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { -+ if (signal_pending_state(prev_state, prev)) { -+ WRITE_ONCE(prev->__state, TASK_RUNNING); -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & TASK_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ sched_task_deactivate(prev, rq); -+ deactivate_task(prev, rq); -+ deactivated = 1; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+#ifdef CONFIG_SCHED_DEBUG -+ rq->last_seen_need_resched_ns = 0; -+#endif -+ -+ if (likely(prev != next)) { -+ if (deactivated) -+ update_sched_rq_watermark(rq); -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; -+ -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+ } else { -+ __balance_callbacks(rq); -+ raw_spin_unlock_irq(&rq->lock); -+ } -+ -+#ifdef CONFIG_SCHED_SMT -+ sg_balance(rq); -+#endif -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ -+ __schedule(SM_NONE); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ unsigned int task_flags; -+ -+ if (task_is_running(tsk)) -+ return; -+ -+ task_flags = tsk->flags; -+ /* -+ * If a worker goes to sleep, notify and ask workqueue whether it -+ * wants to wake up a task to maintain concurrency. -+ */ -+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (task_flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ } -+ -+ /* -+ * spinlock and rwlock must not flush block requests. This will -+ * deadlock if the callback attempts to acquire a lock which is -+ * already acquired. -+ */ -+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. -+ */ -+ blk_flush_plug(tsk->plug, true); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(SM_NONE); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->__state); -+ do { -+ __schedule(SM_NONE); -+ } while (need_resched()); -+} -+ -+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+#ifdef CONFIG_PREEMPT_RT -+void __sched notrace schedule_rtlock(void) -+{ -+ do { -+ preempt_disable(); -+ __schedule(SM_RTLOCK_WAIT); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+} -+NOKPROBE_SYMBOL(schedule_rtlock); -+#endif -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(SM_PREEMPT); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. -+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#ifndef preempt_schedule_dynamic_enabled -+#define preempt_schedule_dynamic_enabled preempt_schedule -+#define preempt_schedule_dynamic_disabled NULL -+#endif -+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); -+EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); -+void __sched notrace dynamic_preempt_schedule(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) -+ return; -+ preempt_schedule(); -+} -+NOKPROBE_SYMBOL(dynamic_preempt_schedule); -+EXPORT_SYMBOL(dynamic_preempt_schedule); -+#endif -+#endif -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. -+ */ -+ prev_ctx = exception_enter(); -+ __schedule(SM_PREEMPT); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#ifndef preempt_schedule_notrace_dynamic_enabled -+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -+#define preempt_schedule_notrace_dynamic_disabled NULL -+#endif -+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); -+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); -+void __sched notrace dynamic_preempt_schedule_notrace(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) -+ return; -+ preempt_schedule_notrace(); -+} -+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); -+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -+#endif -+#endif -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(SM_PREEMPT); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct task_struct *p, struct rq *rq) -+{ -+ int idx; -+ -+ /* Trigger resched if task sched_prio has been modified. */ -+ if (task_on_rq_queued(p) && (idx = task_sched_prio_idx(p, rq)) != p->sq_idx) { -+ requeue_task(p, rq, idx); -+ check_preempt_curr(rq); -+ } -+} -+ -+static void __setscheduler_prio(struct task_struct *p, int prio) -+{ -+ p->prio = prio; -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guarantees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ -+ __setscheduler_prio(p, prio); -+ -+ check_task_changed(p, rq); -+out_unlock: -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ -+ __balance_callbacks(rq); -+ __task_access_unlock(p, lock); -+ -+ preempt_enable(); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it won't have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ -+ check_task_changed(p, rq); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * is_nice_reduction - check if nice value is an actual reduction -+ * -+ * Similar to can_nice() but does not perform a capability check. -+ * -+ * @p: task -+ * @nice: nice value -+ */ -+static bool is_nice_reduction(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); -+} -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * -+ * sched policy return value kernel prio user prio/nice -+ * -+ * (BMQ)normal, batch, idle[0 ... 53] [100 ... 139] 0/[-20 ... 19]/[-7 ... 7] -+ * (PDS)normal, batch, idle[0 ... 39] 100 0/[-20 ... 19] -+ * fifo, rr [-1 ... -100] [99 ... 0] [0 ... 99] -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO) ? p->prio - MAX_RT_PRIO : -+ task_sched_prio_normal(p, task_rq(p)); -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+/* -+ * Allow unprivileged RT tasks to decrease priority. -+ * Only issue a capable test if needed and only once to avoid an audit -+ * event on permitted non-privileged operations: -+ */ -+static int user_check_sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ int policy, int reset_on_fork) -+{ -+ if (rt_policy(policy)) { -+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy: */ -+ if (policy != p->policy && !rlim_rtprio) -+ goto req_priv; -+ -+ /* Can't increase priority: */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ goto req_priv; -+ } -+ -+ /* Can't change other user's priorities: */ -+ if (!check_same_owner(p)) -+ goto req_priv; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag: */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ goto req_priv; -+ -+ return 0; -+ -+req_priv: -+ if (!capable(CAP_SYS_NICE)) -+ return -EPERM; -+ -+ return 0; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int oldpolicy = -1, policy = attr->sched_policy; -+ int retval, newprio; -+ struct balance_callback *head; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. -+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ if (user) { -+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); -+ if (retval) -+ return retval; -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. -+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ newprio = __normal_prio(policy, attr->sched_priority, NICE_TO_PRIO(attr->sched_nice)); -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. -+ */ -+ newprio = rt_effective_prio(p, newprio); -+ } -+ -+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { -+ __setscheduler_params(p, attr); -+ __setscheduler_prio(p, newprio); -+ } -+ -+ check_task_changed(p, rq); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ head = splice_balance_callbacks(rq); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ /* Run balance callbacks after we've adjusted the PI chain: */ -+ balance_callbacks(rq, head); -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Use sched_set_fifo(), read its comment. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. -+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr_nocheck); -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+ -+/* -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -+ * incapable of resource management, which is the one thing an OS really should -+ * be doing. -+ * -+ * This is of course the reason it is limited to privileged users only. -+ * -+ * Worse still; it is fundamentally impossible to compose static priority -+ * workloads. You cannot take two correctly working static prio workloads -+ * and smash them together and still expect them to work. -+ * -+ * For this reason 'all' FIFO tasks the kernel creates are basically at: -+ * -+ * MAX_RT_PRIO / 2 -+ * -+ * The administrator _MUST_ configure the system, the kernel simply doesn't -+ * know enough information to make a sensible choice. -+ */ -+void sched_set_fifo(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo); -+ -+/* -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. -+ */ -+void sched_set_fifo_low(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = 1 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo_low); -+ -+void sched_set_normal(struct task_struct *p, int nice) -+{ -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ .sched_nice = nice, -+ }; -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_normal); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). -+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setattr(p, &attr); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. -+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. -+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ kattr.sched_flags &= SCHED_FLAG_ALL; -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+static int -+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) -+{ -+ int retval; -+ cpumask_var_t cpus_allowed, new_mask; -+ -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); -+ if (retval) -+ goto out_free_new_mask; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ -+out_free_new_mask: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ retval = -EPERM; -+ goto out_put_task; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_put_task; -+ -+ retval = __sched_setaffinity(p, in_mask); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) -+ do_sched_yield_type_1(current, rq); -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ preempt_disable(); -+ raw_spin_unlock_irq(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. If there are no -+ * other threads running on this CPU then this function will return. -+ * -+ * Return: 0. -+ */ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) -+int __sched __cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ /* -+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick -+ * whether the current CPU is in an RCU read-side critical section, -+ * so the tick can report quiescent states even for CPUs looping -+ * in kernel context. In contrast, in non-preemptible kernels, -+ * RCU readers leave no in-memory hints, which means that CPU-bound -+ * processes executing in kernel context might never report an -+ * RCU quiescent state. Therefore, the following code causes -+ * cond_resched() to report a quiescent state, but only when RCU -+ * is in urgent need of one. -+ */ -+#ifndef CONFIG_PREEMPT_RCU -+ rcu_all_qs(); -+#endif -+ return 0; -+} -+EXPORT_SYMBOL(__cond_resched); -+#endif -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#define cond_resched_dynamic_enabled __cond_resched -+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) -+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); -+EXPORT_STATIC_CALL_TRAMP(cond_resched); -+ -+#define might_resched_dynamic_enabled __cond_resched -+#define might_resched_dynamic_disabled ((void *)&__static_call_return0) -+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); -+EXPORT_STATIC_CALL_TRAMP(might_resched); -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); -+int __sched dynamic_cond_resched(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_cond_resched)) -+ return 0; -+ return __cond_resched(); -+} -+EXPORT_SYMBOL(dynamic_cond_resched); -+ -+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); -+int __sched dynamic_might_resched(void) -+{ -+ if (!static_branch_unlikely(&sk_dynamic_might_resched)) -+ return 0; -+ return __cond_resched(); -+} -+EXPORT_SYMBOL(dynamic_might_resched); -+#endif -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (!_cond_resched()) -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+int __cond_resched_rwlock_read(rwlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held_read(lock); -+ -+ if (rwlock_needbreak(lock) || resched) { -+ read_unlock(lock); -+ if (!_cond_resched()) -+ cpu_relax(); -+ ret = 1; -+ read_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_rwlock_read); -+ -+int __cond_resched_rwlock_write(rwlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held_write(lock); -+ -+ if (rwlock_needbreak(lock) || resched) { -+ write_unlock(lock); -+ if (!_cond_resched()) -+ cpu_relax(); -+ ret = 1; -+ write_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_rwlock_write); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+ -+#ifdef CONFIG_GENERIC_ENTRY -+#include -+#endif -+ -+/* -+ * SC:cond_resched -+ * SC:might_resched -+ * SC:preempt_schedule -+ * SC:preempt_schedule_notrace -+ * SC:irqentry_exit_cond_resched -+ * -+ * -+ * NONE: -+ * cond_resched <- __cond_resched -+ * might_resched <- RET0 -+ * preempt_schedule <- NOP -+ * preempt_schedule_notrace <- NOP -+ * irqentry_exit_cond_resched <- NOP -+ * -+ * VOLUNTARY: -+ * cond_resched <- __cond_resched -+ * might_resched <- __cond_resched -+ * preempt_schedule <- NOP -+ * preempt_schedule_notrace <- NOP -+ * irqentry_exit_cond_resched <- NOP -+ * -+ * FULL: -+ * cond_resched <- RET0 -+ * might_resched <- RET0 -+ * preempt_schedule <- preempt_schedule -+ * preempt_schedule_notrace <- preempt_schedule_notrace -+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched -+ */ -+ -+enum { -+ preempt_dynamic_undefined = -1, -+ preempt_dynamic_none, -+ preempt_dynamic_voluntary, -+ preempt_dynamic_full, -+}; -+ -+int preempt_dynamic_mode = preempt_dynamic_undefined; -+ -+int sched_dynamic_mode(const char *str) -+{ -+ if (!strcmp(str, "none")) -+ return preempt_dynamic_none; -+ -+ if (!strcmp(str, "voluntary")) -+ return preempt_dynamic_voluntary; -+ -+ if (!strcmp(str, "full")) -+ return preempt_dynamic_full; -+ -+ return -EINVAL; -+} -+ -+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) -+#else -+#error "Unsupported PREEMPT_DYNAMIC mechanism" -+#endif -+ -+void sched_dynamic_update(int mode) -+{ -+ /* -+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in -+ * the ZERO state, which is invalid. -+ */ -+ preempt_dynamic_enable(cond_resched); -+ preempt_dynamic_enable(might_resched); -+ preempt_dynamic_enable(preempt_schedule); -+ preempt_dynamic_enable(preempt_schedule_notrace); -+ preempt_dynamic_enable(irqentry_exit_cond_resched); -+ -+ switch (mode) { -+ case preempt_dynamic_none: -+ preempt_dynamic_enable(cond_resched); -+ preempt_dynamic_disable(might_resched); -+ preempt_dynamic_disable(preempt_schedule); -+ preempt_dynamic_disable(preempt_schedule_notrace); -+ preempt_dynamic_disable(irqentry_exit_cond_resched); -+ pr_info("Dynamic Preempt: none\n"); -+ break; -+ -+ case preempt_dynamic_voluntary: -+ preempt_dynamic_enable(cond_resched); -+ preempt_dynamic_enable(might_resched); -+ preempt_dynamic_disable(preempt_schedule); -+ preempt_dynamic_disable(preempt_schedule_notrace); -+ preempt_dynamic_disable(irqentry_exit_cond_resched); -+ pr_info("Dynamic Preempt: voluntary\n"); -+ break; -+ -+ case preempt_dynamic_full: -+ preempt_dynamic_disable(cond_resched); -+ preempt_dynamic_disable(might_resched); -+ preempt_dynamic_enable(preempt_schedule); -+ preempt_dynamic_enable(preempt_schedule_notrace); -+ preempt_dynamic_enable(irqentry_exit_cond_resched); -+ pr_info("Dynamic Preempt: full\n"); -+ break; -+ } -+ -+ preempt_dynamic_mode = mode; -+} -+ -+static int __init setup_preempt_mode(char *str) -+{ -+ int mode = sched_dynamic_mode(str); -+ if (mode < 0) { -+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); -+ return 0; -+ } -+ -+ sched_dynamic_update(mode); -+ return 1; -+} -+__setup("preempt=", setup_preempt_mode); -+ -+static void __init preempt_dynamic_init(void) -+{ -+ if (preempt_dynamic_mode == preempt_dynamic_undefined) { -+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { -+ sched_dynamic_update(preempt_dynamic_none); -+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { -+ sched_dynamic_update(preempt_dynamic_voluntary); -+ } else { -+ /* Default static call setting, nothing to do */ -+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); -+ preempt_dynamic_mode = preempt_dynamic_full; -+ pr_info("Dynamic Preempt: full\n"); -+ } -+ } -+} -+ -+#define PREEMPT_MODEL_ACCESSOR(mode) \ -+ bool preempt_model_##mode(void) \ -+ { \ -+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ -+ return preempt_dynamic_mode == preempt_dynamic_##mode; \ -+ } \ -+ EXPORT_SYMBOL_GPL(preempt_model_##mode) -+ -+PREEMPT_MODEL_ACCESSOR(none); -+PREEMPT_MODEL_ACCESSOR(voluntary); -+PREEMPT_MODEL_ACCESSOR(full); -+ -+#else /* !CONFIG_PREEMPT_DYNAMIC */ -+ -+static inline void preempt_dynamic_init(void) { } -+ -+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, it's already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In Alt schedule FW, yield_to is not supported. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_flush_plug(current->plug, true); -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_RT_PRIO - 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ alt_sched_debug(); -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(sched_timeslice_ns); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. -+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); -+ -+ if (task_is_running(p)) -+ pr_cont(" running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", -+ free, task_pid_nr(p), ppid, -+ read_task_thread_flags(p)); -+ -+ print_worker_info(KERN_INFO, p); -+ print_stop_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ unsigned int state = READ_ONCE(p->__state); -+ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). -+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned int state_filter) -+{ -+ struct task_struct *g, *p; -+ -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ if (cpu == smp_processor_id() && in_hardirq()) { -+ struct pt_regs *regs; -+ -+ regs = get_irq_regs(); -+ if (regs) { -+ show_regs(regs); -+ return; -+ } -+ } -+ -+ if (trigger_single_cpu_backtrace(cpu)) -+ return; -+ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: CPU the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void __init init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ __sched_fork(0, idle); -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->__state = TASK_RUNNING; -+ /* -+ * PF_KTHREAD should already be set at this point; regardless, make it -+ * look like a proper per-CPU kthread. -+ */ -+ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; -+ kthread_set_per_cpu(idle, cpu); -+ -+ sched_queue_init_idle(&rq->queue, idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_effective_cpus) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. -+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+static int __balance_push_cpu_stop(void *arg) -+{ -+ struct task_struct *p = arg; -+ struct rq *rq = this_rq(); -+ struct rq_flags rf; -+ int cpu; -+ -+ raw_spin_lock_irq(&p->pi_lock); -+ rq_lock(rq, &rf); -+ -+ update_rq_clock(rq); -+ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) { -+ cpu = select_fallback_rq(rq->cpu, p); -+ rq = __migrate_task(rq, p, cpu); -+ } -+ -+ rq_unlock(rq, &rf); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ put_task_struct(p); -+ -+ return 0; -+} -+ -+static DEFINE_PER_CPU(struct cpu_stop_work, push_work); -+ -+/* -+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only -+ * effective when the hotplug motion is down. -+ */ -+static void balance_push(struct rq *rq) -+{ -+ struct task_struct *push_task = rq->curr; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ /* -+ * Ensure the thing is persistent until balance_push_set(.on = false); -+ */ -+ rq->balance_callback = &balance_push_callback; -+ -+ /* -+ * Only active while going offline and when invoked on the outgoing -+ * CPU. -+ */ -+ if (!cpu_dying(rq->cpu) || rq != this_rq()) -+ return; -+ -+ /* -+ * Both the cpu-hotplug and stop task are in this case and are -+ * required to complete the hotplug process. -+ */ -+ if (kthread_is_per_cpu(push_task) || -+ is_migration_disabled(push_task)) { -+ -+ /* -+ * If this is the idle task on the outgoing CPU try to wake -+ * up the hotplug control thread which might wait for the -+ * last task to vanish. The rcuwait_active() check is -+ * accurate here because the waiter is pinned on this CPU -+ * and can't obviously be running in parallel. -+ * -+ * On RT kernels this also has to check whether there are -+ * pinned and scheduled out tasks on the runqueue. They -+ * need to leave the migrate disabled section first. -+ */ -+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && -+ rcuwait_active(&rq->hotplug_wait)) { -+ raw_spin_unlock(&rq->lock); -+ rcuwait_wake_up(&rq->hotplug_wait); -+ raw_spin_lock(&rq->lock); -+ } -+ return; -+ } -+ -+ get_task_struct(push_task); -+ /* -+ * Temporarily drop rq->lock such that we can wake-up the stop task. -+ * Both preemption and IRQs are still disabled. -+ */ -+ raw_spin_unlock(&rq->lock); -+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, -+ this_cpu_ptr(&push_work)); -+ /* -+ * At this point need_resched() is true and we'll take the loop in -+ * schedule(). The next pick is obviously going to be the stop task -+ * which kthread_is_per_cpu() and will push this task away. -+ */ -+ raw_spin_lock(&rq->lock); -+} -+ -+static void balance_push_set(int cpu, bool on) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (on) { -+ WARN_ON_ONCE(rq->balance_callback); -+ rq->balance_callback = &balance_push_callback; -+ } else if (rq->balance_callback == &balance_push_callback) { -+ rq->balance_callback = NULL; -+ } -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+/* -+ * Invoked from a CPUs hotplug control thread after the CPU has been marked -+ * inactive. All tasks which are not per CPU kernel threads are either -+ * pushed off this CPU now via balance_push() or placed on a different CPU -+ * during wakeup. Wait until the CPU is quiescent. -+ */ -+static void balance_hotplug_wait(void) -+{ -+ struct rq *rq = this_rq(); -+ -+ rcuwait_wait_event(&rq->hotplug_wait, -+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq), -+ TASK_UNINTERRUPTIBLE); -+} -+ -+#else -+ -+static void balance_push(struct rq *rq) -+{ -+} -+ -+static void balance_push_set(int cpu, bool on) -+{ -+} -+ -+static inline void balance_hotplug_wait(void) -+{ -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* -+ * Clear the balance_push callback and prepare to schedule -+ * regular tasks. -+ */ -+ balance_push_set(cpu, false); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ -+ /* -+ * From this point forward, this CPU will refuse to run any task that -+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively -+ * push those tasks away until this gets cleared, see -+ * sched_cpu_dying(). -+ */ -+ balance_push_set(cpu, true); -+ -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Specifically, we rely on ttwu to no longer target this CPU, see -+ * ttwu_queue_cond() and is_cpu_allowed(). -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ update_rq_clock(rq); -+ set_rq_offline(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ balance_push_set(cpu, false); -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+ -+/* -+ * Invoked immediately before the stopper thread is invoked to bring the -+ * CPU down completely. At this point all per CPU kthreads except the -+ * hotplug thread (current) and the stopper thread (inactive) have been -+ * either parked or have been unbound from the outgoing CPU. Ensure that -+ * any of those which might be on the way out are gone. -+ * -+ * If after this point a bound task is being woken on this CPU then the -+ * responsible hotplug callback has failed to do it's job. -+ * sched_cpu_dying() will catch it with the appropriate fireworks. -+ */ -+int sched_cpu_wait_empty(unsigned int cpu) -+{ -+ balance_hotplug_wait(); -+ return 0; -+} -+ -+/* -+ * Since this CPU is going 'away' for a while, fold any nr_active delta we -+ * might have. Called from the CPU stopper task after ensuring that the -+ * stopper is the last running task on the CPU, so nr_active count is -+ * stable. We need to take the teardown thread which is calling this into -+ * account, so we hand in adjust = 1 to the load calculation. -+ * -+ * Also see the comment "Global load-average calculations". -+ */ -+static void calc_load_migrate(struct rq *rq) -+{ -+ long delta = calc_load_fold_active(rq, 1); -+ -+ if (delta) -+ atomic_long_add(delta, &calc_load_tasks); -+} -+ -+static void dump_rq_tasks(struct rq *rq, const char *loglvl) -+{ -+ struct task_struct *g, *p; -+ int cpu = cpu_of(rq); -+ -+ lockdep_assert_held(&rq->lock); -+ -+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); -+ for_each_process_thread(g, p) { -+ if (task_cpu(p) != cpu) -+ continue; -+ -+ if (!task_on_rq_queued(p)) -+ continue; -+ -+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); -+ } -+} -+ -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { -+ WARN(true, "Dying CPU not properly vacated!"); -+ dump_rq_tasks(rq, KERN_WARNING); -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ calc_load_migrate(rq); -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ /* init topo masks */ -+ tmp = per_cpu(sched_cpu_topo_masks, cpu); -+ -+ cpumask_copy(tmp, cpumask_of(cpu)); -+ tmp++; -+ cpumask_copy(tmp, cpu_possible_mask); -+ per_cpu(sched_cpu_llc_mask, cpu) = tmp; -+ per_cpu(sched_cpu_topo_end_mask, cpu) = ++tmp; -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ -+ } -+} -+ -+#define TOPOLOGY_CPUMASK(name, mask, last)\ -+ if (cpumask_and(topo, topo, mask)) { \ -+ cpumask_copy(topo, mask); \ -+ printk(KERN_INFO "sched: cpu#%02d topo: 0x%08lx - "#name, \ -+ cpu, (topo++)->bits[0]); \ -+ } \ -+ if (!last) \ -+ cpumask_complement(topo, mask) -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *topo; -+ -+ for_each_online_cpu(cpu) { -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ topo = per_cpu(sched_cpu_topo_masks, cpu) + 1; -+ -+ cpumask_complement(topo, cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); -+#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = topo; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); -+ -+ per_cpu(sched_cpu_topo_end_mask, cpu) = topo; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ per_cpu(sched_cpu_topo_masks, cpu))); -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) -+ BUG(); -+ current->flags &= ~PF_NO_SETAFFINITY; -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ unsigned long shares; -+#endif -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_QUEUE_BITS; i++) -+ cpumask_copy(sched_rq_watermark + i, cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ sched_queue_init(&rq->queue); -+ rq->watermark = IDLE_TASK_SCHED_PRIO; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); -+#endif -+ rq->balance_callback = &balance_push_callback; -+#ifdef CONFIG_HOTPLUG_CPU -+ rcuwait_init(&rq->hotplug_wait); -+#endif -+#endif /* CONFIG_SMP */ -+ rq->nr_switches = 0; -+ -+ hrtick_rq_init(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * The idle task doesn't need the kthread struct to function, but it -+ * is dressed up as a per-CPU kthread and thus needs to play the part -+ * if we want to avoid special-casing it in code that deals with per-CPU -+ * kthreads. -+ */ -+ WARN_ON(!set_kthread_struct(current)); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". -+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ balance_push_set(smp_processor_id(), false); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ psi_init(); -+ -+ preempt_dynamic_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ -+void __might_sleep(const char *file, int line) -+{ -+ unsigned int state = get_current_state(); -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%x set at [<%p>] %pS\n", state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ __might_resched(file, line, 0); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) -+{ -+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) -+ return; -+ -+ if (preempt_count() == preempt_offset) -+ return; -+ -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, ip); -+} -+ -+static inline bool resched_offsets_ok(unsigned int offsets) -+{ -+ unsigned int nested = preempt_count(); -+ -+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; -+ -+ return nested == offsets; -+} -+ -+void __might_resched(const char *file, int line, unsigned int offsets) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((resched_offsets_ok(offsets) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(), -+ offsets & MIGHT_RESCHED_PREEMPT_MASK); -+ -+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { -+ pr_err("RCU nest depth: %d, expected: %u\n", -+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); -+ } -+ -+ if (task_stack_end_corrupted(current)) -+ pr_emerg("Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ -+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, -+ preempt_disable_ip); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(__might_resched); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+ -+#ifdef CONFIG_SMP -+void __cant_migrate(const char *file, int line) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (is_migration_disabled(current)) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > 0) -+ return; -+ -+ if (current->migration_flags & MDF_FORCE_ENABLED) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); -+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), is_migration_disabled(current), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_migrate); -+#endif -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ schedstat_set(p->stats.wait_start, 0); -+ schedstat_set(p->stats.sleep_start, 0); -+ schedstat_set(p->stats.block_start, 0); -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+static void sched_unregister_group(struct task_group *tg) -+{ -+ /* -+ * We have to wait for yet another RCU grace period to expire, as -+ * print_cfs_stats() might run concurrently. -+ */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_unregister_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs: */ -+ sched_unregister_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete: */ -+ call_rcu(&tg->rcu, sched_unregister_group_rcu); -+} -+ -+void sched_release_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_release_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. -+ */ -+ sched_unregister_group(tg); -+} -+ -+#ifdef CONFIG_RT_GROUP_SCHED -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+#endif -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+#ifdef CONFIG_FAIR_GROUP_SCHED -+static DEFINE_MUTEX(shares_mutex); -+ -+int sched_group_set_shares(struct task_group *tg, unsigned long shares) -+{ -+ /* -+ * We can't change the weight of the root cgroup. -+ */ -+ if (&root_task_group == tg) -+ return -EINVAL; -+ -+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); -+ -+ mutex_lock(&shares_mutex); -+ if (tg->shares == shares) -+ goto done; -+ -+ tg->shares = shares; -+done: -+ mutex_unlock(&shares_mutex); -+ return 0; -+} -+ -+static int cpu_shares_write_u64(struct cgroup_subsys_state *css, -+ struct cftype *cftype, u64 shareval) -+{ -+ if (shareval > scale_load_down(ULONG_MAX)) -+ shareval = MAX_SHARES; -+ return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+} -+ -+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, -+ struct cftype *cft) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ return (u64) scale_load_down(tg->shares); -+} -+#endif -+ -+static struct cftype cpu_legacy_files[] = { -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ { -+ .name = "shares", -+ .read_u64 = cpu_shares_read_u64, -+ .write_u64 = cpu_shares_write_u64, -+ }, -+#endif -+ { } /* Terminate */ -+}; -+ -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+#ifdef CONFIG_RT_GROUP_SCHED -+ .can_attach = cpu_cgroup_can_attach, -+#endif -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c -new file mode 100644 -index 000000000000..1212a031700e ---- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the alt scheduler debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) \ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..7c1cc0cbca0d ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,660 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "../workqueue_internal.h" -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_BMQ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+/* bits: RT(0-99), reserved(100-127), NORMAL_PRIO_NUM, cpu idle task */ -+#define SCHED_BITS (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM + 1) -+#endif /* CONFIG_SCHED_PDS */ -+ -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+extern void resched_latency_warn(int cpu, u64 latency); -+#else -+# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -+static inline void resched_latency_warn(int cpu, u64 latency) {} -+#endif -+ -+/* -+ * Increase resolution of nice-level calculations for 64-bit architectures. -+ * The extra resolution improves shares distribution and load balancing of -+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup -+ * hierarchies, especially on larger systems. This is not a user-visible change -+ * and does not change the user-interface for setting shares/weights. -+ * -+ * We increase resolution only if we have enough bits to allow this increased -+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit -+ * are pretty high and the returns do not justify the increased costs. -+ * -+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to -+ * increase coverage and consistency always enable it on 64-bit platforms. -+ */ -+#ifdef CONFIG_64BIT -+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) -+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -+# define scale_load_down(w) \ -+({ \ -+ unsigned long __w = (w); \ -+ if (__w) \ -+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ -+ __w; \ -+}) -+#else -+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) -+# define scale_load(w) (w) -+# define scale_load_down(w) (w) -+#endif -+ -+#ifdef CONFIG_FAIR_GROUP_SCHED -+#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD -+ -+/* -+ * A weight of 0 or 1 can cause arithmetics problems. -+ * A weight of a cfs_rq is the sum of weights of which entities -+ * are queued on this cfs_rq, so a weight of a entity should not be -+ * too large, so as the shares value of a task group. -+ * (The default weight is 1024 - so there's no practical -+ * limitation from this.) -+ */ -+#define MIN_SHARES (1UL << 1) -+#define MAX_SHARES (1UL << 18) -+#endif -+ -+/* -+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off: -+ */ -+#ifdef CONFIG_SCHED_DEBUG -+# define const_debug __read_mostly -+#else -+# define const_debug const -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+#define SCHED_QUEUE_BITS (SCHED_BITS - 1) -+ -+struct sched_queue { -+ DECLARE_BITMAP(bitmap, SCHED_QUEUE_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+struct rq; -+struct balance_callback { -+ struct balance_callback *next; -+ void (*func)(struct rq *rq); -+}; -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+ struct sched_queue queue; -+#ifdef CONFIG_SCHED_PDS -+ u64 time_edge; -+#endif -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_SCHED_DEBUG -+ u64 last_seen_need_resched_ns; -+ int ticks_without_resched; -+#endif -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ unsigned int ttwu_pending; -+ unsigned char nohz_idle_balance; -+ unsigned char idle_balance; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+ struct balance_callback *balance_callback; -+#ifdef CONFIG_HOTPLUG_CPU -+ struct rcuwait hotplug_wait; -+#endif -+ unsigned int nr_pinned; -+ -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* For genenal cpu load util */ -+ s32 load_history; -+ u64 load_block; -+ u64 load_stamp; -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned int nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+ ktime_t hrtick_time; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#ifdef CONFIG_SMP -+ call_single_data_t nohz_csd; -+#endif -+ atomic_t nohz_flags; -+#endif /* CONFIG_NO_HZ_COMMON */ -+}; -+ -+extern unsigned long rq_load_util(struct rq *rq, unsigned long max); -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern bool sched_smp_initialized; -+ -+enum { -+ ITSELF_LEVEL_SPACE_HOLDER, -+#ifdef CONFIG_SCHED_SMT -+ SMT_LEVEL_SPACE_HOLDER, -+#endif -+ COREGROUP_LEVEL_SPACE_HOLDER, -+ CORE_LEVEL_SPACE_HOLDER, -+ OTHER_LEVEL_SPACE_HOLDER, -+ NR_CPU_AFFINITY_LEVELS -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks); -+DECLARE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+ -+static inline int -+__best_mask_cpu(const cpumask_t *cpumask, const cpumask_t *mask) -+{ -+ int cpu; -+ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ -+ return cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *mask) -+{ -+ return __best_mask_cpu(mask, per_cpu(sched_cpu_topo_masks, cpu)); -+} -+ -+extern void flush_smp_call_function_queue(void); -+ -+#else /* !CONFIG_SMP */ -+static inline void flush_smp_call_function_queue(void) { } -+#endif -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline -+ * scheduler code. -+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_lock(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(&rq->lock); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline void -+rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline raw_spinlock_t *__rq_lockp(struct rq *rq) -+{ -+ return &rq->lock; -+} -+ -+static inline raw_spinlock_t *rq_lockp(struct rq *rq) -+{ -+ return __rq_lockp(rq); -+} -+ -+static inline void lockdep_assert_rq_held(struct rq *rq) -+{ -+ lockdep_assert_held(__rq_lockp(rq)); -+} -+ -+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); -+extern void raw_spin_rq_unlock(struct rq *rq); -+ -+static inline void raw_spin_rq_lock(struct rq *rq) -+{ -+ raw_spin_rq_lock_nested(rq, 0); -+} -+ -+static inline void raw_spin_rq_lock_irq(struct rq *rq) -+{ -+ local_irq_disable(); -+ raw_spin_rq_lock(rq); -+} -+ -+static inline void raw_spin_rq_unlock_irq(struct rq *rq) -+{ -+ raw_spin_rq_unlock(rq); -+ local_irq_enable(); -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_on_cpu(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern int task_running_nice(struct task_struct *p); -+ -+extern struct static_key_false sched_schedstats; -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#define NOHZ_BALANCE_KICK_BIT 0 -+#define NOHZ_STATS_KICK_BIT 1 -+ -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -+ -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) -+ -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -+ -+/* TODO: needed? -+extern void nohz_balance_exit_idle(struct rq *rq); -+#else -+static inline void nohz_balance_exit_idle(struct rq *rq) { } -+*/ -+#endif -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. -+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+extern void swake_up_all_locked(struct swait_queue_head *q); -+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#ifdef CONFIG_PREEMPT_DYNAMIC -+extern int preempt_dynamic_mode; -+extern int sched_dynamic_mode(const char *str); -+extern void sched_dynamic_update(int mode); -+#endif -+ -+static inline void nohz_run_idle_balance(int cpu) { } -+ -+static inline -+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, -+ struct task_struct *p) -+{ -+ return util; -+} -+ -+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } -+ -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..66b77291b9d0 ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,110 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+/* -+ * BMQ only routines -+ */ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+/* -+ * Common interfaces -+ */ -+static inline void sched_timeslice_imp(const int timeslice_ms) {} -+ -+static inline int -+task_sched_prio_normal(const struct task_struct *p, const struct rq *rq) -+{ -+ return p->prio + p->boost_prio - MAX_RT_PRIO; -+} -+ -+static inline int task_sched_prio(const struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO)? p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; -+} -+ -+static inline int -+task_sched_prio_idx(const struct task_struct *p, const struct rq *rq) -+{ -+ return task_sched_prio(p); -+} -+ -+static inline int sched_prio2idx(int prio, struct rq *rq) -+{ -+ return prio; -+} -+ -+static inline int sched_idx2prio(int idx, struct rq *rq) -+{ -+ return idx; -+} -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq, task_sched_prio_idx(p, rq)); -+ } -+} -+ -+static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) {} -+ -+inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+#ifdef CONFIG_SMP -+static inline void sched_task_ttwu(struct task_struct *p) -+{ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+} -+#endif -+ -+static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) -+{ -+ if (rq_switch_time(rq) < boost_threshold(p)) -+ boost_task(p); -+} -+ -+static inline void update_rq_time_edge(struct rq *rq) {} -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab3773f..71a25540d65e 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -42,13 +42,19 @@ - - #include "idle.c" - -+#ifndef CONFIG_SCHED_ALT - #include "rt.c" -+#endif - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - # include "cpudeadline.c" -+#endif - # include "pelt.c" - #endif - - #include "cputime.c" --#include "deadline.c" - -+#ifndef CONFIG_SCHED_ALT -+#include "deadline.c" -+#endif -diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c -index 99bdd96f454f..23f80a86d2d7 100644 ---- a/kernel/sched/build_utility.c -+++ b/kernel/sched/build_utility.c -@@ -85,7 +85,9 @@ - - #ifdef CONFIG_SMP - # include "cpupri.c" -+#ifndef CONFIG_SCHED_ALT - # include "stop_task.c" -+#endif - # include "topology.c" - #endif - -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 1207c78f85c1..68812e0756cb 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -159,9 +159,14 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) - struct rq *rq = cpu_rq(sg_cpu->cpu); - - sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+#ifndef CONFIG_SCHED_ALT - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), - FREQUENCY_UTIL, NULL); -+#else -+ sg_cpu->bw_dl = 0; -+ sg_cpu->util = rq_load_util(rq, sg_cpu->max); -+#endif /* CONFIG_SCHED_ALT */ - } - - /** -@@ -305,8 +310,10 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_cpu->sg_policy->limits_changed = true; -+#endif - } - - static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, -@@ -606,6 +613,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -838,7 +846,9 @@ cpufreq_governor_init(schedutil_gov); - #ifdef CONFIG_ENERGY_MODEL - static void rebuild_sd_workfn(struct work_struct *work) - { -+#ifndef CONFIG_SCHED_ALT - rebuild_sched_domains_energy(); -+#endif /* CONFIG_SCHED_ALT */ - } - static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 95fc77853743..b48b3f9ed47f 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. */ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - task_group_account_field(p, CPUTIME_NICE, cputime); - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -284,7 +284,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -294,7 +294,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -626,7 +626,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - if (task_cputime(p, &cputime.utime, &cputime.stime)) -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 1637b65ba07a..033c6deeb515 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -7,6 +7,7 @@ - * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - */ - -+#ifndef CONFIG_SCHED_ALT - /* - * This allows printing both to /proc/sched_debug and - * to the console -@@ -215,6 +216,7 @@ static const struct file_operations sched_scaling_fops = { - }; - - #endif /* SMP */ -+#endif /* !CONFIG_SCHED_ALT */ - - #ifdef CONFIG_PREEMPT_DYNAMIC - -@@ -278,6 +280,7 @@ static const struct file_operations sched_dynamic_fops = { - - #endif /* CONFIG_PREEMPT_DYNAMIC */ - -+#ifndef CONFIG_SCHED_ALT - __read_mostly bool sched_debug_verbose; - - static const struct seq_operations sched_debug_sops; -@@ -293,6 +296,7 @@ static const struct file_operations sched_debug_fops = { - .llseek = seq_lseek, - .release = seq_release, - }; -+#endif /* !CONFIG_SCHED_ALT */ - - static struct dentry *debugfs_sched; - -@@ -302,12 +306,15 @@ static __init int sched_init_debug(void) - - debugfs_sched = debugfs_create_dir("sched", NULL); - -+#ifndef CONFIG_SCHED_ALT - debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); - debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); -+#endif /* !CONFIG_SCHED_ALT */ - #ifdef CONFIG_PREEMPT_DYNAMIC - debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); - #endif - -+#ifndef CONFIG_SCHED_ALT - debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); - debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); - debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); -@@ -337,11 +344,13 @@ static __init int sched_init_debug(void) - #endif - - debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); -+#endif /* !CONFIG_SCHED_ALT */ - - return 0; - } - late_initcall(sched_init_debug); - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_SMP - - static cpumask_var_t sd_sysctl_cpus; -@@ -1068,6 +1077,7 @@ void proc_sched_set_task(struct task_struct *p) - memset(&p->stats, 0, sizeof(p->stats)); - #endif - } -+#endif /* !CONFIG_SCHED_ALT */ - - void resched_latency_warn(int cpu, u64 latency) - { -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f26ab2675f7d..480d4ad16d45 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -400,6 +400,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. - */ -@@ -521,3 +522,4 @@ DEFINE_SCHED_CLASS(idle) = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h -new file mode 100644 -index 000000000000..56a649d02e49 ---- /dev/null -+++ b/kernel/sched/pds.h -@@ -0,0 +1,127 @@ -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+static int sched_timeslice_shift = 22; -+ -+#define NORMAL_PRIO_MOD(x) ((x) & (NORMAL_PRIO_NUM - 1)) -+ -+/* -+ * Common interfaces -+ */ -+static inline void sched_timeslice_imp(const int timeslice_ms) -+{ -+ if (2 == timeslice_ms) -+ sched_timeslice_shift = 21; -+} -+ -+static inline int -+task_sched_prio_normal(const struct task_struct *p, const struct rq *rq) -+{ -+ s64 delta = p->deadline - rq->time_edge + NORMAL_PRIO_NUM - NICE_WIDTH; -+ -+ if (WARN_ONCE(delta > NORMAL_PRIO_NUM - 1, -+ "pds: task_sched_prio_normal() delta %lld\n", delta)) -+ return NORMAL_PRIO_NUM - 1; -+ -+ return (delta < 0) ? 0 : delta; -+} -+ -+static inline int task_sched_prio(const struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO) ? p->prio : -+ MIN_NORMAL_PRIO + task_sched_prio_normal(p, task_rq(p)); -+} -+ -+static inline int -+task_sched_prio_idx(const struct task_struct *p, const struct rq *rq) -+{ -+ return (p->prio < MAX_RT_PRIO) ? p->prio : MIN_NORMAL_PRIO + -+ NORMAL_PRIO_MOD(task_sched_prio_normal(p, rq) + rq->time_edge); -+} -+ -+static inline int sched_prio2idx(int prio, struct rq *rq) -+{ -+ return (IDLE_TASK_SCHED_PRIO == prio || prio < MAX_RT_PRIO) ? prio : -+ MIN_NORMAL_PRIO + NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) + -+ rq->time_edge); -+} -+ -+static inline int sched_idx2prio(int idx, struct rq *rq) -+{ -+ return (idx < MAX_RT_PRIO) ? idx : MIN_NORMAL_PRIO + -+ NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM - -+ NORMAL_PRIO_MOD(rq->time_edge)); -+} -+ -+static inline void sched_renew_deadline(struct task_struct *p, const struct rq *rq) -+{ -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = (rq->clock >> sched_timeslice_shift) + -+ p->static_prio - (MAX_PRIO - NICE_WIDTH); -+} -+ -+int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio > DEFAULT_PRIO); -+} -+ -+static inline void update_rq_time_edge(struct rq *rq) -+{ -+ struct list_head head; -+ u64 old = rq->time_edge; -+ u64 now = rq->clock >> sched_timeslice_shift; -+ u64 prio, delta; -+ -+ if (now == old) -+ return; -+ -+ delta = min_t(u64, NORMAL_PRIO_NUM, now - old); -+ INIT_LIST_HEAD(&head); -+ -+ for_each_set_bit(prio, &rq->queue.bitmap[2], delta) -+ list_splice_tail_init(rq->queue.heads + MIN_NORMAL_PRIO + -+ NORMAL_PRIO_MOD(prio + old), &head); -+ -+ rq->queue.bitmap[2] = (NORMAL_PRIO_NUM == delta) ? 0UL : -+ rq->queue.bitmap[2] >> delta; -+ rq->time_edge = now; -+ if (!list_empty(&head)) { -+ u64 idx = MIN_NORMAL_PRIO + NORMAL_PRIO_MOD(now); -+ struct task_struct *p; -+ -+ list_for_each_entry(p, &head, sq_node) -+ p->sq_idx = idx; -+ -+ list_splice(&head, rq->queue.heads + idx); -+ rq->queue.bitmap[2] |= 1UL; -+ } -+} -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ sched_renew_deadline(p, rq); -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq, task_sched_prio_idx(p, rq)); -+} -+ -+static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) -+{ -+ u64 max_dl = rq->time_edge + NICE_WIDTH - 1; -+ if (unlikely(p->deadline > max_dl)) -+ p->deadline = max_dl; -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ sched_renew_deadline(p, rq); -+} -+ -+static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ time_slice_expired(p, rq); -+} -+ -+#ifdef CONFIG_SMP -+static inline void sched_task_ttwu(struct task_struct *p) {} -+#endif -+static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index 0f310768260c..bd38bf738fe9 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -266,6 +266,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * sched_entity: - * -@@ -383,8 +384,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - /* - * thermal: - * -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index 3a0e0dc28721..e8a7d84aa5a5 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,13 +1,15 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_ALT - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); - - static inline u64 thermal_load_avg(struct rq *rq) -@@ -44,6 +46,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) - return PELT_MIN_DIVIDER + avg->period_contrib; - } - -+#ifndef CONFIG_SCHED_ALT - static inline void cfs_se_util_change(struct sched_avg *avg) - { - unsigned int enqueued; -@@ -180,9 +183,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_ALT */ - - #else - -+#ifndef CONFIG_SCHED_ALT - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -200,6 +205,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index a4a20046e586..c363693cd869 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -5,6 +5,10 @@ - #ifndef _KERNEL_SCHED_SCHED_H - #define _KERNEL_SCHED_SCHED_H - -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" -+#else -+ - #include - #include - #include -@@ -3183,4 +3187,9 @@ static inline void update_current_exec_runtime(struct task_struct *curr, - cgroup_account_cputime(curr, delta_exec); - } - -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 857f837f52cb..5486c63e4790 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -125,8 +125,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -143,6 +145,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -171,6 +174,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h -index 84a188913cc9..53934e7ef5db 100644 ---- a/kernel/sched/stats.h -+++ b/kernel/sched/stats.h -@@ -89,6 +89,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt - - #endif /* CONFIG_SCHEDSTATS */ - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity_stats { - struct sched_entity se; -@@ -105,6 +106,7 @@ __schedstats_from_se(struct sched_entity *se) - #endif - return &task_of(se)->stats; - } -+#endif /* CONFIG_SCHED_ALT */ - - #ifdef CONFIG_PSI - void psi_task_change(struct task_struct *task, int clear, int set); -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 8739c2a5a54e..d8dd6c15eb47 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -3,6 +3,7 @@ - * Scheduler topology setup/handling methods - */ - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1413,8 +1414,10 @@ static void asym_cpu_capacity_scan(void) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1647,6 +1650,7 @@ sd_init(struct sched_domain_topology_level *tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. -@@ -1683,6 +1687,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology_saved = NULL; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2645,3 +2650,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index c6d9dec11b74..2bc42ce8b48e 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -93,6 +93,10 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); - - /* Constants used for minimum and maximum */ - -+#ifdef CONFIG_SCHED_ALT -+extern int sched_yield_type; -+#endif -+ - #ifdef CONFIG_PERF_EVENTS - static const int six_hundred_forty_kb = 640 * 1024; - #endif -@@ -1633,6 +1637,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA_BALANCING - { - .procname = "numa_balancing", -@@ -1652,6 +1657,7 @@ static struct ctl_table kern_table[] = { - .extra1 = SYSCTL_ZERO, - }, - #endif /* CONFIG_NUMA_BALANCING */ -+#endif /* !CONFIG_SCHED_ALT */ - { - .procname = "panic", - .data = &panic_timeout, -@@ -1953,6 +1959,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_ALT -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_TWO, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 3ae661ab6260..35f0176dcdb0 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -2088,8 +2088,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - hrtimer_init_sleeper_on_stack(&t, clockid, mode); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index cb925e8ef9a8..67d823510f5c 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -223,7 +223,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -866,6 +866,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_ALT - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -873,6 +874,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -900,8 +902,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -915,7 +919,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1151,8 +1155,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index a2d301f58ced..2ccdede8585c 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1143,10 +1143,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ/PDS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/sys-kernel/pinephone-sources/files/5021_sched-alt-missing-rq-lock-irq-function.patch b/sys-kernel/pinephone-sources/files/5021_sched-alt-missing-rq-lock-irq-function.patch deleted file mode 100644 index 04cca61..0000000 --- a/sys-kernel/pinephone-sources/files/5021_sched-alt-missing-rq-lock-irq-function.patch +++ /dev/null @@ -1,30 +0,0 @@ -From 4157360d2e1cbdfb8065f151dbe057b17188a23f Mon Sep 17 00:00:00 2001 -From: Tor Vic -Date: Mon, 7 Nov 2022 15:11:54 +0100 -Subject: [PATCH] sched/alt: Add missing rq_lock_irq() function to header file - ---- - kernel/sched/alt_sched.h | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -index 93ff3bddd36f..a00bc84b93b2 100644 ---- a/kernel/sched/alt_sched.h -+++ b/kernel/sched/alt_sched.h -@@ -387,6 +387,13 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); - } - -+static inline void -+rq_lock_irq(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irq(&rq->lock); -+} -+ - static inline void - rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) --- -GitLab - diff --git a/sys-kernel/pinephone-sources/pinephone-sources-6.1.4.ebuild b/sys-kernel/pinephone-sources/pinephone-sources-6.2.2.ebuild similarity index 92% rename from sys-kernel/pinephone-sources/pinephone-sources-6.1.4.ebuild rename to sys-kernel/pinephone-sources/pinephone-sources-6.2.2.ebuild index eae3067..1ebef22 100644 --- a/sys-kernel/pinephone-sources/pinephone-sources-6.1.4.ebuild +++ b/sys-kernel/pinephone-sources/pinephone-sources-6.2.2.ebuild @@ -17,15 +17,15 @@ DEPEND="${RDEPEND} DESCRIPTION="Full sources for the Linux kernel, with megi's patch for pinephone and gentoo patchset" -MEGI_TAG="orange-pi-6.1-20230104-1712" +MEGI_TAG="orange-pi-6.2-20230307-1859" SRC_URI="https://github.com/megous/linux/archive/${MEGI_TAG}.tar.gz" PATCHES=( #Kernel patch - ${FILESDIR}/1003_linux-6.1.4.patch #Gentoo Patches ${FILESDIR}/1500_XATTR_USER_PREFIX.patch + ${FILESDIR}/1510_fs-enable-link-security-restrictions-by-default.patch ${FILESDIR}/1700_sparc-address-warray-bound-warnings.patch ${FILESDIR}/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch ${FILESDIR}/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch @@ -34,8 +34,6 @@ PATCHES=( ${FILESDIR}/3000_Support-printing-firmware-info.patch ${FILESDIR}/4567_distro-Gentoo-Kconfig.patch ${FILESDIR}/5010_enable-cpu-optimizations-universal.patch - ${FILESDIR}/5020_BMQ-and-PDS-io-scheduler-v6.1-r0.patch - ${FILESDIR}/5021_sched-alt-missing-rq-lock-irq-function.patch # Drop Megi's Modem-Power ${FILESDIR}/0101-arm64-dts-pinephone-drop-modem-power-node.patch