View Issue Details

IDProjectCategoryView StatusLast Update
0017674CentOS-8kernel-pluspublic2020-08-16 17:42
Reporterkabe 
PrioritynormalSeverityfeatureReproducibilityalways
Status acknowledgedResolutionopen 
Platformi686OSCentOS 8.2.2004OS Version8
Product Version8.2.2004 
Target VersionFixed in Version 
Summary0017674: Patches needed for i686 compile of kernel-4.18.0-193.6.3.centos.plus
Descriptionreference: https://bugs.centos.org/view.php?id=17195 (0017195: Patches needed for i686 compile of kernel-4.18.0-147.5.1.centos.plus)

These are the patches needed to create a working i686 build, based on kernel-plus.
Tested on Hyper-V and ThinkPad R51.

RHEL 8.2 kernel already had become harder to compile on i686.

No, there's no i686 CentOS 8 distribution (yet), this is a FYI "bugreport".
Steps To Reproduce./rpmbuild --target=i686 -v -bb \
--without debug --without debuginfo --with baseonly \
--without kabidupchk --without kabidwchk --without kabidw_base \
--without bpftool \
SPECS/kernel.spec
Additional InformationToracat, you do not need to act on this report;
this is just a FYI report in case AltArch i686 sig become interested in CentOS 8 i686
(perhaps after 7.x had gone EOL).

By the way, the git branch "c8-sig-centosplus-kernel" of kernel.git has 193.14.2.el8_2 as latest, but
.kernel.metadata isn't updated; it points to linux-4.18.0-80.1.2.el8_0.tar.xz binary. Can you look into it if you had time.
Tagsi386

Relationships

related to 0017195 acknowledgedtoracat Patches needed for i686 compile of kernel-4.18.0-147.5.1.centos.plus 

Activities

kabe

kabe

2020-08-15 02:32

reporter   ~0037539

Enlarge struct netlink_callback.args[6] to [8].
I scratched my head for several hours, and concluded that [8] is the best answer.
I couldn't make this a u64, because that breaks compiling somewhere else.
We don't care about ABI compatibility on i686, and this is a netlink call which most userland doesn't use.

i686-netlink_callback-s64-6_2.patch (1,046 bytes)
Enlarge args[6] to args[8], to accommodate struct xfrm_state_walk
in net/xfrm/xfrm_user.c:xfrm_dump_sa() BUILD_BUG_ON.
We don't care about RH_KABI on i686.

diff -up ./include/linux/netlink.h.nlcb ./include/linux/netlink.h
--- ./include/linux/netlink.h.nlcb	2020-07-30 08:59:22.725284039 +0900
+++ ./include/linux/netlink.h	2020-07-30 09:00:44.494497447 +0900
@@ -190,14 +190,18 @@ struct netlink_callback {
 	u16			family;
 	u16			min_dump_alloc;
 	unsigned int		prev_seq, seq;
-	RH_KABI_REPLACE(long	args[6],
+	RH_KABI_REPLACE(long	args[8],
 		        union {
-		                u8              ctx[48];
+#ifdef CONFIG_X86_32
+		                u8              ctx[32];
+#else
+		                u8              ctx[64];
+#endif
 
 				/* args is deprecated. Cast a struct over ctx instead
 		                 * for proper type safety.
 		                 */
-		                long            args[6];
+		                long            args[8];
 		        };)
 	RH_KABI_EXTEND(struct netlink_ext_ack *extack)
 	RH_KABI_EXTEND(bool strict_check)
kabe

kabe

2020-08-15 02:34

reporter   ~0037540

cherry-pick of entry_32.S from kernel.org upstream

patch-32-SAVE_ALL.patch (8,106 bytes)
diff -up ./arch/x86/entry/entry_32.S.sws ./arch/x86/entry/entry_32.S
--- ./arch/x86/entry/entry_32.S.sws	2020-06-02 04:12:31.000000000 +0900
+++ ./arch/x86/entry/entry_32.S	2020-07-27 01:19:34.941674695 +0900
@@ -154,7 +154,7 @@
 
 #endif /* CONFIG_X86_32_LAZY_GS */
 
-.macro SAVE_ALL pt_regs_ax=%eax
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
 	cld
 	PUSH_GS
 	pushl	%fs
@@ -173,6 +173,12 @@
 	movl	$(__KERNEL_PERCPU), %edx
 	movl	%edx, %fs
 	SET_KERNEL_GS %edx
+
+	/* Switch to kernel stack if necessary */
+.if \switch_stacks > 0
+	SWITCH_TO_KERNEL_STACK
+.endif
+
 .endm
 
 /*
@@ -221,6 +227,72 @@
 	POP_GS_EX
 .endm
 
+
+/*
+ * Called with pt_regs fully populated and kernel segments loaded,
+ * so we can access PER_CPU and use the integer registers.
+ *
+ * We need to be very careful here with the %esp switch, because an NMI
+ * can happen everywhere. If the NMI handler finds itself on the
+ * entry-stack, it will overwrite the task-stack and everything we
+ * copied there. So allocate the stack-frame on the task-stack and
+ * switch to it before we do any copying.
+ */
+.macro SWITCH_TO_KERNEL_STACK
+
+	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
+
+	/* Are we on the entry stack? Bail out if not! */
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%esp, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
+	jae	.Lend_\@
+
+	/* Load stack pointer into %esi and %edi */
+	movl	%esp, %esi
+	movl	%esi, %edi
+
+	/* Move %edi to the top of the entry stack */
+	andl	$(MASK_entry_stack), %edi
+	addl	$(SIZEOF_entry_stack), %edi
+
+	/* Load top of task-stack into %edi */
+	movl	TSS_entry2task_stack(%edi), %edi
+
+	/* Bytes to copy */
+	movl	$PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esi)
+	jz	.Lcopy_pt_regs_\@
+
+	/*
+	 * Stack-frame contains 4 additional segment registers when
+	 * coming from VM86 mode
+	 */
+	addl	$(4 * 4), %ecx
+
+.Lcopy_pt_regs_\@:
+#endif
+
+	/* Allocate frame on task-stack */
+	subl	%ecx, %edi
+
+	/* Switch to task-stack */
+	movl	%edi, %esp
+
+	/*
+	 * We are now on the task-stack and can safely copy over the
+	 * stack-frame
+	 */
+	shrl	$2, %ecx
+	cld
+	rep movsl
+
+.Lend_\@:
+.endm
+
 /*
  * %eax: prev task
  * %edx: next task
@@ -422,7 +494,7 @@ ENTRY(entry_SYSENTER_32)
 	pushl	$__USER_CS		/* pt_regs->cs */
 	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
 	pushl	%eax			/* pt_regs->orig_ax */
-	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */
+	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */
 
 	/*
 	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
@@ -533,7 +605,8 @@ ENDPROC(entry_SYSENTER_32)
 ENTRY(entry_INT80_32)
 	ASM_CLAC
 	pushl	%eax			/* pt_regs->orig_ax */
-	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */
+
+	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */
 
 	/*
 	 * User mode is traced as though IRQs are on, and the interrupt gate
@@ -696,7 +769,8 @@ ENDPROC(common_spurious)
 common_interrupt:
 	ASM_CLAC
 	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
-	SAVE_ALL
+
+	SAVE_ALL switch_stacks=1
 	ENCODE_FRAME_POINTER
 	TRACE_IRQS_OFF
 	movl	%esp, %eax
@@ -704,16 +778,16 @@ common_interrupt:
 	jmp	ret_from_intr
 ENDPROC(common_interrupt)
 
-#define BUILD_INTERRUPT3(name, nr, fn)	\
-ENTRY(name)				\
-	ASM_CLAC;			\
-	pushl	$~(nr);			\
-	SAVE_ALL;			\
-	ENCODE_FRAME_POINTER;		\
-	TRACE_IRQS_OFF			\
-	movl	%esp, %eax;		\
-	call	fn;			\
-	jmp	ret_from_intr;		\
+#define BUILD_INTERRUPT3(name, nr, fn)			\
+ENTRY(name)						\
+	ASM_CLAC;					\
+	pushl	$~(nr);					\
+	SAVE_ALL switch_stacks=1;			\
+	ENCODE_FRAME_POINTER;				\
+	TRACE_IRQS_OFF					\
+	movl	%esp, %eax;				\
+	call	fn;					\
+	jmp	ret_from_intr;				\
 ENDPROC(name)
 
 #define BUILD_INTERRUPT(name, nr)		\
@@ -945,16 +1019,20 @@ common_exception:
 	pushl	%es
 	pushl	%ds
 	pushl	%eax
+	movl	$(__USER_DS), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	$(__KERNEL_PERCPU), %eax
+	movl	%eax, %fs
 	pushl	%ebp
 	pushl	%edi
 	pushl	%esi
 	pushl	%edx
 	pushl	%ecx
 	pushl	%ebx
+	SWITCH_TO_KERNEL_STACK
 	ENCODE_FRAME_POINTER
 	cld
-	movl	$(__KERNEL_PERCPU), %ecx
-	movl	%ecx, %fs
 	UNWIND_ESPFIX_STACK
 	GS_TO_REG %ecx
 	movl	PT_GS(%esp), %edi		# get the function address
@@ -962,9 +1040,6 @@ common_exception:
 	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
 	REG_TO_PTGS %ecx
 	SET_KERNEL_GS %ecx
-	movl	$(__USER_DS), %ecx
-	movl	%ecx, %ds
-	movl	%ecx, %es
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
 	CALL_NOSPEC %edi
@@ -983,6 +1058,7 @@ ENTRY(debug)
 	 */
 	ASM_CLAC
 	pushl	$-1				# mark this as an int
+
 	SAVE_ALL
 	ENCODE_FRAME_POINTER
 	xorl	%edx, %edx			# error code 0
@@ -1018,6 +1094,7 @@ END(debug)
  */
 ENTRY(nmi)
 	ASM_CLAC
+
 #ifdef CONFIG_X86_ESPFIX32
 	pushl	%eax
 	movl	%ss, %eax
@@ -1081,7 +1158,8 @@ END(nmi)
 ENTRY(int3)
 	ASM_CLAC
 	pushl	$-1				# mark this as an int
-	SAVE_ALL
+
+	SAVE_ALL switch_stacks=1
 	ENCODE_FRAME_POINTER
 	TRACE_IRQS_OFF
 	xorl	%edx, %edx			# zero error code
diff -up ./arch/x86/include/asm/switch_to.h.sws ./arch/x86/include/asm/switch_to.h
--- ./arch/x86/include/asm/switch_to.h.sws	2020-06-02 04:12:31.000000000 +0900
+++ ./arch/x86/include/asm/switch_to.h	2020-07-27 01:10:27.091017349 +0900
@@ -87,13 +87,23 @@ static inline void refresh_sysenter_cs(s
 /* This is used when switching tasks or entering/exiting vm86 mode. */
 static inline void update_sp0(struct task_struct *task)
 {
-	/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
+	/* sp0 always points to the entry trampoline stack, which is constant: */
 #ifdef CONFIG_X86_32
-	load_sp0(task->thread.sp0);
+	if (static_cpu_has(X86_FEATURE_XENPV))
+		load_sp0(task->thread.sp0);
+	else
+		this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
+	/*
+	 * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
+	 * doesn't work on x86-32 because sp1 and
+	 * cpu_current_top_of_stack have different values (because of
+	 * the non-zero stack-padding on 32bit).
+	 */
 	if (static_cpu_has(X86_FEATURE_XENPV))
 		load_sp0(task_top_of_stack(task));
 #endif
+
 }
 
 #endif /* _ASM_X86_SWITCH_TO_H */
diff -up ./arch/x86/kernel/asm-offsets.c.sws ./arch/x86/kernel/asm-offsets.c
--- ./arch/x86/kernel/asm-offsets.c.sws	2020-06-02 04:12:31.000000000 +0900
+++ ./arch/x86/kernel/asm-offsets.c	2020-07-27 01:10:27.092017346 +0900
@@ -102,6 +102,7 @@ void common(void) {
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
 	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
+	DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
 
 	/* Offset for fields in tss_struct */
 	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
diff -up ./arch/x86/kernel/cpu/common.c.sws ./arch/x86/kernel/cpu/common.c
--- ./arch/x86/kernel/cpu/common.c.sws	2020-06-02 04:12:31.000000000 +0900
+++ ./arch/x86/kernel/cpu/common.c	2020-07-27 01:10:27.093017344 +0900
@@ -1923,11 +1923,12 @@ void cpu_init(void)
 	enter_lazy_tlb(&init_mm, curr);
 
 	/*
-	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
-	 * task never enters user mode.
+	 * Initialize the TSS.  sp0 points to the entry trampoline stack
+	 * regardless of what task is running.
 	 */
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
+	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
 
 	load_mm_ldt(&init_mm);
 
diff -up ./arch/x86/kernel/process.c.sws ./arch/x86/kernel/process.c
--- ./arch/x86/kernel/process.c.sws	2020-06-02 04:12:31.000000000 +0900
+++ ./arch/x86/kernel/process.c	2020-07-27 01:10:27.094017341 +0900
@@ -63,14 +63,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(st
 		 */
 		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-#ifdef CONFIG_X86_64
 		/*
 		 * .sp1 is cpu_current_top_of_stack.  The init task never
 		 * runs user code, but cpu_current_top_of_stack should still
 		 * be well defined before the first context switch.
 		 */
 		.sp1 = TOP_OF_INIT_STACK,
-#endif
 
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
patch-32-SAVE_ALL.patch (8,106 bytes)
kabe

kabe

2020-08-15 02:35

reporter   ~0037541

cherry-pick from kernel.org upstream

patch-xfrm-time64.patch (1,032 bytes)
commit 03dc7a35fcc83a199121a5156c4a7a976b836682
Author: Arnd Bergmann <arnd@arndb.de>
Date:   Wed Jul 11 12:19:14 2018 +0200

    ipv6: xfrm: use 64-bit timestamps
    
    get_seconds() is deprecated because it can overflow on 32-bit
    architectures.  For the xfrm_state->lastused member, we treat the data
    as a 64-bit number already, so we just need to use the right accessor
    that works on both 32-bit and 64-bit machines.
    
    Signed-off-by: Arnd Bergmann <arnd@arndb.de>
    Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>


Ported to CentOS 8.2 by kabe

diff -up ./include/net/xfrm.h.xfrm ./include/net/xfrm.h
--- ./include/net/xfrm.h.xfrm	2020-07-27 02:01:06.585030434 +0900
+++ ./include/net/xfrm.h	2020-07-27 02:02:43.861709739 +0900
@@ -243,7 +243,8 @@ struct xfrm_state {
 	long		saved_tmo;
 
 	/* Last used time */
-	RH_KABI_REPLACE(unsigned long lastused, time64_t lastused)
+	/*RH_KABI_REPLACE(unsigned long lastused, time64_t lastused)*/
+	time64_t lastused;
 
 	struct page_frag xfrag;
 
patch-xfrm-time64.patch (1,032 bytes)
kabe

kabe

2020-08-15 02:36

reporter   ~0037542

cherry-pick of 32bit patch from kernel.org upstream

patch-TSS_sysenter_sp0-TSS_entry2task_stack.patch (3,043 bytes)
commit ae2e565bc6aaee3f3db420fec5fdd39755c9813e
Author: Joerg Roedel <jroedel@suse.de>
Date:   Wed Jul 18 11:40:39 2018 +0200

    x86/entry/32: Rename TSS_sysenter_sp0 to TSS_entry2task_stack
    
    The stack address doesn't need to be stored in tss.sp0 if the stack is
    switched manually like on sysenter. Rename the offset so that it still
    makes sense when its location is changed in later patches.
    
    This stack will also be used for all kernel-entry points, not just
    sysenter. Reflect that and the fact that it is the offset to the task-stack
    location in the name as well.
    
    Signed-off-by: Joerg Roedel <jroedel@suse.de>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
    Tested-by: Pavel Machek <pavel@ucw.cz>
    Cc: "H . Peter Anvin" <hpa@zytor.com>
    Cc: linux-mm@kvack.org
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Andy Lutomirski <luto@kernel.org>
    Cc: Dave Hansen <dave.hansen@intel.com>
    Cc: Josh Poimboeuf <jpoimboe@redhat.com>
    Cc: Juergen Gross <jgross@suse.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Borislav Petkov <bp@alien8.de>
    Cc: Jiri Kosina <jkosina@suse.cz>
    Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
    Cc: Brian Gerst <brgerst@gmail.com>
    Cc: David Laight <David.Laight@aculab.com>
    Cc: Denys Vlasenko <dvlasenk@redhat.com>
    Cc: Eduardo Valentin <eduval@amazon.com>
    Cc: Greg KH <gregkh@linuxfoundation.org>
    Cc: Will Deacon <will.deacon@arm.com>
    Cc: aliguori@amazon.com
    Cc: daniel.gruss@iaik.tugraz.at
    Cc: hughd@google.com
    Cc: keescook@google.com
    Cc: Andrea Arcangeli <aarcange@redhat.com>
    Cc: Waiman Long <llong@redhat.com>
    Cc: "David H . Gutteridge" <dhgutteridge@sympatico.ca>
    Cc: joro@8bytes.org
    Link: https://lkml.kernel.org/r/1531906876-13451-3-git-send-email-joro@8bytes.org

Ported to CentOS 8.2

diff -up ./arch/x86/entry/entry_32.S.e2t ./arch/x86/entry/entry_32.S
--- ./arch/x86/entry/entry_32.S.e2t	2020-07-27 02:10:03.853259204 +0900
+++ ./arch/x86/entry/entry_32.S	2020-07-27 02:48:57.525336422 +0900
@@ -485,7 +485,7 @@ ENTRY(xen_sysenter_target)
  * 0(%ebp) arg6
  */
 ENTRY(entry_SYSENTER_32)
-	movl	TSS_sysenter_sp0(%esp), %esp
+	movl	TSS_entry2task_stack(%esp), %esp
 .Lsysenter_past_esp:
 	pushl	$__USER_DS		/* pt_regs->ss */
 	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
diff -up ./arch/x86/kernel/asm-offsets_32.c.e2t ./arch/x86/kernel/asm-offsets_32.c
--- ./arch/x86/kernel/asm-offsets_32.c.e2t	2020-06-02 04:12:31.000000000 +0900
+++ ./arch/x86/kernel/asm-offsets_32.c	2020-07-27 02:50:27.625118246 +0900
@@ -47,7 +47,9 @@ void foo(void)
 	BLANK();
 
 	/* Offset from the sysenter stack to tss.sp0 */
-	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+	/* Offset from the entry stack to task stack stored in TSS */
+	DEFINE(TSS_entry2task_stack,
+	       offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
 	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
 
 #ifdef CONFIG_STACKPROTECTOR
kabe

kabe

2020-08-15 02:37

reporter   ~0037543

backport

patch-backport-ktime_get_boottime_ns.patch (596 bytes)
Backport ktime_get_boottime_ns(void), as in
include/drm-backport/linux/timekeeping.h

diff -up ./include/linux/timekeeping.h.kgbn ./include/linux/timekeeping.h
--- ./include/linux/timekeeping.h.kgbn	2020-06-02 04:12:31.000000000 +0900
+++ ./include/linux/timekeeping.h	2020-07-30 11:42:41.674258796 +0900
@@ -116,6 +116,10 @@ static inline u64 ktime_get_real_ns(void
 	return ktime_to_ns(ktime_get_real());
 }
 
+#ifndef ktime_get_boottime_ns
+#define ktime_get_boottime_ns() ktime_get_boot_ns()
+#endif
+
 static inline u64 ktime_get_boot_ns(void)
 {
 	return ktime_to_ns(ktime_get_boottime());
kabe

kabe

2020-08-15 02:39

reporter   ~0037544

a bugfix which may not appear in current circumstances

patch-PROPERTY_ENTRY_STRING.patch (1,551 bytes)
commit c835b4417c18fc2868a38d4689274e3daed5c32b
Author: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date:   Wed Jan 23 17:44:16 2019 +0300

    device property: Fix the length used in PROPERTY_ENTRY_STRING()
    
    commit 2b6e492467c78183bb629bb0a100ea3509b615a5 upstream.
    
    With string type property entries we need to use
    sizeof(const char *) instead of the number of characters as
    the length of the entry.
    
    If the string was shorter then sizeof(const char *),
    attempts to read it would have failed with -EOVERFLOW. The
    problem has been hidden because all build-in string
    properties have had a string longer then 8 characters until
    now.
    
    Fixes: a85f42047533 ("device property: helper macros for property entry creation")
    Cc: 4.5+ <stable@vger.kernel.org> # 4.5+
    Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
    Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
    Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
    Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

diff --git a/include/linux/property.h b/include/linux/property.h
index ac8a1ebc4c1b..1a12364050d8 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -258,7 +258,7 @@ struct property_entry {
 #define PROPERTY_ENTRY_STRING(_name_, _val_)		\
 (struct property_entry) {				\
 	.name = _name_,					\
-	.length = sizeof(_val_),			\
+	.length = sizeof(const char *),			\
 	.type = DEV_PROP_STRING,			\
 	{ .value = { .str = _val_ } },			\
 }
kabe

kabe

2020-08-15 02:40

reporter   ~0037545

cherry-pick of 32bit fix from kernel.org

patch-sock-sk_stamp.patch (7,500 bytes)
commit 60f05dddf1eb5db3595e011f293eefa37cefae2e
Author: Deepa Dinamani <deepa.kernel@gmail.com>
Date:   Thu Dec 27 18:55:09 2018 -0800

    sock: Make sock->sk_stamp thread-safe
    
    [ Upstream commit 3a0ed3e9619738067214871e9cb826fa23b2ddb9 ]
    
    Al Viro mentioned (Message-ID
    <20170626041334.GZ10672@ZenIV.linux.org.uk>)
    that there is probably a race condition
    lurking in accesses of sk_stamp on 32-bit machines.
    
    sock->sk_stamp is of type ktime_t which is always an s64.
    On a 32 bit architecture, we might run into situations of
    unsafe access as the access to the field becomes non atomic.
    
    Use seqlocks for synchronization.
    This allows us to avoid using spinlocks for readers as
    readers do not need mutual exclusion.
    
    Another approach to solve this is to require sk_lock for all
    modifications of the timestamps. The current approach allows
    for timestamps to have their own lock: sk_stamp_lock.
    This allows for the patch to not compete with already
    existing critical sections, and side effects are limited
    to the paths in the patch.
    
    The addition of the new field maintains the data locality
    optimizations from
    commit 9115e8cd2a0c ("net: reorganize struct sock for better data
    locality")
    
    Note that all the instances of the sk_stamp accesses
    are either through the ioctl or the syscall recvmsg.
    
    Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>
    Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

(Ported to linux-4.18.0-80.11.2.el8_0.centos.plus)

diff -up ./include/net/sock.h.60f05dddf1eb5 ./include/net/sock.h
--- ./include/net/sock.h.60f05dddf1eb5	2019-09-15 19:14:11.000000000 +0900
+++ ./include/net/sock.h	2020-01-12 20:38:11.884065686 +0900
@@ -300,6 +300,7 @@ struct sock_common {
   *	@sk_filter: socket filtering instructions
   *	@sk_timer: sock cleanup timer
   *	@sk_stamp: time stamp of last packet received
+  *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
   *	@sk_tsflags: SO_TIMESTAMPING socket options
   *	@sk_tskey: counter to disambiguate concurrent tstamp requests
   *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
@@ -476,6 +477,9 @@ struct sock {
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
 	ktime_t			sk_stamp;
+#if BITS_PER_LONG==32
+	seqlock_t		sk_stamp_seq;
+#endif
 	u16			sk_tsflags;
 	u8			sk_shutdown;
 	u32			sk_tskey;
@@ -2321,6 +2325,34 @@ static inline void sk_drops_add(struct s
 	atomic_add(segs, &sk->sk_drops);
 }
 
+static inline ktime_t sock_read_timestamp(struct sock *sk)
+{
+#if BITS_PER_LONG==32
+	unsigned int seq;
+	ktime_t kt;
+
+	do {
+		seq = read_seqbegin(&sk->sk_stamp_seq);
+		kt = sk->sk_stamp;
+	} while (read_seqretry(&sk->sk_stamp_seq, seq));
+
+	return kt;
+#else
+	return sk->sk_stamp;
+#endif
+}
+
+static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
+{
+#if BITS_PER_LONG==32
+	write_seqlock(&sk->sk_stamp_seq);
+	sk->sk_stamp = kt;
+	write_sequnlock(&sk->sk_stamp_seq);
+#else
+	sk->sk_stamp = kt;
+#endif
+}
+
 void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 			   struct sk_buff *skb);
 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
@@ -2345,7 +2377,7 @@ sock_recv_timestamp(struct msghdr *msg,
 	     (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
 		__sock_recv_timestamp(msg, sk, skb);
 	else
-		sk->sk_stamp = kt;
+		sock_write_timestamp(sk, kt);
 
 	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
 		__sock_recv_wifi_status(msg, sk, skb);
@@ -2366,9 +2398,9 @@ static inline void sock_recv_ts_and_drop
 	if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
 		__sock_recv_ts_and_drops(msg, sk, skb);
 	else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
-		sk->sk_stamp = skb->tstamp;
+		sock_write_timestamp(sk, skb->tstamp);
 	else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
-		sk->sk_stamp = 0;
+		sock_write_timestamp(sk, 0);
 }
 
 void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
diff -up ./net/compat.c.60f05dddf1eb5 ./net/compat.c
--- ./net/compat.c.60f05dddf1eb5	2020-01-12 20:38:11.904065691 +0900
+++ ./net/compat.c	2020-01-12 20:40:46.817107258 +0900
@@ -468,12 +468,13 @@ int compat_sock_get_timestamp(struct soc
 	err = -ENOENT;
 	if (!sock_flag(sk, SOCK_TIMESTAMP))
 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
-	tv = ktime_to_timeval(sk->sk_stamp);
+	tv = ktime_to_timeval(sock_read_timestamp(sk));
 	if (tv.tv_sec == -1)
 		return err;
 	if (tv.tv_sec == 0) {
-		sk->sk_stamp = ktime_get_real();
-		tv = ktime_to_timeval(sk->sk_stamp);
+		ktime_t kt = ktime_get_real();
+		sock_write_timestamp(sk, kt);
+		tv = ktime_to_timeval(kt);
 	}
 	err = 0;
 	if (put_user(tv.tv_sec, &ctv->tv_sec) ||
@@ -496,12 +497,13 @@ int compat_sock_get_timestampns(struct s
 	err = -ENOENT;
 	if (!sock_flag(sk, SOCK_TIMESTAMP))
 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
-	ts = ktime_to_timespec(sk->sk_stamp);
+	ts = ktime_to_timespec(sock_read_timestamp(sk));
 	if (ts.tv_sec == -1)
 		return err;
 	if (ts.tv_sec == 0) {
-		sk->sk_stamp = ktime_get_real();
-		ts = ktime_to_timespec(sk->sk_stamp);
+		ktime_t kt = ktime_get_real();
+		sock_write_timestamp(sk, kt);
+		ts = ktime_to_timespec(kt);
 	}
 	err = 0;
 	if (put_user(ts.tv_sec, &ctv->tv_sec) ||
diff -up ./net/core/sock.c.60f05dddf1eb5 ./net/core/sock.c
--- ./net/core/sock.c.60f05dddf1eb5	2019-09-15 19:14:11.000000000 +0900
+++ ./net/core/sock.c	2020-01-12 20:42:49.167140088 +0900
@@ -2846,6 +2846,9 @@ void sock_init_data(struct socket *sock,
 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
 
 	sk->sk_stamp = SK_DEFAULT_STAMP;
+#if BITS_PER_LONG==32
+	seqlock_init(&sk->sk_stamp_seq);
+#endif
 	atomic_set(&sk->sk_zckey, 0);
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
@@ -2945,12 +2948,13 @@ int sock_get_timestamp(struct sock *sk,
 	struct timeval tv;
 	if (!sock_flag(sk, SOCK_TIMESTAMP))
 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
-	tv = ktime_to_timeval(sk->sk_stamp);
+	tv = ktime_to_timeval(sock_read_timestamp(sk));
 	if (tv.tv_sec == -1)
 		return -ENOENT;
 	if (tv.tv_sec == 0) {
-		sk->sk_stamp = ktime_get_real();
-		tv = ktime_to_timeval(sk->sk_stamp);
+		ktime_t kt = ktime_get_real();
+		sock_write_timestamp(sk, kt);
+		tv = ktime_to_timeval(kt);
 	}
 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
 }
@@ -2961,11 +2965,12 @@ int sock_get_timestampns(struct sock *sk
 	struct timespec ts;
 	if (!sock_flag(sk, SOCK_TIMESTAMP))
 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
-	ts = ktime_to_timespec(sk->sk_stamp);
+	ts = ktime_to_timespec(sock_read_timestamp(sk));
 	if (ts.tv_sec == -1)
 		return -ENOENT;
 	if (ts.tv_sec == 0) {
-		sk->sk_stamp = ktime_get_real();
+		ktime_t kt = ktime_get_real();
+		sock_write_timestamp(sk, kt);
 		ts = ktime_to_timespec(sk->sk_stamp);
 	}
 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
diff -up ./net/sunrpc/svcsock.c.60f05dddf1eb5 ./net/sunrpc/svcsock.c
--- ./net/sunrpc/svcsock.c.60f05dddf1eb5	2019-09-15 19:14:11.000000000 +0900
+++ ./net/sunrpc/svcsock.c	2020-01-12 20:38:11.960065706 +0900
@@ -574,7 +574,7 @@ static int svc_udp_recvfrom(struct svc_r
 		/* Don't enable netstamp, sunrpc doesn't
 		   need that much accuracy */
 	}
-	svsk->sk_sk->sk_stamp = skb->tstamp;
+	sock_write_timestamp(svsk->sk_sk, skb->tstamp);
 	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
 
 	len  = skb->len;
kabe

kabe

2020-08-15 02:42

reporter   ~0037546

cherry pick from kernel.org
mm: zero out the vma in vma_init()
I'm not sure if this patch is mandatory, but nevertheless applying it.

patch-zero-out-vma.patch (4,374 bytes)
commit a670468f5e0b5fad4db6e4d195f15915dc2a35c1
Author: Andrew Morton <akpm@linux-foundation.org>
Date:   Tue Aug 21 21:53:06 2018 -0700

    mm: zero out the vma in vma_init()
    
    Rather than in vm_area_alloc().  To ensure that the various oddball
    stack-based vmas are in a good state.  Some of the callers were zeroing
    them out, others were not.
    
    Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
    Cc: Russell King <rmk+kernel@arm.linux.org.uk>
    Cc: Dmitry Vyukov <dvyukov@google.com>
    Cc: Oleg Nesterov <oleg@redhat.com>
    Cc: Andrea Arcangeli <aarcange@redhat.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index d9c299133111..82ab015bf42b 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
  * atomic helpers. Insert it into the gate_vma so that it is visible
  * through ptrace and /proc/<pid>/mem.
  */
-static struct vm_area_struct gate_vma = {
-	.vm_start	= 0xffff0000,
-	.vm_end		= 0xffff0000 + PAGE_SIZE,
-	.vm_flags	= VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC,
-};
+static struct vm_area_struct gate_vma;
 
 static int __init gate_vma_init(void)
 {
 	vma_init(&gate_vma, NULL);
 	gate_vma.vm_page_prot = PAGE_READONLY_EXEC;
+	gate_vma.vm_start = 0xffff0000;
+	gate_vma.vm_end	= 0xffff0000 + PAGE_SIZE;
+	gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC;
 	return 0;
 }
 arch_initcall(gate_vma_init);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 346a146c7617..32920a10100e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	int i, freed = 0;
 	bool truncate_op = (lend == LLONG_MAX);
 
-	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
 	vma_init(&pseudo_vma, current->mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pagevec_init(&pvec);
@@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 	 * allocation routines.  If NUMA is configured, use page index
 	 * as input to create an allocation policy.
 	 */
-	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
 	vma_init(&pseudo_vma, mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pseudo_vma.vm_file = file;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a3cae495f9ce..3a4b87d1a59a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
 
+	memset(vma, 0, sizeof(*vma));
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5ee74c113381..8c760effa42e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep;
 
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
-	struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *vma;
 
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (vma)
 		vma_init(vma, mm);
 	return vma;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01f1a14facc4..4861ba738d6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 			goto put_new;
 
 		/* Create pseudo-vma that contains just the policy */
-		memset(&pvma, 0, sizeof(struct vm_area_struct));
 		vma_init(&pvma, NULL);
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
diff --git a/mm/shmem.c b/mm/shmem.c
index c48c79018a7c..fb04baacc9fa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
 		struct shmem_inode_info *info, pgoff_t index)
 {
 	/* Create a pseudo vma that just contains the policy */
-	memset(vma, 0, sizeof(*vma));
 	vma_init(vma, NULL);
 	/* Bias interleave by inode number to distribute better across nodes */
 	vma->vm_pgoff = index + info->vfs_inode.i_ino;
patch-zero-out-vma.patch (4,374 bytes)
kabe

kabe

2020-08-15 02:44

reporter   ~0037547

cherry-pick from kernel.org
linux/kernel.h: fix overflow for DIV_ROUND_UP_ULL

patch-DIV_ROUND_UP_ULL.patch (1,663 bytes)
commit 2656ee5a5ad59300bbe183d0833867a582910dcc
Author: Vinod Koul <vkoul@kernel.org>
Date:   Fri Jun 28 12:07:21 2019 -0700

    linux/kernel.h: fix overflow for DIV_ROUND_UP_ULL
    
    [ Upstream commit 8f9fab480c7a87b10bb5440b5555f370272a5d59 ]
    
    DIV_ROUND_UP_ULL adds the two arguments and then invokes
    DIV_ROUND_DOWN_ULL.  But on a 32bit system the addition of two 32 bit
    values can overflow.  DIV_ROUND_DOWN_ULL does it correctly and stashes
    the addition into a unsigned long long so cast the result to unsigned
    long long here to avoid the overflow condition.
    
    [akpm@linux-foundation.org: DIV_ROUND_UP_ULL must be an rval]
    Link: http://lkml.kernel.org/r/20190625100518.30753-1-vkoul@kernel.org
    Signed-off-by: Vinod Koul <vkoul@kernel.org>
    Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
    Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
    Cc: Randy Dunlap <rdunlap@infradead.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    Signed-off-by: Sasha Levin <sashal@kernel.org>

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3d83ebb302cf..f6f94e54ab96 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -118,7 +118,8 @@
 #define DIV_ROUND_DOWN_ULL(ll, d) \
 	({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })
 
-#define DIV_ROUND_UP_ULL(ll, d)		DIV_ROUND_DOWN_ULL((ll) + (d) - 1, (d))
+#define DIV_ROUND_UP_ULL(ll, d) \
+	DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d))
 
 #if BITS_PER_LONG == 32
 # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d)
kabe

kabe

2020-08-15 02:45

reporter   ~0037548

cherry-pick from kernel.org
The kernel won't start properly without this.

patch-BSS_MAIN.patch (2,009 bytes)
commit 3b51d71365e0801e19fe81b66b34f2f19935a9ed
Author: Sami Tolvanen <samitolvanen@google.com>
Date:   Mon Apr 15 09:49:56 2019 -0700

    x86/build/lto: Fix truncated .bss with -fdata-sections
    
    [ Upstream commit 6a03469a1edc94da52b65478f1e00837add869a3 ]
    
    With CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y, we compile the kernel with
    -fdata-sections, which also splits the .bss section.
    
    The new section, with a new .bss.* name, which pattern gets missed by the
    main x86 linker script which only expects the '.bss' name. This results
    in the discarding of the second part and a too small, truncated .bss
    section and an unhappy, non-working kernel.
    
    Use the common BSS_MAIN macro in the linker script to properly capture
    and merge all the generated BSS sections.
    
    Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
    Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
    Reviewed-by: Kees Cook <keescook@chromium.org>
    Cc: Borislav Petkov <bp@alien8.de>
    Cc: Kees Cook <keescook@chromium.org>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Nicholas Piggin <npiggin@gmail.com>
    Cc: Nick Desaulniers <ndesaulniers@google.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Thomas Gleixner <tglx@linutronix.de>
    Link: http://lkml.kernel.org/r/20190415164956.124067-1-samitolvanen@google.com
    [ Extended the changelog. ]
    Signed-off-by: Ingo Molnar <mingo@kernel.org>
    Signed-off-by: Sasha Levin <sashal@kernel.org>

(ported to linux-4.18.0-147.5.1.el8_1.centos.plus)

diff -up ./arch/x86/kernel/vmlinux.lds.S.bssmain ./arch/x86/kernel/vmlinux.lds.S
--- ./arch/x86/kernel/vmlinux.lds.S.bssmain	2020-03-07 20:53:20.744531788 +0900
+++ ./arch/x86/kernel/vmlinux.lds.S	2020-03-07 20:55:07.842228414 +0900
@@ -359,7 +359,7 @@ SECTIONS
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		__bss_start = .;
 		*(.bss..page_aligned)
-		*(.bss)
+		*(BSS_MAIN)
 		BSS_DECRYPTED
 		. = ALIGN(PAGE_SIZE);
 		__bss_stop = .;
patch-BSS_MAIN.patch (2,009 bytes)
kabe

kabe

2020-08-15 02:46

reporter   ~0037549

cherry-pick from kernel.org
x86/mm/pti: Make pti_clone_kernel_text() compile on 32 bit

patch-__end_rodata_aligned.patch (4,358 bytes)
commit 39d668e04edad25abe184fb329ce35a131146ee5
Author: Joerg Roedel <jroedel@suse.de>
Date:   Wed Jul 18 11:41:04 2018 +0200

    x86/mm/pti: Make pti_clone_kernel_text() compile on 32 bit
    
    The pti_clone_kernel_text() function references __end_rodata_hpage_align,
    which is only present on x86-64.  This makes sense as the end of the rodata
    section is not huge-page aligned on 32 bit.
    
    Nevertheless a symbol is required for the function that points at the right
    address for both 32 and 64 bit. Introduce __end_rodata_aligned for that
    purpose and use it in pti_clone_kernel_text().
    
    Signed-off-by: Joerg Roedel <jroedel@suse.de>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
    Tested-by: Pavel Machek <pavel@ucw.cz>
    Cc: "H . Peter Anvin" <hpa@zytor.com>
    Cc: linux-mm@kvack.org
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Andy Lutomirski <luto@kernel.org>
    Cc: Dave Hansen <dave.hansen@intel.com>
    Cc: Josh Poimboeuf <jpoimboe@redhat.com>
    Cc: Juergen Gross <jgross@suse.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Borislav Petkov <bp@alien8.de>
    Cc: Jiri Kosina <jkosina@suse.cz>
    Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
    Cc: Brian Gerst <brgerst@gmail.com>
    Cc: David Laight <David.Laight@aculab.com>
    Cc: Denys Vlasenko <dvlasenk@redhat.com>
    Cc: Eduardo Valentin <eduval@amazon.com>
    Cc: Greg KH <gregkh@linuxfoundation.org>
    Cc: Will Deacon <will.deacon@arm.com>
    Cc: aliguori@amazon.com
    Cc: daniel.gruss@iaik.tugraz.at
    Cc: hughd@google.com
    Cc: keescook@google.com
    Cc: Andrea Arcangeli <aarcange@redhat.com>
    Cc: Waiman Long <llong@redhat.com>
    Cc: "David H . Gutteridge" <dhgutteridge@sympatico.ca>
    Cc: joro@8bytes.org
    Link: https://lkml.kernel.org/r/1531906876-13451-28-git-send-email-joro@8bytes.org

(ported to 4.18.0-147.5.1.el8.centos.plus)

diff -up ./arch/x86/include/asm/sections.h.rodat ./arch/x86/include/asm/sections.h
--- ./arch/x86/include/asm/sections.h.rodat	2020-01-14 23:54:17.000000000 +0900
+++ ./arch/x86/include/asm/sections.h	2020-03-07 21:01:14.842615598 +0900
@@ -7,6 +7,7 @@
 
 extern char __brk_base[], __brk_limit[];
 extern struct exception_table_entry __stop___ex_table[];
+extern char __end_rodata_aligned[];
 
 #if defined(CONFIG_X86_64)
 extern char __end_rodata_hpage_align[];
diff -up ./arch/x86/kernel/vmlinux.lds.S.rodat ./arch/x86/kernel/vmlinux.lds.S
--- ./arch/x86/kernel/vmlinux.lds.S.rodat	2020-03-07 21:01:14.832615533 +0900
+++ ./arch/x86/kernel/vmlinux.lds.S	2020-03-07 21:05:25.579167858 +0900
@@ -55,11 +55,12 @@ jiffies_64 = jiffies;
  * so we can enable protection checks as well as retain 2MB large page
  * mappings for kernel text.
  */
-#define X64_ALIGN_RODATA_BEGIN	. = ALIGN(HPAGE_SIZE);
+#define X86_ALIGN_RODATA_BEGIN	. = ALIGN(HPAGE_SIZE);
 
-#define X64_ALIGN_RODATA_END					\
+#define X86_ALIGN_RODATA_END					\
 		. = ALIGN(HPAGE_SIZE);				\
-		__end_rodata_hpage_align = .;
+		__end_rodata_hpage_align = .;			\
+		__end_rodata_aligned = .;
 
 #define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
 #define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
@@ -83,8 +84,10 @@ jiffies_64 = jiffies;
 
 #else
 
-#define X64_ALIGN_RODATA_BEGIN
-#define X64_ALIGN_RODATA_END
+#define X86_ALIGN_RODATA_BEGIN
+#define X86_ALIGN_RODATA_END					\
+		. = ALIGN(PAGE_SIZE);				\
+		__end_rodata_aligned = .;
 
 #define ALIGN_ENTRY_TEXT_BEGIN
 #define ALIGN_ENTRY_TEXT_END
@@ -149,9 +152,9 @@ SECTIONS
 
 	/* .text should occupy whole number of pages */
 	. = ALIGN(PAGE_SIZE);
-	X64_ALIGN_RODATA_BEGIN
+	X86_ALIGN_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
-	X64_ALIGN_RODATA_END
+	X86_ALIGN_RODATA_END
 
 	/* Data */
 	.data : AT(ADDR(.data) - LOAD_OFFSET) {
diff -up ./arch/x86/mm/pti.c.rodat ./arch/x86/mm/pti.c
--- ./arch/x86/mm/pti.c.rodat	2020-01-14 23:54:17.000000000 +0900
+++ ./arch/x86/mm/pti.c	2020-03-07 21:01:14.843615604 +0900
@@ -502,7 +502,7 @@ void pti_clone_kernel_text(void)
 	 * clone the areas past rodata, they might contain secrets.
 	 */
 	unsigned long start = PFN_ALIGN(_text);
-	unsigned long end_clone  = (unsigned long)__end_rodata_hpage_align;
+	unsigned long end_clone  = (unsigned long)__end_rodata_aligned;
 	unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
 
 	if (!pti_kernel_image_global_ok())
kabe

kabe

2020-08-15 02:48

reporter   ~0037550

fix from https://bugzilla.kernel.org/show_bug.cgi?id=206181#c12
memory hot-add is planned to be disabled for x86_32 in the upstream kernel.org kernel, so
this is the last chance to enable it.

patch-hv_balloon-hotadd-panic.patch (1,369 bytes)
This fixes panic on Hyper-V, which occurs around 66 seconds after boot,
when there's memory pressure, and Hyper-V host tries to hot-add memory 
to guest.

Workaround: "hv_balloon.hot_add=0" kernel command line.

Since we're hot-adding then online-ing the page,
we shouldn't free the page.

Posted as https://bugzilla.kernel.org/show_bug.cgi?id=206181#c12

diff -up ./drivers/hv/hv_balloon.c.ha00 ./drivers/hv/hv_balloon.c
--- ./drivers/hv/hv_balloon.c.ha00	2020-01-14 23:54:17.000000000 +0900
+++ ./drivers/hv/hv_balloon.c	2020-03-07 21:12:44.446718500 +0900
@@ -692,7 +692,7 @@ static void hv_page_online_one(struct hv
 	/* This frame is currently backed; online the page. */
 	__online_page_set_limits(pg);
 	__online_page_increment_counters(pg);
-	__online_page_free(pg);
+	/*__online_page_free(pg);*/
 
 	lockdep_assert_held(&dm_device.ha_lock);
 	dm_device.num_pages_onlined++;
@@ -740,6 +740,8 @@ static void hv_mem_hot_add(unsigned long
 		dm_device.ha_waiting = !memhp_auto_online;
 
 		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
+
+		pr_debug("%s: calling add_memory(nid=%d, ((start_pfn=0x%lx) << PAGE_SHIFT)=0x%llx, (HA_CHUNK << PAGE_SHIFT)=%lu)\n", __func__, nid, start_pfn, ((unsigned long long)start_pfn << PAGE_SHIFT), ((unsigned long)HA_CHUNK << PAGE_SHIFT));
 		ret = add_memory(nid, PFN_PHYS((start_pfn)),
 				(HA_CHUNK << PAGE_SHIFT));
 
kabe

kabe

2020-08-15 02:50

reporter   ~0037551

See

Bug 206401 Summary: kernel panic on Hyper-V after 5 minutes due to memory hot-add
https://bugzilla.kernel.org/show_bug.cgi?id=206401

for rationale of this patch.

patch-bhe-hyperv-hotplug.patch (1,944 bytes)
See 

Bug 206401 Summary: kernel panic on Hyper-V after 5 minutes due to memory hot-add 
https://bugzilla.kernel.org/show_bug.cgi?id=206401

for rationale of this patch.

diff -up ./mm/memory_hotplug.c.ha00 ./mm/memory_hotplug.c
--- ./mm/memory_hotplug.c.ha00	2019-09-15 19:14:11.000000000 +0900
+++ ./mm/memory_hotplug.c	2020-02-21 11:15:28.372889966 +0900
@@ -820,15 +820,19 @@ static struct zone *default_kernel_zone_
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	int zid;
+	enum  zone_type default_zone = ZONE_NORMAL; /*9faf47bd*/
 
-	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+#ifdef CONFIG_HIGHMEM				/*9faf47bd*/
+	default_zone = ZONE_HIGHMEM;		/*9faf47bd*/
+#endif						/*9faf47bd*/
+	for (zid = 0; zid <= default_zone; zid++) { /*9faf47bd*/
 		struct zone *zone = &pgdat->node_zones[zid];
 
 		if (zone_intersects(zone, start_pfn, nr_pages))
 			return zone;
 	}
 
-	return &pgdat->node_zones[ZONE_NORMAL];
+	return &pgdat->node_zones[default_zone]; /*9faf47bd*/
 }
 
 static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
diff -up ./mm/sparse.c.ha00 ./mm/sparse.c
--- ./mm/sparse.c.ha00	2019-09-15 19:14:11.000000000 +0900
+++ ./mm/sparse.c	2020-02-21 11:18:04.615247199 +0900
@@ -594,16 +594,21 @@ static struct page *__kmalloc_section_me
 	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
 	if (page)
 		goto got_map_page;
+	pr_debug("%s: alloc_pages() returned 0x%px (should be 0), reverting to vmalloc(memmap_size=%lu)\n", __func__, page, memmap_size);
+	BUG_ON(page != 0);
 
 	ret = vmalloc(memmap_size);
+	pr_debug("%s: vmalloc(%lu) returned 0x%px\n", __func__, memmap_size, ret);
 	if (ret)
 		goto got_map_ptr;
 
 	return NULL;
 got_map_page:
 	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+	pr_debug("%s: allocated struct page *page=0x%px\n", __func__, page);
 got_map_ptr:
 
+	pr_debug("%s: returning struct page * =0x%px\n", __func__, ret);
 	return ret;
 }
 
kabe

kabe

2020-08-15 02:54

reporter   ~0037552

This is a fix for i915.ko from git://anongit.freedesktop.org/drm-tip
which has still not been ported to the RHEL codebase.
Without this fix, the kernel panics when you move (render) the mouse cursor.
i915.ko is also enabled in CentOS 8 x86_64, but it may not work on older hardware.

drm-i915-Wean-off-drm_pci_alloc-drm_pci_free-el8_2.patch (9,478 bytes)
commit c6790dc22312f592c1434577258b31c48c72d52a
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Sun Feb 2 15:39:34 2020 +0000

    drm/i915: Wean off drm_pci_alloc/drm_pci_free
    
    drm_pci_alloc and drm_pci_free are just very thin wrappers around
    dma_alloc_coherent, with a note that we should be removing them.
    Furthermore since
    
    commit de09d31dd38a50fdce106c15abd68432eebbd014
    Author: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
    Date:   Fri Jan 15 16:51:42 2016 -0800
    
        page-flags: define PG_reserved behavior on compound pages
    
        As far as I can see there's no users of PG_reserved on compound pages.
        Let's use PF_NO_COMPOUND here.
    
    drm_pci_alloc has been declared broken since it mixes GFP_COMP and
    SetPageReserved. Avoid this conflict by weaning ourselves off using the
    abstraction and using the dma functions directly.
    
    Reported-by: Taketo Kabe
    Closes: https://gitlab.freedesktop.org/drm/intel/issues/1027
    Fixes: de09d31dd38a ("page-flags: define PG_reserved behavior on compound pages")
    Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
    Cc: <stable@vger.kernel.org> # v4.5+
    Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
    Link: https://patchwork.freedesktop.org/patch/msgid/20200202153934.3899472-1-chris@chris-wilson.co.uk


Ported to CentOS 8.2 kernel by T.Kabe

diff -up ./drivers/gpu/drm/i915/display/intel_display.c.i915 ./drivers/gpu/drm/i915/display/intel_display.c
--- ./drivers/gpu/drm/i915/display/intel_display.c.i915	2020-06-02 04:12:31.000000000 +0900
+++ ./drivers/gpu/drm/i915/display/intel_display.c	2020-08-10 12:33:30.905928678 +0900
@@ -10361,7 +10361,7 @@ static u32 intel_cursor_base(const struc
 	u32 base;
 
 	if (INTEL_INFO(dev_priv)->display.cursor_needs_physical)
-		base = obj->phys_handle->busaddr;
+		base = sg_dma_address(obj->mm.pages->sgl);
 	else
 		base = intel_plane_ggtt_offset(plane_state);
 
diff -up ./drivers/gpu/drm/i915/gem/i915_gem_object_types.h.i915 ./drivers/gpu/drm/i915/gem/i915_gem_object_types.h
--- ./drivers/gpu/drm/i915/gem/i915_gem_object_types.h.i915	2020-06-02 04:12:31.000000000 +0900
+++ ./drivers/gpu/drm/i915/gem/i915_gem_object_types.h	2020-08-10 12:58:14.397467363 +0900
@@ -227,6 +227,18 @@ struct drm_i915_gem_object {
 		bool quirked:1;
 	} mm;
 
+	/** Breadcrumb of last rendering to the buffer.
+	 * There can only be one writer, but we allow for multiple readers.
+	 * If there is a writer that necessarily implies that all other
+	 * read requests are complete - but we may only be lazily clearing
+	 * the read requests. A read request is naturally the most recent
+	 * request on a ring, so we may have two different write and read
+	 * requests on one ring where the write request is older than the
+	 * read request. This allows for the CPU to read from an active
+	 * buffer by only waiting for the write to complete.
+	 */
+	struct reservation_object *resv;
+
 	/** References from framebuffers, locks out tiling changes. */
 	unsigned int framebuffer_references;
 
@@ -247,8 +259,7 @@ struct drm_i915_gem_object {
 		void *gvt_info;
 	};
 
-	/** for phys allocated objects */
-	struct drm_dma_handle *phys_handle;
+	struct reservation_object __builtin_resv;
 };
 
 static inline struct drm_i915_gem_object *
diff -up ./drivers/gpu/drm/i915/gem/i915_gem_phys.c.i915 ./drivers/gpu/drm/i915/gem/i915_gem_phys.c
--- ./drivers/gpu/drm/i915/gem/i915_gem_phys.c.i915	2020-08-10 11:36:47.435394473 +0900
+++ ./drivers/gpu/drm/i915/gem/i915_gem_phys.c	2020-08-10 12:29:07.582942074 +0900
@@ -20,88 +20,87 @@
 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 {
 	struct address_space *mapping = obj->base.filp->f_mapping;
-	struct drm_dma_handle *phys;
-	struct sg_table *st;
 	struct scatterlist *sg;
-	char *vaddr;
+	struct sg_table *st;
+	dma_addr_t dma;
+	void *vaddr;
+	void *dst;
 	int i;
-	int err;
 
 	if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 		return -EINVAL;
 
-	/* Always aligning to the object size, allows a single allocation
+	/*
+	 * Always aligning to the object size, allows a single allocation
 	 * to handle all possible callers, and given typical object sizes,
 	 * the alignment of the buddy allocation will naturally match.
 	 */
-	phys = drm_pci_alloc(obj->base.dev,
-			     roundup_pow_of_two(obj->base.size),
-			     roundup_pow_of_two(obj->base.size));
-	if (!phys)
+	vaddr = dma_alloc_coherent(&obj->base.dev->pdev->dev,
+				   roundup_pow_of_two(obj->base.size),
+				   &dma, GFP_KERNEL);
+	if (!vaddr)
 		return -ENOMEM;
 
-	vaddr = phys->vaddr;
+	st = kmalloc(sizeof(*st), GFP_KERNEL);
+	if (!st)
+		goto err_pci;
+
+	if (sg_alloc_table(st, 1, GFP_KERNEL))
+		goto err_st;
+
+	sg = st->sgl;
+	sg->offset = 0;
+	sg->length = obj->base.size;
+
+	sg_assign_page(sg, (struct page *)vaddr);
+	sg_dma_address(sg) = dma;
+	sg_dma_len(sg) = obj->base.size;
+
+	dst = vaddr;
 	for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 		struct page *page;
-		char *src;
+		void *src;
 
 		page = shmem_read_mapping_page(mapping, i);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto err_phys;
-		}
+		if (IS_ERR(page))
+			goto err_st;
 
 		src = kmap_atomic(page);
-		memcpy(vaddr, src, PAGE_SIZE);
-		drm_clflush_virt_range(vaddr, PAGE_SIZE);
+		memcpy(dst, src, PAGE_SIZE);
+		drm_clflush_virt_range(dst, PAGE_SIZE);
 		kunmap_atomic(src);
 
 		put_page(page);
-		vaddr += PAGE_SIZE;
+		dst += PAGE_SIZE;
 	}
 
 	i915_gem_chipset_flush(to_i915(obj->base.dev));
 
-	st = kmalloc(sizeof(*st), GFP_KERNEL);
-	if (!st) {
-		err = -ENOMEM;
-		goto err_phys;
-	}
-
-	if (sg_alloc_table(st, 1, GFP_KERNEL)) {
-		kfree(st);
-		err = -ENOMEM;
-		goto err_phys;
-	}
-
-	sg = st->sgl;
-	sg->offset = 0;
-	sg->length = obj->base.size;
-
-	sg_dma_address(sg) = phys->busaddr;
-	sg_dma_len(sg) = obj->base.size;
-
-	obj->phys_handle = phys;
-
 	__i915_gem_object_set_pages(obj, st, sg->length);
 
 	return 0;
 
-err_phys:
-	drm_pci_free(obj->base.dev, phys);
-
-	return err;
+err_st:
+	kfree(st);
+err_pci:
+	dma_free_coherent(&obj->base.dev->pdev->dev,
+			  roundup_pow_of_two(obj->base.size),
+			  vaddr, dma);
+	return -ENOMEM;
 }
 
 static void
 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 			       struct sg_table *pages)
 {
+	dma_addr_t dma = sg_dma_address(pages->sgl);
+	void *vaddr = sg_page(pages->sgl);
+
 	__i915_gem_object_release_shmem(obj, pages, false);
 
 	if (obj->mm.dirty) {
 		struct address_space *mapping = obj->base.filp->f_mapping;
-		char *vaddr = obj->phys_handle->vaddr;
+		void *src = vaddr;
 		int i;
 
 		for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
@@ -113,15 +112,16 @@ i915_gem_object_put_pages_phys(struct dr
 				continue;
 
 			dst = kmap_atomic(page);
-			drm_clflush_virt_range(vaddr, PAGE_SIZE);
-			memcpy(dst, vaddr, PAGE_SIZE);
+			drm_clflush_virt_range(src, PAGE_SIZE);
+			memcpy(dst, src, PAGE_SIZE);
 			kunmap_atomic(dst);
 
 			set_page_dirty(page);
 			if (obj->mm.madv == I915_MADV_WILLNEED)
 				mark_page_accessed(page);
 			put_page(page);
-			vaddr += PAGE_SIZE;
+
+			src += PAGE_SIZE;
 		}
 		obj->mm.dirty = false;
 	}
@@ -129,7 +129,9 @@ i915_gem_object_put_pages_phys(struct dr
 	sg_free_table(pages);
 	kfree(pages);
 
-	drm_pci_free(obj->base.dev, obj->phys_handle);
+	dma_free_coherent(&obj->base.dev->pdev->dev,
+			  roundup_pow_of_two(obj->base.size),
+			  vaddr, dma);
 }
 
 static void
diff -up ./drivers/gpu/drm/i915/i915_gem.c.i915 ./drivers/gpu/drm/i915/i915_gem.c
--- ./drivers/gpu/drm/i915/i915_gem.c.i915	2020-08-10 11:36:47.502394300 +0900
+++ ./drivers/gpu/drm/i915/i915_gem.c	2020-08-10 12:29:07.583942092 +0900
@@ -130,7 +130,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem
 		     struct drm_i915_gem_pwrite *args,
 		     struct drm_file *file)
 {
-	void *vaddr = obj->phys_handle->vaddr + args->offset;
+	void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset;
 	char __user *user_data = u64_to_user_ptr(args->data_ptr);
 
 	/* We manually control the domain here and pretend that it
@@ -844,10 +844,10 @@ i915_gem_pwrite_ioctl(struct drm_device
 		ret = i915_gem_gtt_pwrite_fast(obj, args);
 
 	if (ret == -EFAULT || ret == -ENOSPC) {
-		if (obj->phys_handle)
-			ret = i915_gem_phys_pwrite(obj, args, file);
-		else
+		if (i915_gem_object_has_struct_page(obj))
 			ret = i915_gem_shmem_pwrite(obj, args);
+		else
+			ret = i915_gem_phys_pwrite(obj, args, file);
 	}
 
 	i915_gem_object_unpin_pages(obj);
diff -up ./drivers/gpu/drm/i915/i915_gem_object.h.i915 ./drivers/gpu/drm/i915/i915_gem_object.h
--- ./drivers/gpu/drm/i915/i915_gem_object.h.i915	2020-08-10 11:36:47.556394161 +0900
+++ ./drivers/gpu/drm/i915/i915_gem_object.h	2020-08-10 12:29:07.583942092 +0900
@@ -289,9 +289,6 @@ struct drm_i915_gem_object {
 		void *gvt_info;
 	};
 
-	/** for phys allocated objects */
-	struct drm_dma_handle *phys_handle;
-
 	struct reservation_object __builtin_resv;
 };
 
diff -up ./drivers/gpu/drm/i915/intel_display.c.i915 ./drivers/gpu/drm/i915/intel_display.c
--- ./drivers/gpu/drm/i915/intel_display.c.i915	2020-08-10 11:36:47.579394102 +0900
+++ ./drivers/gpu/drm/i915/intel_display.c	2020-08-10 12:33:40.895117846 +0900
@@ -10070,7 +10070,7 @@ static u32 intel_cursor_base(const struc
 	u32 base;
 
 	if (INTEL_INFO(dev_priv)->display.cursor_needs_physical)
-		base = obj->phys_handle->busaddr;
+		base = sg_dma_address(obj->mm.pages->sgl);
 	else
 		base = intel_plane_ggtt_offset(plane_state);
 
kabe

kabe

2020-08-15 02:58

reporter   ~0037553

CONFIG_DEFERRED_STRUCT_PAGE_INIT doesn't play well on 32-bit; just disable it, as it makes the kernel panic.
It only provides a benefit on machines with more than roughly 10GB of memory, which is not possible on 32-bit non-PAE.

patch-889c695d-no-DEFERRED_STRUCT_PAGE_INIT-32bit.patch (3,904 bytes)
commit 889c695d419f19e5db52592dafbaf26143c36d1f
Author: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Date:   Thu Sep 20 12:22:30 2018 -0700

    mm: disable deferred struct page for 32-bit arches
    
    Deferred struct page init is needed only on systems with large amount of
    physical memory to improve boot performance.  32-bit systems do not
    benefit from this feature.
    
    Jiri reported a problem where deferred struct pages do not work well with
    x86-32:
    
    [    0.035162] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
    [    0.035725] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
    [    0.036269] Initializing CPU#0
    [    0.036513] Initializing HighMem for node 0 (00036ffe:0007ffe0)
    [    0.038459] page:f6780000 is uninitialized and poisoned
    [    0.038460] raw: ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff
    [    0.039509] page dumped because: VM_BUG_ON_PAGE(1 && PageCompound(page))
    [    0.040038] ------------[ cut here ]------------
    [    0.040399] kernel BUG at include/linux/page-flags.h:293!
    [    0.040823] invalid opcode: 0000 [#1] SMP PTI
    [    0.041166] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc1_pt_jiri #9
    [    0.041694] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014
    [    0.042496] EIP: free_highmem_page+0x64/0x80
    [    0.042839] Code: 13 46 d8 c1 e8 18 5d 83 e0 03 8d 04 c0 c1 e0 06 ff 80 ec 5f 44 d8 c3 8d b4 26 00 00 00 00 ba 08 65 28 d8 89 d8 e8 fc 71 02 00 <0f> 0b 8d 76 00 8d bc 27 00 00 00 00 ba d0 b1 26 d8 89 d8 e8 e4 71
    [    0.044338] EAX: 0000003c EBX: f6780000 ECX: 00000000 EDX: d856cbe8
    [    0.044868] ESI: 0007ffe0 EDI: d838df20 EBP: d838df00 ESP: d838defc
    [    0.045372] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210086
    [    0.045913] CR0: 80050033 CR2: 00000000 CR3: 18556000 CR4: 00040690
    [    0.046413] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
    [    0.046913] DR6: fffe0ff0 DR7: 00000400
    [    0.047220] Call Trace:
    [    0.047419]  add_highpages_with_active_regions+0xbd/0x10d
    [    0.047854]  set_highmem_pages_init+0x5b/0x71
    [    0.048202]  mem_init+0x2b/0x1e8
    [    0.048460]  start_kernel+0x1d2/0x425
    [    0.048757]  i386_start_kernel+0x93/0x97
    [    0.049073]  startup_32_smp+0x164/0x168
    [    0.049379] Modules linked in:
    [    0.049626] ---[ end trace 337949378db0abbb ]---
    
    We free highmem pages before their struct pages are initialized:
    
    mem_init()
     set_highmem_pages_init()
      add_highpages_with_active_regions()
       free_highmem_page()
        .. Access uninitialized struct page here..
    
    Because there is no reason to have this feature on 32-bit systems, just
    disable it.
    
    Link: http://lkml.kernel.org/r/20180831150506.31246-1-pavel.tatashin@microsoft.com
    Fixes: 2e3ca40f03bb ("mm: relax deferred struct page requirements")
    Signed-off-by: Pavel Tatashin <pavel.tatashin@microsoft.com>
    Reported-by: Jiri Slaby <jslaby@suse.cz>
    Acked-by: Michal Hocko <mhocko@suse.com>
    Cc: <stable@vger.kernel.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

(ported to linux-4.18.0-147.5.1.el8_1)
diff -up ./mm/Kconfig.889c695d ./mm/Kconfig
--- ./mm/Kconfig.889c695d	2020-04-25 14:33:40.177165049 +0900
+++ ./mm/Kconfig	2020-04-25 14:34:39.256262894 +0900
@@ -631,8 +631,9 @@ config DEFERRED_STRUCT_PAGE_INIT
 	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on NO_BOOTMEM
-	depends on !FLATMEM
+ 	depends on SPARSEMEM
 	depends on !NEED_PER_CPU_KM
+	depends on 64BIT
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
 	  single thread. On very large machines this can take a considerable
kabe

kabe

2020-08-15 03:00

reporter   ~0037554

code pullup of rtlwifi

patch-no-rate_control_send_low.patch (678 bytes)
Remaining instance of rate_control_send_low(),
erased in upstream 1e87fec9fa52a6f7c223998d6bfbd3464eb37e31 (2019/05/16)

diff -up ./drivers/staging/rtlwifi/rc.c.rcsl ./drivers/staging/rtlwifi/rc.c
--- ./drivers/staging/rtlwifi/rc.c.rcsl	2020-06-02 04:12:31.000000000 +0900
+++ ./drivers/staging/rtlwifi/rc.c	2020-07-30 20:47:31.695719976 +0900
@@ -162,9 +162,6 @@ static void rtl_get_rate(void *ppriv, st
 	u8 try_per_rate, i, rix;
 	bool not_data = !ieee80211_is_data(fc);
 
-	if (rate_control_send_low(sta, priv_sta, txrc))
-		return;
-
 	rix = _rtl_rc_get_highest_rix(rtlpriv, sta, skb, not_data);
 	try_per_rate = 1;
 	_rtl_rc_rate_set_series(rtlpriv, sta, &rates[0], txrc,
kabe

kabe

2020-08-15 03:01

reporter   ~0037555

Additional patch for an already-patched region;
without this, the kernel panics when the initrd starts.

patch-i686-sp1.patch (3,452 bytes)
Part of
a6b744f3ce (x86/entry/32: Load task stack from x86_tss.sp1 in SYSENTER handler x86_tss.sp0 will be used to point to the entry stack later to use it as a trampoline stack for other kernel entry points besides SYSENTER.)
is a prerequisite, because tss.x86_tss.sp0 had also been enabled for
x86/32bit in 45d7b25574.

Without this patch, kernel panics when initrd starts:

Welcome to CentOS Linux 8 (Core) dracut-049-27.git20190906.el8 (Initramfs)!

[    6.573748] systemd[1]: Set hostname to <centos8i686.localdomain>.
[    6.584926] Core dump to |/bin/false pipe failed
[    6.590457] BUG: unable to handle kernel NULL pointer dereference at 00000000
[    6.600065] *pde = 00000000
[    6.600240] Oops: 0010 [#1] SMP
[    6.600240] CPU: 0 PID: 176 Comm: systemd Not tainted 4.18.0-193.6.3.el8_2.centos.plus.i686 #1
[    6.600240] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008  12/07/2018
[    6.600240] EIP: 0x0
[    6.600240] Code: Bad RIP value.
[    6.600240] EAX: ff801fec EBX: fffffff6 ECX: 00000001 EDX: 00000001
[    6.600240] ESI: ff802020 EDI: 00000000 EBP: ff801eec ESP: ff801ec8
[    6.600240] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010046
[    6.600240] CR0: 80050033 CR2: fffffff6 CR3: 174bf000 CR4: 003406d0
[    6.600240] Call Trace:
[    6.600240]  <ENTRY_TRAMPOLINE>
[    6.600240]  ? __do_page_fault+0x460/0x460
[    6.600240]  ? do_signal+0x23/0x510
[    6.600240]  ? bad_area_access_error+0x81/0xa0
[    6.600240]  ? __do_page_fault+0x2a6/0x460
[    6.600240]  ? __do_page_fault+0x460/0x460
[    6.600240]  ? exit_to_usermode_loop+0x6a/0xd0
[    6.600240]  ? prepare_exit_to_usermode+0x57/0x90
[    6.600240]  ? resume_userspace+0xe/0x13
[    6.600240]  </ENTRY_TRAMPOLINE>
[    6.600240] Modules linked in:
[    6.600240] CR2: 0000000000000000
[    6.600240] ---[ end trace 64f164775f16e25e ]---
[    6.600240] EIP: 0x0
[    6.600240] Code: Bad RIP value.
[    6.600240] EAX: ff801fec EBX: fffffff6 ECX: 00000001 EDX: 00000001
[    6.600240] ESI: ff802020 EDI: 00000000 EBP: ff801eec ESP: d74c237c
[    6.600240] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010046
[    6.600240] CR0: 80050033 CR2: fffffff6 CR3: 174bf000 CR4: 003406d0
[    6.600240] Kernel panic - not syncing: Fatal exception
[    6.600240] Kernel Offset: 0x15800000 from 0xc1000000 (relocation range: 0xc0000000-0xf77fdfff)
[    6.600240] ---[ end Kernel panic - not syncing: Fatal exception ]---

kabe@

diff -up ./arch/x86/kernel/asm-offsets_32.c.sp1 ./arch/x86/kernel/asm-offsets_32.c
--- ./arch/x86/kernel/asm-offsets_32.c.sp1	2020-08-09 21:11:51.972560973 +0900
+++ ./arch/x86/kernel/asm-offsets_32.c	2020-08-09 21:13:52.133253688 +0900
@@ -46,10 +46,14 @@ void foo(void)
 	OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
 	BLANK();
 
-	/* Offset from the sysenter stack to tss.sp0 */
-	/* Offset from the entry stack to task stack stored in TSS */
+	/*
+	 * Offset from the entry stack to task stack stored in TSS. Kernel entry
+	 * happens on the per-cpu entry-stack, and the asm code switches to the
+	 * task-stack pointer stored in x86_tss.sp1, which is a copy of
+	 * task->thread.sp0 where entry code can find it.
+	 */
 	DEFINE(TSS_entry2task_stack,
-	       offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+	       offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
 	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
 
 #ifdef CONFIG_STACKPROTECTOR
patch-i686-sp1.patch (3,452 bytes)
kabe

kabe

2020-08-15 03:03

reporter   ~0037556

This is a patch for the kexec-tools package, not the kernel, but an updated kexec-tools is needed when the new kernel is installed.
Just a cherry-pick from upstream, to make it compile for 32-bit.

patch-2c9f26ed-build-multiboot2-i386.patch (2,998 bytes)
commit 2c9f26ed20a791a7df0182ba82e93abb52f5a615
Author: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date:   Mon Nov 18 12:52:15 2019 +1300

    kexec: build multiboot2 for i386
    
    This addresses the following compilation issues when building for i386.
    
     kexec/arch/i386/kexec-x86.c:39:22: error: 'multiboot2_x86_probe' undeclared here (not in a function); did you mean 'multiboot_x86_probe'?
       { "multiboot2-x86", multiboot2_x86_probe, multiboot2_x86_load,
                           ^~~~~~~~~~~~~~~~~~~~
                           multiboot_x86_probe
     kexec/arch/i386/kexec-x86.c:39:44: error: 'multiboot2_x86_load' undeclared here (not in a function); did you mean 'multiboot_x86_load'?
       { "multiboot2-x86", multiboot2_x86_probe, multiboot2_x86_load,
                                                 ^~~~~~~~~~~~~~~~~~~
                                                 multiboot_x86_load
     kexec/arch/i386/kexec-x86.c:40:4: error: 'multiboot2_x86_usage' undeclared here (not in a function); did you mean 'multiboot_x86_usage'?
         multiboot2_x86_usage },
         ^~~~~~~~~~~~~~~~~~~~
         multiboot_x86_usage
     make: *** [Makefile:114: kexec/arch/i386/kexec-x86.o] Error 1
     make: *** Waiting for unfinished jobs....
    
    Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
    Signed-off-by: Simon Horman <horms@verge.net.au>

diff --git a/kexec/arch/i386/Makefile b/kexec/arch/i386/Makefile
index 105cefd..f486103 100644
--- a/kexec/arch/i386/Makefile
+++ b/kexec/arch/i386/Makefile
@@ -7,6 +7,7 @@ i386_KEXEC_SRCS += kexec/arch/i386/kexec-elf-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-elf-rel-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-bzImage.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-multiboot-x86.c
+i386_KEXEC_SRCS += kexec/arch/i386/kexec-mb2-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-beoboot-x86.c
 i386_KEXEC_SRCS += kexec/arch/i386/kexec-nbi.c
 i386_KEXEC_SRCS += kexec/arch/i386/x86-linux-setup.c
@@ -14,7 +15,6 @@ i386_KEXEC_SRCS += kexec/arch/i386/crashdump-x86.c
 
 dist += kexec/arch/i386/Makefile $(i386_KEXEC_SRCS)			\
 	kexec/arch/i386/crashdump-x86.h					\
-	kexec/arch/i386/kexec-mb2-x86.c					\
 	kexec/arch/i386/kexec-x86.h					\
 	kexec/arch/i386/x86-linux-setup.h				\
 	kexec/arch/i386/include/arch/options.h
diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index 1b58c3b..0f941df 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -60,6 +60,11 @@ int multiboot_x86_load(int argc, char **argv, const char *buf, off_t len,
 	struct kexec_info *info);
 void multiboot_x86_usage(void);
 
+int multiboot2_x86_load(int argc, char **argv, const char *buf, off_t len,
+			struct kexec_info *info);
+void multiboot2_x86_usage(void);
+int multiboot2_x86_probe(const char *buf, off_t buf_len);
+
 int elf_x86_probe(const char *buf, off_t len);
 int elf_x86_load(int argc, char **argv, const char *buf, off_t len,
 	struct kexec_info *info);
kabe

kabe

2020-08-15 03:05

reporter   ~0037557

The .config file for i686 that I'm using.
Drop it in as SOURCES/kernel-i686.config.

kernel-i686.config (185,628 bytes)
kabe

kabe

2020-08-15 03:08

reporter   ~0037558

The SPECS/kernel.spec file I'm using.
It is based on kernel-plus.

kernel.spec.gz (682,061 bytes)
kabe

kabe

2020-08-15 03:12

reporter   ~0037559

That's all. That was a lot of patches already.
Thanks for your attention.
toracat

toracat

2020-08-16 14:07

manager   ~0037560

@kabe

Thanks for all the work.
I'll look into the .kernel.metadata issue.
toracat

toracat

2020-08-16 17:35

manager   ~0037561

@kabe

I've updated .kernel.metadata. However please note that the plus kernel version is 4.18.0-193.14.2.el8_2, not 14.3. This mismatch was due to a recent urgent build of the distro kernel. It will be back to normal in the next release.

Issue History

Date Modified Username Field Change
2020-08-15 02:27 kabe New Issue
2020-08-15 02:27 kabe Tag Attached: i386
2020-08-15 02:32 kabe File Added: i686-netlink_callback-s64-6_2.patch
2020-08-15 02:32 kabe Note Added: 0037539
2020-08-15 02:34 kabe File Added: patch-32-SAVE_ALL.patch
2020-08-15 02:34 kabe Note Added: 0037540
2020-08-15 02:35 kabe File Added: patch-xfrm-time64.patch
2020-08-15 02:35 kabe Note Added: 0037541
2020-08-15 02:36 kabe File Added: patch-TSS_sysenter_sp0-TSS_entry2task_stack.patch
2020-08-15 02:36 kabe Note Added: 0037542
2020-08-15 02:37 kabe File Added: patch-backport-ktime_get_boottime_ns.patch
2020-08-15 02:37 kabe Note Added: 0037543
2020-08-15 02:39 kabe File Added: patch-PROPERTY_ENTRY_STRING.patch
2020-08-15 02:39 kabe Note Added: 0037544
2020-08-15 02:40 kabe File Added: patch-sock-sk_stamp.patch
2020-08-15 02:40 kabe Note Added: 0037545
2020-08-15 02:42 kabe File Added: patch-zero-out-vma.patch
2020-08-15 02:42 kabe Note Added: 0037546
2020-08-15 02:44 kabe File Added: patch-DIV_ROUND_UP_ULL.patch
2020-08-15 02:44 kabe Note Added: 0037547
2020-08-15 02:45 kabe File Added: patch-BSS_MAIN.patch
2020-08-15 02:45 kabe Note Added: 0037548
2020-08-15 02:46 kabe File Added: patch-__end_rodata_aligned.patch
2020-08-15 02:46 kabe Note Added: 0037549
2020-08-15 02:48 kabe File Added: patch-hv_balloon-hotadd-panic.patch
2020-08-15 02:48 kabe Note Added: 0037550
2020-08-15 02:50 kabe File Added: patch-bhe-hyperv-hotplug.patch
2020-08-15 02:50 kabe Note Added: 0037551
2020-08-15 02:54 kabe File Added: drm-i915-Wean-off-drm_pci_alloc-drm_pci_free-el8_2.patch
2020-08-15 02:54 kabe Note Added: 0037552
2020-08-15 02:58 kabe File Added: patch-889c695d-no-DEFERRED_STRUCT_PAGE_INIT-32bit.patch
2020-08-15 02:58 kabe Note Added: 0037553
2020-08-15 03:00 kabe File Added: patch-no-rate_control_send_low.patch
2020-08-15 03:00 kabe Note Added: 0037554
2020-08-15 03:01 kabe File Added: patch-i686-sp1.patch
2020-08-15 03:01 kabe Note Added: 0037555
2020-08-15 03:03 kabe File Added: patch-2c9f26ed-build-multiboot2-i386.patch
2020-08-15 03:03 kabe Note Added: 0037556
2020-08-15 03:05 kabe File Added: kernel-i686.config
2020-08-15 03:05 kabe Note Added: 0037557
2020-08-15 03:08 kabe File Added: kernel.spec.gz
2020-08-15 03:08 kabe Note Added: 0037558
2020-08-15 03:12 kabe Note Added: 0037559
2020-08-16 13:59 toracat Status new => acknowledged
2020-08-16 14:07 toracat Note Added: 0037560
2020-08-16 17:35 toracat Note Added: 0037561
2020-08-16 17:42 toracat Relationship added related to 0017195