From b845b517b5e3706a3729f6ea83b88ab85f0725b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Aug 2008 21:47:09 +0200 Subject: [PATCH 01/17] printk: robustify printk Avoid deadlocks against rq->lock and xtime_lock by deferring the klogd wakeup by polling from the timer tick. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 4 ++++ kernel/printk.c | 19 +++++++++++++++++-- kernel/time/tick-sched.c | 2 +- kernel/timer.c | 1 + 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index aaa998f65c7..113ac8d0425 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -200,6 +200,8 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +extern void printk_tick(void); +extern int printk_needs_cpu(int); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -211,6 +213,8 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline void printk_tick(void) { } +static inline int printk_needs_cpu(int) { return 0; } #endif extern void asmlinkage __attribute__((format(printf, 1, 2))) diff --git a/kernel/printk.c b/kernel/printk.c index b51b1567bb5..655cc2ca10c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -982,10 +982,25 @@ int is_console_locked(void) return console_locked; } +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) +{ + if (__get_cpu_var(printk_pending)) { + __get_cpu_var(printk_pending) = 0; + wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + return per_cpu(printk_pending, cpu); +} + void wake_up_klogd(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) - wake_up_interruptible(&log_wait); + if (waitqueue_active(&log_wait)) + __get_cpu_var(printk_pending) = 1; } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe4..c13d4f18237 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -255,7 +255,7 @@ void tick_nohz_stop_sched_tick(int inidle) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - if (rcu_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) delta_jiffies = 1; /* * Do not stop the tick, if we are only one off diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1f159..510fe69351c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -978,6 +978,7 @@ void update_process_times(int user_tick) run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } From ced9cd40ac14111befd6b0c73ec90106c22a3fd7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 Aug 2008 14:38:12 +0200 Subject: [PATCH 02/17] printk: robustify printk, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: include/linux/kernel.h: In function ‘printk_needs_cpu': include/linux/kernel.h:217: error: parameter name omitted Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 113ac8d0425..3652a456412 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -200,8 +200,6 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); -extern void printk_tick(void); -extern int printk_needs_cpu(int); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -213,10 +211,11 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } -static inline void printk_tick(void) { } -static inline int printk_needs_cpu(int) { return 0; } #endif +extern int printk_needs_cpu(int cpu); +extern void printk_tick(void); + extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); From fa33507a22623b3bd543b15a21c362cf364b6cff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Aug 2008 09:31:26 +0200 Subject: [PATCH 03/17] printk: robustify printk, fix #2 Dmitry Adamushko reported: > [*] btw., with DEBUG being enabled, pr_debug() generates [1] when > debug_smp_processor_id() is used (CONFIG_DEBUG_PREEMPT). > > the problem seems to be caused by the following commit: > commit b845b517b5e3706a3729f6ea83b88ab85f0725b0 > Author: Peter Zijlstra > Date: Fri Aug 8 21:47:09 2008 +0200 > > printk: robustify printk > > > wake_up_klogd() -> __get_cpu_var() -> smp_processor_id() > > and that's being called from release_console_sem() which is, in turn, > said to be "may be called from any context" [2] > > and in this case, it seems to be called from some non-preemptible > context (although, it can't be printk()... > although, I haven't looked carefully yet). > > Provided [2], __get_cpu_var() is perhaps not the right solution there. > > > [1] > > [ 7697.942005] BUG: using smp_processor_id() in preemptible [00000000] code: syslogd/3542 > [ 7697.942005] caller is wake_up_klogd+0x1b/0x50 > [ 7697.942005] Pid: 3542, comm: syslogd Not tainted 2.6.27-rc3-tip-git #2 > [ 7697.942005] Call Trace: > [ 7697.942005] [] debug_smp_processor_id+0xe8/0xf0 > [ 7697.942005] [] wake_up_klogd+0x1b/0x50 > [ 7697.942005] [] release_console_sem+0x1e7/0x200 > [ 7697.942005] [] do_con_write+0xb7/0x1f30 > [ 7697.942005] [] ? show_trace+0x10/0x20 > [ 7697.942005] [] ? dump_stack+0x72/0x80 > [ 7697.942005] [] ? __ratelimit+0xbd/0xe0 > [ 7697.942005] [] ? debug_smp_processor_id+0xe8/0xf0 > [ 7697.942005] [] ? wake_up_klogd+0x1b/0x50 > [ 7697.942005] [] ? release_console_sem+0x1e7/0x200 > [ 7697.942005] [] con_write+0x19/0x30 > [ 7697.942005] [] write_chan+0x276/0x3c0 > [ 7697.942005] [] ? default_wake_function+0x0/0x10 > [ 7697.942005] [] ? _spin_lock_irqsave+0x22/0x50 > [ 7697.942005] [] tty_write+0x194/0x260 > [ 7697.942005] [] ? write_chan+0x0/0x3c0 > [ 7697.942005] [] redirected_tty_write+0xa4/0xb0 > [ 7697.942005] [] ? redirected_tty_write+0x0/0xb0 > [ 7697.942005] [] do_loop_readv_writev+0x52/0x80 > [ 7697.942005] [] do_readv_writev+0x1bd/0x1d0 > [ 7697.942005] [] vfs_writev+0x39/0x60 > [ 7697.942005] [] sys_writev+0x50/0x90 > [ 7697.942005] [] system_call_fastpath+0x16/0x1b Signed-off-by: Peter Zijlstra Reported-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index 655cc2ca10c..57e9cd7a958 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1000,7 +1000,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - __get_cpu_var(printk_pending) = 1; + __raw_get_cpu_var(printk_pending) = 1; } /** From 1fa63a817d27af7dc0d5ed454eb8fe5dec65fac7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 20 Aug 2008 14:40:55 +0200 Subject: [PATCH 04/17] printk: robustify printk, update comment Remove the comment describing the possibility of printk() deadlocking on runqueue lock. Signed-off-by: Jiri Kosina Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/printk.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 57e9cd7a958..6f27c6a4bdc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -577,9 +577,6 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output From 1cf44baad76b6f20f95ece397c6f643320aa44c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Sep 2008 21:26:06 +0200 Subject: [PATCH 05/17] IO resources: fix/remove printk Andrew Morton noticed that the printk in kernel/resource.c was buggy: | start and end have type resource_size_t. Such types CANNOT be printed | unless cast to a known type. | | Because there is a %s following an incorrect %lld, the above code will | crash the machine. ... and it's probably quite unneeded as well, so remove it. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/resource.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 414d6fc9131..fc59dcc4795 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -549,13 +549,9 @@ static void __init __reserve_region_with_split(struct resource *root, } if (!res) { - printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", - conflict->name, conflict->start, conflict->end, - name, start, end); - /* failed, split and try again */ - /* conflict coverred whole area */ + /* conflict covered whole area */ if (conflict->start <= start && conflict->end >= end) return; From 978b0116cd225682a29e3d1d5010319bf2de32c2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 6 Sep 2008 20:04:36 +0200 Subject: [PATCH 06/17] softirq: allocate less vectors We don't need whole 32 of them, only NR_SOFTIRQS. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 2 ++ kernel/softirq.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 58ff4e74b2f..54b3623434e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -252,6 +252,8 @@ enum HRTIMER_SOFTIRQ, #endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ + + NR_SOFTIRQS }; /* softirq mask and active fields moved to irq_cpustat_t in diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f266a6b..82e32aadedd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -46,7 +46,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; EXPORT_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); From 600715dcdf567c86f8b2c6173fcfb4b873e25a19 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:45 -0700 Subject: [PATCH 07/17] generic: add phys_addr_t for holding physical addresses Add a kernel-wide "phys_addr_t" which is guaranteed to be able to hold any physical address. By default it equals the word size of the architecture, but a 32-bit architecture can set ARCH_PHYS_ADDR_T_64BIT if it needs a 64-bit phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/powerpc/Kconfig | 3 +++ arch/powerpc/include/asm/types.h | 7 ------- arch/x86/Kconfig | 3 +++ include/asm-x86/page_32.h | 2 -- include/asm-x86/page_64.h | 1 - include/linux/types.h | 6 ++++++ mm/Kconfig | 3 +++ 7 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 587da5e0990..f5f83ee6041 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -22,6 +22,9 @@ config WORD_SIZE config PPC_MERGE def_bool y +config ARCH_PHYS_ADDR_T_64BIT + def_bool PPC64 || PHYS_64BIT + config MMU bool default y diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index d3374bc865b..c646f34c4e8 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -48,13 +48,6 @@ typedef struct { typedef __vector128 vector128; -/* Physical address used by some IO functions */ -#if defined(CONFIG_PPC64) || defined(CONFIG_PHYS_64BIT) -typedef u64 phys_addr_t; -#else -typedef u32 phys_addr_t; -#endif - #ifdef __powerpc64__ typedef u64 dma_addr_t; #else diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..a0ffb5188c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -932,6 +932,9 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config ARCH_PHYS_ADDR_T_64BIT + def_bool X86_64 || X86_PAE + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index ab8528793f0..d0e58605a52 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -33,7 +33,6 @@ typedef u64 pmdval_t; typedef u64 pudval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; -typedef u64 phys_addr_t; typedef union { struct { @@ -54,7 +53,6 @@ typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; -typedef unsigned long phys_addr_t; typedef union { pteval_t pte; diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index c6916c83e6b..2456fbf2d7d 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -79,7 +79,6 @@ typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; -typedef unsigned long phys_addr_t; typedef struct page *pgtable_t; diff --git a/include/linux/types.h b/include/linux/types.h index d4a9ce6e276..022c668496d 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -197,6 +197,12 @@ typedef u64 resource_size_t; typedef u32 resource_size_t; #endif +#ifdef CONFIG_PHYS_ADDR_T_64BIT +typedef u64 phys_addr_t; +#else +typedef u32 phys_addr_t; +#endif + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; diff --git a/mm/Kconfig b/mm/Kconfig index 0bd9c2dbb2a..91ee3922510 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA From 947d0496cf3e12ebfa70b3eaf561c25403247ce9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:48 -0700 Subject: [PATCH 08/17] generic: make PFN_PHYS explicitly return phys_addr_t PFN_PHYS, as its name suggests, turns a pfn into a physical address. However, it is a macro which just operates on its argument without modifying its type. pfns are typed unsigned long, but an unsigned long may not be long enough to hold a physical address (32-bit systems with more than 32 bits of physcial address). Make sure we cast to phys_addr_t to return a complete result. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/m32r/mm/discontig.c | 4 ++-- include/asm-x86/xen/page.h | 4 ++-- include/linux/pfn.h | 6 +++++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index cbc3c4c5456..7daf897292c 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -111,9 +111,9 @@ unsigned long __init setup_memory(void) initrd_start, INITRD_SIZE); } else { printk("initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + "(0x%08lx > 0x%08llx)\ndisabling initrd\n", INITRD_START + INITRD_SIZE, - PFN_PHYS(max_low_pfn)); + (unsigned long long)PFN_PHYS(max_low_pfn)); initrd_start = 0; } diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index 7b3835d3b77..de9170b537a 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -76,13 +76,13 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) static inline xmaddr_t phys_to_machine(xpaddr_t phys) { unsigned offset = phys.paddr & ~PAGE_MASK; - return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); + return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); } static inline xpaddr_t machine_to_phys(xmaddr_t machine) { unsigned offset = machine.maddr & ~PAGE_MASK; - return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); + return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); } /* diff --git a/include/linux/pfn.h b/include/linux/pfn.h index bb01f8b92b5..7646637221f 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -1,9 +1,13 @@ #ifndef _LINUX_PFN_H_ #define _LINUX_PFN_H_ +#ifndef __ASSEMBLY__ +#include +#endif + #define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) +#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT) #endif From 8308c54d7e312f7a03e2ce2057d0837e6fe3843f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:50 -0700 Subject: [PATCH 09/17] generic: redefine resource_size_t as phys_addr_t There's no good reason why a resource_size_t shouldn't just be a physical address, so simply redefine it in terms of phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/Kconfig.cputype | 1 - arch/powerpc/sysdev/ppc4xx_pci.c | 16 ++++++---------- arch/x86/Kconfig | 1 - arch/x86/kernel/e820.c | 4 +--- drivers/pci/setup-bus.c | 9 ++++----- include/linux/types.h | 8 ++------ 6 files changed, 13 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7f651273386..be852fd407a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -135,7 +135,6 @@ config PTE_64BIT config PHYS_64BIT bool 'Large physical address support' if E500 depends on 44x || E500 - select RESOURCES_64BIT default y if 44x ---help--- This option enables kernel support for larger than 32-bit physical diff --git a/arch/powerpc/sysdev/ppc4xx_pci.c b/arch/powerpc/sysdev/ppc4xx_pci.c index fb368dfde5d..e8a76d9539d 100644 --- a/arch/powerpc/sysdev/ppc4xx_pci.c +++ b/arch/powerpc/sysdev/ppc4xx_pci.c @@ -41,13 +41,10 @@ extern unsigned long total_memory; #define U64_TO_U32_LOW(val) ((u32)((val) & 0x00000000ffffffffULL)) #define U64_TO_U32_HIGH(val) ((u32)((val) >> 32)) -#ifdef CONFIG_RESOURCES_64BIT -#define RES_TO_U32_LOW(val) U64_TO_U32_LOW(val) -#define RES_TO_U32_HIGH(val) U64_TO_U32_HIGH(val) -#else -#define RES_TO_U32_LOW(val) (val) -#define RES_TO_U32_HIGH(val) (0) -#endif +#define RES_TO_U32_LOW(val) \ + ((sizeof(resource_size_t) > sizeof(u32)) ? U64_TO_U32_LOW(val) : (val)) +#define RES_TO_U32_HIGH(val) \ + ((sizeof(resource_size_t) > sizeof(u32)) ? U64_TO_U32_HIGH(val) : (0)) static inline int ppc440spe_revA(void) { @@ -145,12 +142,11 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, /* Use that */ res->start = pci_addr; -#ifndef CONFIG_RESOURCES_64BIT /* Beware of 32 bits resources */ - if ((pci_addr + size) > 0x100000000ull) + if (sizeof(resource_size_t) == sizeof(u32) && + (pci_addr + size) > 0x100000000ull) res->end = 0xffffffff; else -#endif res->end = res->start + size - 1; break; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a0ffb5188c8..b4e1875f986 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -925,7 +925,6 @@ config X86_PAE def_bool n prompt "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G - select RESOURCES_64BIT help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 66e48aa2dd1..477f4bb7e55 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1276,12 +1276,10 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 1aad599816f..f250a90ee45 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -378,11 +378,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long align = 0; min_align = 0; for (order = 0; order <= max_order; order++) { -#ifdef CONFIG_RESOURCES_64BIT - resource_size_t align1 = 1ULL << (order + 20); -#else - resource_size_t align1 = 1U << (order + 20); -#endif + resource_size_t align1 = 1; + + align1 <<= (order + 20); + if (!align) min_align = align1; else if (ALIGN(align + min_align, min_align) < align1) diff --git a/include/linux/types.h b/include/linux/types.h index 022c668496d..f24f7beb47d 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -191,18 +191,14 @@ typedef __u32 __bitwise __wsum; #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; -#ifdef CONFIG_RESOURCES_64BIT -typedef u64 resource_size_t; -#else -typedef u32 resource_size_t; -#endif - #ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 phys_addr_t; #else typedef u32 phys_addr_t; #endif +typedef phys_addr_t resource_size_t; + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; From 9a22b6e76ba75fa0f3963cdec7829156d00a7173 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 18 Sep 2008 12:50:18 +0200 Subject: [PATCH 10/17] dmi scan: warn about too early calls to dmi_check_system() It happened to me recently that i added a dmi_check_system() quirk in a too early codepath, and it was silently ignored because all the DMI tables and strings were still empty. As this situation is clearly a programming error / kernel bug, warn when it happens, instead of silently ignoring quirks. Signed-off-by: Ingo Molnar --- drivers/firmware/dmi_scan.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 455575be356..3e526b6d00c 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -15,6 +15,11 @@ */ static char dmi_empty_string[] = " "; +/* + * Catch too early calls to dmi_check_system(): + */ +static int dmi_initialized; + static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s) { const u8 *bp = ((u8 *) dm) + dm->length; @@ -366,7 +371,7 @@ void __init dmi_scan_machine(void) if (efi_enabled) { if (efi.smbios == EFI_INVALID_TABLE_ADDR) - goto out; + goto error; /* This is called as a core_initcall() because it isn't * needed during early boot. This also means we can @@ -374,13 +379,13 @@ void __init dmi_scan_machine(void) */ p = dmi_ioremap(efi.smbios, 32); if (p == NULL) - goto out; + goto error; rc = dmi_present(p + 0x10); /* offset of _DMI_ string */ dmi_iounmap(p, 32); if (!rc) { dmi_available = 1; - return; + goto out; } } else { @@ -391,19 +396,22 @@ void __init dmi_scan_machine(void) */ p = dmi_ioremap(0xF0000, 0x10000); if (p == NULL) - goto out; + goto error; for (q = p; q < p + 0x10000; q += 16) { rc = dmi_present(q); if (!rc) { dmi_available = 1; dmi_iounmap(p, 0x10000); - return; + goto out; } } dmi_iounmap(p, 0x10000); } - out: printk(KERN_INFO "DMI not present or invalid.\n"); + error: + printk(KERN_INFO "DMI not present or invalid.\n"); + out: + dmi_initialized = 1; } /** @@ -424,6 +432,8 @@ int dmi_check_system(const struct dmi_system_id *list) int i, count = 0; const struct dmi_system_id *d = list; + WARN(!dmi_initialized, KERN_ERR "dmi check: not initialized yet.\n"); + while (d->ident) { for (i = 0; i < ARRAY_SIZE(d->matches); i++) { int s = d->matches[i].slot; From 3b7ecb5d2ffde82efd1b1bcc6780dc8a019acf02 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 19 Sep 2008 01:56:23 -0700 Subject: [PATCH 11/17] softlockup: Documentation/sysctl/kernel.txt: fix softlockup_thresh description - s/s/seconds/ - s/10 seconds/60 seconds/ - Mention the zero-disables-it feature. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 276a7e63782..e1ff0d920a5 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -351,9 +351,10 @@ kernel. This value defaults to SHMMAX. softlockup_thresh: -This value can be used to lower the softlockup tolerance -threshold. The default threshold is 10s. If a cpu is locked up -for 10s, the kernel complains. Valid values are 1-60s. +This value can be used to lower the softlockup tolerance threshold. The +default threshold is 60 seconds. If a cpu is locked up for 60 seconds, +the kernel complains. Valid values are 1-60 seconds. Setting this +tunable to zero will disable the softlockup detection altogether. ============================================================== From 379daf6290814e41f14880094b7b773640df2461 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Sep 2008 18:43:34 -0700 Subject: [PATCH 12/17] IO resources, x86: ioremap sanity check to catch mapping requests exceeding the BAR sizes Go through the iomem resource tree to check if any of the ioremap() requests span more than any slot in the iomem resource tree and do a WARN_ON() if we hit this check. This will raise a red-flag, if some driver is mapping more than what is needed. And hopefully identify possible corruptions much earlier. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 ++++++ include/linux/ioport.h | 1 + kernel/resource.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae..c818b45bd07 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -149,6 +149,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, if (is_ISA_range(phys_addr, last_addr)) return (__force void __iomem *)phys_to_virt(phys_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ON(iomem_map_sanity_check(phys_addr, size)); + /* * Don't allow anybody to remap normal RAM that we're using.. */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index fded376b94e..01712cf1a38 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -169,6 +169,7 @@ extern struct resource * __devm_request_region(struct device *dev, extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); +extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index fc59dcc4795..1d003a50ee1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -827,3 +827,36 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + addr, addr + size - 1, p->start, p->end, p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} From 13eb83754b40bf01dc84e52a08d4196d1b719a0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Sep 2008 10:10:12 +0200 Subject: [PATCH 13/17] IO resources, x86: ioremap sanity check to catch mapping requests exceeding, fix fix this build error: kernel/resource.c: In function 'iomem_map_sanity_check': kernel/resource.c:842: error: implicit declaration of function 'r_next' kernel/resource.c:842: warning: assignment makes pointer from integer without a cast r_next() was only available if CONFIG_PROCFS was enabled. and fix this build warning: kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 2 has type 'resource_size_t' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'long unsigned int' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 4 has type 'resource_size_t' kernel/resource.c:855: warning: format '%llx' expects type 'long long unsigned int', but argument 5 has type 'resource_size_t' resource_t can be 32 bits. Signed-off-by: Ingo Molnar --- kernel/resource.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 1d003a50ee1..7797dae85b5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { @@ -852,7 +852,11 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) continue; printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", - addr, addr + size - 1, p->start, p->end, p->name); + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); err = -1; break; } From 15160716eea5591eb31f40fd4dba56d83bea4209 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Sep 2008 11:40:53 +0200 Subject: [PATCH 14/17] x86, pci-hotplug, calgary / rio: fix EBDA ioremap() IO resource and ioremap debugging uncovered this ioremap() done by drivers/pci/hotplug/ibmphp_ebda.c: initcall pci_hotplug_init+0x0/0x41 returned 0 after 3 msecs calling ibmphp_init+0x0/0x360 @ 1 ibmphpd: IBM Hot Plug PCI Controller Driver version: 0.6 resource map sanity check conflict: 0x9f800 0xaf5e7 0x9f800 0x9ffff reserved ------------[ cut here ]------------ WARNING: at arch/x86/mm/ioremap.c:175 __ioremap_caller+0x5c/0x226() Pid: 1, comm: swapper Not tainted 2.6.27-rc7-tip-00914-g347b10f-dirty #36038 [] warn_on_slowpath+0x41/0x68 [] ? __lock_acquire+0x9ba/0xa7f [] ? do_flush_tlb_all+0x0/0x59 [] ? smp_call_function_mask+0x74/0x17d [] ? do_flush_tlb_all+0x0/0x59 [] ? printk+0x1a/0x1c [] ? iomem_map_sanity_check+0x82/0x8c [] ? _read_unlock+0x22/0x25 [] ? iomem_map_sanity_check+0x82/0x8c [] ? trace_hardirqs_off+0xb/0xd [] __ioremap_caller+0x5c/0x226 [] ? trace_hardirqs_on+0xb/0xd [] ? iounmap+0x9d/0xa5 [] ioremap_nocache+0x15/0x17 [] ? ioremap+0xd/0xf [] ioremap+0xd/0xf [] ibmphp_access_ebda+0x60/0xa0e [] ibmphp_init+0xb5/0x360 [] do_one_initcall+0x57/0x138 [] ? ibmphp_init+0x0/0x360 [] ? trace_hardirqs_on+0xb/0xd [] ? __queue_work+0x2b/0x30 [] ? ibmphp_init+0x0/0x360 [] kernel_init+0x17b/0x1e2 [] ? kernel_init+0x0/0x1e2 [] kernel_thread_helper+0x7/0x10 ======================= ---[ end trace a7919e7f17c0a725 ]--- initcall ibmphp_init+0x0/0x360 returned -19 after 144 msecs calling zt5550_init+0x0/0x6a @ 1 the problem is this code: io_mem = ioremap (ebda_seg<<4, 65000); it assumes that the EBDA is 65000 bytes. But BIOS EBDA pointers are at most 1K large. _if_ the Rio code truly extends upon the customary EBDA size it needs to iounmap() this memory and ioremap() it larger, once it knows it from the generic descriptors that a Rio system is around. Signed-off-by: Ingo Molnar --- drivers/pci/hotplug/ibmphp_ebda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/ibmphp_ebda.c b/drivers/pci/hotplug/ibmphp_ebda.c index 8467d028732..7d27631e6e6 100644 --- a/drivers/pci/hotplug/ibmphp_ebda.c +++ b/drivers/pci/hotplug/ibmphp_ebda.c @@ -276,7 +276,7 @@ int __init ibmphp_access_ebda (void) iounmap (io_mem); debug ("returned ebda segment: %x\n", ebda_seg); - io_mem = ioremap (ebda_seg<<4, 65000); + io_mem = ioremap(ebda_seg<<4, 1024); if (!io_mem ) return -ENOMEM; next_offset = 0x180; From 8e85b4b553fc932e1c5141feb5fda389b7f5db01 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Oct 2008 10:50:53 +0200 Subject: [PATCH 15/17] softirqs, debug: preemption check if a preempt count leaks out of a softirq handler it can be very hard to figure it out. Add a debug check for this. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/softirq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/softirq.c b/kernel/softirq.c index 82e32aadedd..1cf1e2f2c40 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -205,7 +205,18 @@ restart: do { if (pending & 1) { + int prev_count = preempt_count(); + h->action(h); + + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered sotfirq %ld %p" + "with preempt_count %08x," + " exited with %08x?\n", h - softirq_vec, + h->action, prev_count, preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qsctr_inc(cpu); } h++; From 77af7e3403e7314c47b0c07fbc5e4ef21d939532 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 11:39:46 +0200 Subject: [PATCH 16/17] softirq, warning fix: correct a format to avoid a warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last -tip gives this warning: kernel/softirq.c: Dans la fonction «__do_softirq» : kernel/softirq.c:216: attention : format «%ld» expects type «long int», but argument 2 has type «int» This patch corrects the format type, and a small mistake in the "softirq" word. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 1cf1e2f2c40..be7a8292f99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,7 +210,7 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered sotfirq %ld %p" + printk(KERN_ERR "huh, entered softirq %d %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, h->action, prev_count, preempt_count()); From 85462323555dda749f1c5373a8d72679464c968d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jun 2008 21:20:43 +0400 Subject: [PATCH 17/17] do_generic_file_read: s/EINTR/EIO/ if lock_page_killable() fails If lock_page_killable() fails because the task was killed by SIGKILL or any other fatal signal, do_generic_file_read() returns -EIO. This seems to be OK, because in fact the userspace won't see this error, the task will dequeue SIGKILL and exit. However, /sbin/init is different, it will dequeue SIGKILL, ignore it, and return to the user-space with the bogus -EIO. Change the code to return the error code from lock_page_killable(), -EINTR. This doesn't fix the bug, but perhaps makes sense anyway. Imho, with this change the code looks a bit more logical, and the "good" init should handle the spurious EINTR or short read. Afaics we can also change lock_page_killable() to return -ERESTARTNOINTR, but this can't prevent the short reads. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- mm/filemap.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 876bc595d0f..494ff20b6cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1100,8 +1100,9 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ @@ -1130,8 +1131,9 @@ readpage: } if (!PageUptodate(page)) { - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1143,15 +1145,14 @@ readpage: } unlock_page(page); shrink_readahead_size_eio(filp, ra); - goto readpage_eio; + error = -EIO; + goto readpage_error; } unlock_page(page); } goto page_ok; -readpage_eio: - error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error;