From 6ac5c5310ca9d7dd3d7e677c2715b1f06a348330 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:58:30 +1030 Subject: [PATCH 01/18] cpumask: Use modern cpumask style in arch/x86/kernel/cpu/mcheck/mce-inject.c Note that there's no freeing the cpu var, since this module has no unload function. Signed-off-by: Rusty Russell Cc: Andi Kleen Cc: Huang Ying LKML-Reference: <200911031458.30987.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 472763d9209..73734baa50f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) m->finished = 0; } -static cpumask_t mce_inject_cpumask; +static cpumask_var_t mce_inject_cpumask; static int mce_raise_notify(struct notifier_block *self, unsigned long val, void *data) @@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) raise_exception(m, args->regs); else if (m->status) @@ -148,22 +148,22 @@ static void raise_mce(struct mce *m) unsigned long start; int cpu; get_online_cpus(); - mce_inject_cpumask = cpu_online_map; - cpu_clear(get_cpu(), mce_inject_cpumask); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); for_each_online_cpu(cpu) { struct mce *mcpu = &per_cpu(injectm, cpu); if (!mcpu->finished || MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpus_empty(mce_inject_cpumask)) - apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) + apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); start = jiffies; - while (!cpus_empty(mce_inject_cpumask)) { + while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR "Timeout waiting for mce inject NMI %lx\n", - *cpus_addr(mce_inject_cpumask)); + *cpumask_bits(mce_inject_cpumask)); break; } cpu_relax(); @@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, static int inject_init(void) { + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; register_die_notifier(&mce_raise_nb); From e258e4e0b495e6ecbd073d6bef1eafb62a58919a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:51 -0500 Subject: [PATCH 02/18] x86-32: Add new pt_regs stubs Add new stubs which add the pt_regs pointer as the last arg, matching 64-bit. This will allow these syscalls to be easily merged. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 49 +++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 50b9c220e12..34dbfa909dd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -725,22 +725,49 @@ END(syscall_badsys) /* * System calls that need a pt_regs pointer. */ -#define PTREGSCALL(name) \ +#define PTREGSCALL0(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%eax; \ jmp sys_##name; -PTREGSCALL(iopl) -PTREGSCALL(fork) -PTREGSCALL(clone) -PTREGSCALL(vfork) -PTREGSCALL(execve) -PTREGSCALL(sigaltstack) -PTREGSCALL(sigreturn) -PTREGSCALL(rt_sigreturn) -PTREGSCALL(vm86) -PTREGSCALL(vm86old) +#define PTREGSCALL1(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%edx; \ + movl PT_EBX(%edx),%eax; \ + jmp sys_##name; + +#define PTREGSCALL2(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%ecx; \ + movl PT_ECX(%ecx),%edx; \ + movl PT_EBX(%ecx),%eax; \ + jmp sys_##name; + +#define PTREGSCALL3(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + pushl %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + ret + +PTREGSCALL0(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(clone) +PTREGSCALL0(vfork) +PTREGSCALL0(execve) +PTREGSCALL0(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL0(vm86) +PTREGSCALL0(vm86old) .macro FIXUP_ESPFIX_STACK /* From 27f59559d63375a4d59e7c720a439d9f0b47edad Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:52 -0500 Subject: [PATCH 03/18] x86: Merge sys_iopl Change 32-bit sys_iopl to PTREGSCALL1, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 6 +----- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/ioport.c | 30 ++++++------------------------ 3 files changed, 8 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 372b76edd63..4b694cd904c 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -18,6 +18,7 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); @@ -35,8 +36,6 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/process_32.c */ int sys_clone(struct pt_regs *); @@ -70,9 +69,6 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - /* kernel/process_64.c */ asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 34dbfa909dd..ab7fcef3745 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -758,7 +758,7 @@ ptregs_##name: \ addl $4,%esp; \ ret -PTREGSCALL0(iopl) +PTREGSCALL1(iopl) PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 99c4d308f16..85ecc7c57ba 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * on system-call entry - see also fork() and the signal handling * code. */ -static int do_iopl(unsigned int level, struct pt_regs *regs) +long sys_iopl(unsigned int level, struct pt_regs *regs) { unsigned int old = (regs->flags >> 12) & 3; + struct thread_struct *t = ¤t->thread; if (level > 3) return -EINVAL; @@ -115,29 +116,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); +#ifdef CONFIG_X86_32 + t->iopl = level << 12; + set_iopl_mask(t->iopl); +#endif return 0; } - -#ifdef CONFIG_X86_32 -long sys_iopl(struct pt_regs *regs) -{ - unsigned int level = regs->bx; - struct thread_struct *t = ¤t->thread; - int rc; - - rc = do_iopl(level, regs); - if (rc < 0) - goto out; - - t->iopl = level << 12; - set_iopl_mask(t->iopl); -out: - return rc; -} -#else -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - return do_iopl(level, regs); -} -#endif From 11cf88bd0b8165b65aaabaee0977e9a3ad474ab7 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:53 -0500 Subject: [PATCH 04/18] x86: Merge sys_execve Change 32-bit sys_execve to PTREGSCALL3, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 6 ++---- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/process.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 25 ------------------------- arch/x86/kernel/process_64.c | 19 ------------------- 5 files changed, 29 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 4b694cd904c..48c48e508b9 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -23,6 +23,8 @@ long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); +long sys_execve(char __user *, char __user * __user *, + char __user * __user *, struct pt_regs *); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); @@ -39,7 +41,6 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* kernel/process_32.c */ int sys_clone(struct pt_regs *); -int sys_execve(struct pt_regs *); /* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); @@ -73,9 +74,6 @@ int sys_vm86(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_execve(char __user *, char __user * __user *, - char __user * __user *, - struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/signal.c */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index ab7fcef3745..a96a0d8a0fd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -762,7 +762,7 @@ PTREGSCALL1(iopl) PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) -PTREGSCALL0(execve) +PTREGSCALL3(execve) PTREGSCALL0(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea1..bb17bd9334f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -235,6 +235,32 @@ int sys_vfork(struct pt_regs *regs) } +/* + * sys_execve() executes a new program. + */ +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} + /* * Idle related variables and functions */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b3568..486e38e2900 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -451,31 +451,6 @@ int sys_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } -/* - * sys_execve() executes a new program. - */ -int sys_execve(struct pt_regs *regs) -{ - int error; - char *filename; - - filename = getname((char __user *) regs->bx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, - (char __user * __user *) regs->cx, - (char __user * __user *) regs->dx, - regs); - if (error == 0) { - /* Make sure we don't return using sysenter.. */ - set_thread_flag(TIF_IRET); - } - putname(filename); -out: - return error; -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790..671960d8258 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -520,25 +520,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -/* - * sys_execve() executes a new program. - */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) -{ - long error; - char *filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, regs); - putname(filename); - return error; -} - void set_personality_64bit(void) { /* inherit personality from parent */ From 052acad48a566a6dbcccb95e5d22e5e1b7cac8dd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:54 -0500 Subject: [PATCH 05/18] x86: Merge sys_sigaltstack Change 32-bit sys_sigaltstack to PTREGSCALL2, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 8 +++----- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/signal.c | 12 +----------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 48c48e508b9..94e0b61fb04 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -31,6 +31,9 @@ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ long sys_rt_sigreturn(struct pt_regs *); +long sys_sigaltstack(const stack_t __user *, stack_t __user *, + struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); @@ -46,7 +49,6 @@ int sys_clone(struct pt_regs *); asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); -int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); /* kernel/sys_i386_32.c */ @@ -76,10 +78,6 @@ asmlinkage long sys_clone(unsigned long, unsigned long, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/signal.c */ -asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); - /* kernel/sys_x86_64.c */ struct new_utsname; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a96a0d8a0fd..621ef459941 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -763,7 +763,7 @@ PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) PTREGSCALL3(execve) -PTREGSCALL0(sigaltstack) +PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) PTREGSCALL0(vm86) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 74fe6d86dc5..4fd173cd8e5 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -545,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_32 -int sys_sigaltstack(struct pt_regs *regs) -{ - const stack_t __user *uss = (const stack_t __user *)regs->bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long +long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } -#endif /* CONFIG_X86_32 */ /* * Do a signal return; undo the signal stack. From f1382f157fb1175bba008abad0907310a1e459ce Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:55 -0500 Subject: [PATCH 06/18] x86, 32-bit: Convert sys_vm86 & sys_vm86old Convert these to new PTREGSCALL stubs. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 4 ++-- arch/x86/kernel/entry_32.S | 4 ++-- arch/x86/kernel/vm86_32.c | 11 +++++------ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 94e0b61fb04..df2c5110656 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -66,8 +66,8 @@ asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ -int sys_vm86old(struct pt_regs *); -int sys_vm86(struct pt_regs *); +int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); +int sys_vm86(unsigned long, unsigned long, struct pt_regs *); #else /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 621ef459941..6c2f25d9b9d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -766,8 +766,8 @@ PTREGSCALL3(execve) PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) -PTREGSCALL0(vm86) -PTREGSCALL0(vm86old) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) .macro FIXUP_ESPFIX_STACK /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9c4e6253905..5ffb5622f79 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,9 +197,8 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -int sys_vm86old(struct pt_regs *regs) +int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -227,7 +226,7 @@ out: } -int sys_vm86(struct pt_regs *regs) +int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs->bx) { + switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); + ret = do_vm86_irq_handling(cmd, (int)arg); goto out; case VM86_PLUS_INSTALL_CHECK: /* @@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs) ret = -EPERM; if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs->cx; + v86 = (struct vm86plus_struct __user *)arg; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); From f839bbc5c81b1c92ff8e81c360e9564f7b961b2e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:56 -0500 Subject: [PATCH 07/18] x86: Merge sys_clone Change 32-bit sys_clone to new PTREGSCALL stub, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-7-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/syscalls.h | 8 ++------ arch/x86/kernel/entry_32.S | 14 +++++++++++++- arch/x86/kernel/process.c | 9 +++++++++ arch/x86/kernel/process_32.c | 15 --------------- arch/x86/kernel/process_64.c | 9 --------- 5 files changed, 24 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index df2c5110656..b0ce7806170 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -25,6 +25,8 @@ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); +long sys_clone(unsigned long, unsigned long, void __user *, + void __user *, struct pt_regs *); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); @@ -42,9 +44,6 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 -/* kernel/process_32.c */ -int sys_clone(struct pt_regs *); - /* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, @@ -73,9 +72,6 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *); /* X86_64 only */ /* kernel/process_64.c */ -asmlinkage long sys_clone(unsigned long, unsigned long, - void __user *, void __user *, - struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6c2f25d9b9d..6492555d123 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -760,7 +760,6 @@ ptregs_##name: \ PTREGSCALL1(iopl) PTREGSCALL0(fork) -PTREGSCALL0(clone) PTREGSCALL0(vfork) PTREGSCALL3(execve) PTREGSCALL2(sigaltstack) @@ -769,6 +768,19 @@ PTREGSCALL0(rt_sigreturn) PTREGSCALL2(vm86) PTREGSCALL1(vm86old) +/* Clone is an oddball. The 4th arg is in %edi */ + ALIGN; +ptregs_clone: + leal 4(%esp),%eax + pushl %eax + pushl PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + ret + .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb17bd9334f..f3c1a6b3a65 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -234,6 +234,15 @@ int sys_vfork(struct pt_regs *regs) NULL, NULL); } +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + /* * sys_execve() executes a new program. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 486e38e2900..506d5a7ba17 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -436,21 +436,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -int sys_clone(struct pt_regs *regs) -{ - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs->bx; - newsp = regs->cx; - parent_tidptr = (int __user *)regs->dx; - child_tidptr = (int __user *)regs->di; - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 671960d8258..83019f94b83 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -534,15 +534,6 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - unsigned long get_wchan(struct task_struct *p) { unsigned long stack; From ce9119ad90b1caba550447bfcc0a21850558ca49 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 Dec 2009 16:33:44 -0800 Subject: [PATCH 08/18] x86-32: Avoid pipeline serialization in PTREGSCALL1 and 2 In the PTREGSCALL1 and 2 macros, we can trivially avoid an unnecessary pipeline serialization, so do so. In PTREGSCALLS3 this is much less clear-cut since we have to push a new value to the stack. Leave it alone for now assuming it is as good as it is going to be; may want to check on Atom or another in-order x86 to see if we can do better. Signed-off-by: H. Peter Anvin Cc: Brian Gerst LKML-Reference: <1260403316-5679-2-git-send-email-brgerst@gmail.com> --- arch/x86/kernel/entry_32.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6492555d123..cb12b9bfc9c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -735,15 +735,15 @@ ptregs_##name: \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%edx; \ - movl PT_EBX(%edx),%eax; \ + movl (PT_EBX+4)(%esp),%eax; \ jmp sys_##name; #define PTREGSCALL2(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%ecx; \ - movl PT_ECX(%ecx),%edx; \ - movl PT_EBX(%ecx),%eax; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ jmp sys_##name; #define PTREGSCALL3(name) \ From fc380ceed7fe469728ea4acdbda4495ea943ee1c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 Dec 2009 16:54:08 -0800 Subject: [PATCH 09/18] x86-64, paravirt: Call set_iopl_mask() on 64 bits set_iopl_mask() is a no-op on 64 bits, but it is also a paravirt hook, so call it even on 64 bits. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Brian Gerst LKML-Reference: <1260403316-5679-3-git-send-email-brgerst@gmail.com> --- arch/x86/kernel/ioport.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 85ecc7c57ba..8eec0ec59af 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -116,10 +116,8 @@ long sys_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); -#ifdef CONFIG_X86_32 t->iopl = level << 12; set_iopl_mask(t->iopl); -#endif return 0; } From 3bd95dfb182969dc6d2a317c150e0df7107608d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:40 -0500 Subject: [PATCH 10/18] x86, 64-bit: Move kernel_thread to C Prepare for merging with 32-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 49 ++------------------------------ arch/x86/kernel/process_64.c | 31 ++++++++++++++++++-- arch/x86/kernel/x8664_ksyms_64.c | 2 -- 3 files changed, 32 insertions(+), 50 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63bca794c8f..73d9b2c0e21 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1166,63 +1166,20 @@ bad_gs: jmp 2b .previous -/* - * Create a kernel thread. - * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -END(kernel_thread) - -ENTRY(child_rip) +ENTRY(kernel_thread_helper) pushq $0 # fake return address CFI_STARTPROC /* * Here we are in the child and the registers are set as they were * at kernel_thread() invocation in the parent. */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax + call *%rsi # exit mov %eax, %edi call do_exit ud2 # padding for call trace CFI_ENDPROC -END(child_rip) +END(kernel_thread_helper) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 83019f94b83..92484c2130c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -59,8 +59,6 @@ asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) @@ -231,6 +229,35 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS; + regs.flags = X86_EFLAGS_IF; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, ~0UL, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a1029769b6f..9fafaf83b3b 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -17,8 +17,6 @@ EXPORT_SYMBOL(mcount); #endif -EXPORT_SYMBOL(kernel_thread); - EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); From fa4b8f84383ae197e643a46c36bf58ab8dffc95c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:41 -0500 Subject: [PATCH 11/18] x86, 64-bit: Use user_mode() to determine new stack pointer in copy_thread() Use user_mode() instead of a magic value for sp to determine when returning to kernel mode. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 92484c2130c..00ac66fa5c6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -254,7 +254,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.flags = X86_EFLAGS_IF; /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, ~0UL, ®s, 0, NULL, NULL); + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } EXPORT_SYMBOL(kernel_thread); @@ -312,8 +312,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, *childregs = *regs; childregs->ax = 0; - childregs->sp = sp; - if (sp == ~0UL) + if (user_mode(regs)) + childregs->sp = sp; + else childregs->sp = (unsigned long)childregs; p->thread.sp = (unsigned long) childregs; From e840227c141116171c89ab1abb5cc9fee6fdb488 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:42 -0500 Subject: [PATCH 12/18] x86, 32-bit: Use same regs as 64-bit for kernel_thread_helper The arg should be in %eax, but that is clobbered by the return value of clone. The function pointer can be in any register. Also, don't push args onto the stack, since regparm(3) is the normal calling convention now. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 8 ++------ arch/x86/kernel/process_32.c | 8 ++++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cb12b9bfc9c..44a8e0dc673 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1047,12 +1047,8 @@ END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder CFI_STARTPROC - movl %edx,%eax - push %edx - CFI_ADJUST_CFA_OFFSET 4 - call *%ebx - push %eax - CFI_ADJUST_CFA_OFFSET 4 + movl %edi,%eax + call *%esi call do_exit ud2 # padding for call trace CFI_ENDPROC diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 506d5a7ba17..bd874d2b6ab 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -193,8 +193,8 @@ void show_regs(struct pt_regs *regs) } /* - * This gets run with %bx containing the - * function to call, and %dx containing + * This gets run with %si containing the + * function to call, and %di containing * the "args". */ extern void kernel_thread_helper(void); @@ -208,8 +208,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) memset(®s, 0, sizeof(regs)); - regs.bx = (unsigned long) fn; - regs.dx = (unsigned long) arg; + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; regs.ds = __USER_DS; regs.es = __USER_DS; From f443ff4201dd25cd4dec183f9919ecba90c8edc2 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:43 -0500 Subject: [PATCH 13/18] x86: Sync 32/64-bit kernel_thread Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_32.c | 5 ++++- arch/x86/kernel/process_64.c | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd874d2b6ab..f2e8b05a4f0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -211,14 +211,17 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; +#ifdef CONFIG_X86_32 regs.ds = __USER_DS; regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#endif + regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 00ac66fa5c6..d49a9094f6f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -248,10 +248,17 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#endif + regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS; - regs.flags = X86_EFLAGS_IF; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); From df59e7bf439918f523ac29e996ec1eebbed60440 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:44 -0500 Subject: [PATCH 14/18] x86: Merge kernel_thread() Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 36 ------------------------------------ arch/x86/kernel/process_64.c | 36 ------------------------------------ 3 files changed, 35 insertions(+), 72 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f3c1a6b3a65..8705ccedd44 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -243,6 +243,41 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f2e8b05a4f0..ccf234266a2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -192,42 +192,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, ®s->sp, regs->bp); } -/* - * This gets run with %si containing the - * function to call, and %di containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - - regs.si = (unsigned long) fn; - regs.di = (unsigned long) arg; - -#ifdef CONFIG_X86_32 - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; -#endif - - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d49a9094f6f..1a362c5bec3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -229,42 +229,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -/* - * This gets run with %si containing the - * function to call, and %di containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - - regs.si = (unsigned long) fn; - regs.di = (unsigned long) arg; - -#ifdef CONFIG_X86_32 - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; -#endif - - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { From 494c2ebfb287eb10b229415063099e3700639028 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 14 Dec 2009 10:02:18 -0800 Subject: [PATCH 15/18] x86, msr: Remove incorrect, duplicated code in the MSR driver The MSR driver would compute the values for cpu and c at declaration, and then again in the body of the function. This isn't merely redundant, but unsafe, since cpu might not refer to a valid CPU at that point. Remove the unnecessary and dangerous references in the declarations. This code now matches the equivalent code in the CPUID driver. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/msr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 553449951b8..572b07eee3f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -172,11 +172,10 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int cpu; + struct cpuinfo_x86 *c; cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ From 873b5271f878a11729fb4602c6ce967d0ff81119 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 14 Dec 2009 13:55:20 -0800 Subject: [PATCH 16/18] x86: Regex support and known-movable symbols for relocs, fix _end This adds a new category of symbols to the relocs program: symbols which are known to be relative, even though the linker emits them as absolute; this is the case for symbols that live in the linker script, which currently applies to _end. Unfortunately the previous workaround of putting _end in its own empty section was defeated by newer binutils, which remove empty sections completely. This patch also changes the symbol matching to use regular expressions instead of hardcoded C for specific patterns. This is a decidedly non-minimal patch: a modified version of the relocs program is used as part of the Syslinux build, and this is basically a backport to Linux of some of those changes; they have thus been well tested. Signed-off-by: H. Peter Anvin LKML-Reference: <4AF86211.3070103@zytor.com> Acked-by: Michal Marek Tested-by: Sedat Dilek --- arch/x86/boot/compressed/relocs.c | 89 +++++++++++++++++++++---------- arch/x86/kernel/vmlinux.lds.S | 4 +- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index bbeb0c3fbd9..89bbf4e4d05 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -9,6 +9,9 @@ #include #define USE_BSD #include +#include + +static void die(char *fmt, ...); #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) static Elf32_Ehdr ehdr; @@ -30,25 +33,47 @@ static struct section *secs; * the address for which it has been compiled. Don't warn user about * absolute relocations present w.r.t these symbols. */ -static const char* safe_abs_relocs[] = { - "xen_irq_disable_direct_reloc", - "xen_save_fl_direct_reloc", -}; - -static int is_safe_abs_reloc(const char* sym_name) +static const char abs_sym_regex[] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)"; +static regex_t abs_sym_regex_c; +static int is_abs_reloc(const char *sym_name) { - int i; + return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); +} - for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { - if (!strcmp(sym_name, safe_abs_relocs[i])) - /* Match found */ - return 1; - } - if (strncmp(sym_name, "VDSO", 4) == 0) - return 1; - if (strncmp(sym_name, "__crc_", 6) == 0) - return 1; - return 0; +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ +static const char rel_sym_regex[] = + "^_end$"; +static regex_t rel_sym_regex_c; +static int is_rel_reloc(const char *sym_name) +{ + return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); +} + +static void regex_init(void) +{ + char errbuf[128]; + int err; + + err = regcomp(&abs_sym_regex_c, abs_sym_regex, + REG_EXTENDED|REG_NOSUB); + if (err) { + regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); + die("%s", errbuf); + } + + err = regcomp(&rel_sym_regex_c, rel_sym_regex, + REG_EXTENDED|REG_NOSUB); + if (err) { + regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); + die("%s", errbuf); + } } static void die(char *fmt, ...) @@ -131,7 +156,7 @@ static const char *rel_type(unsigned type) #undef REL_TYPE }; const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name)) { + if (type < ARRAY_SIZE(type_name) && type_name[type]) { name = type_name[type]; } return name; @@ -448,7 +473,7 @@ static void print_absolute_relocs(void) * Before warning check if this absolute symbol * relocation is harmless. */ - if (is_safe_abs_reloc(name)) + if (is_abs_reloc(name) || is_rel_reloc(name)) continue; if (!printed) { @@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS) { + if (sym->st_shndx == SHN_ABS && + !is_rel_reloc(sym_name(sym_strtab, sym))) { continue; } - if (r_type == R_386_NONE || r_type == R_386_PC32) { + switch (r_type) { + case R_386_NONE: + case R_386_PC32: /* * NONE can be ignored and and PC relative * relocations don't need to be adjusted. */ - } - else if (r_type == R_386_32) { + break; + case R_386_32: /* Visit relocations that need to be adjusted */ visit(rel, sym); - } - else { - die("Unsupported relocation type: %d\n", r_type); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; } } } @@ -571,16 +601,15 @@ static void emit_relocs(int as_text) } else { unsigned char buf[4]; - buf[0] = buf[1] = buf[2] = buf[3] = 0; /* Print a stop */ - printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + fwrite("\0\0\0\0", 4, 1, stdout); /* Now print each relocation */ for (i = 0; i < reloc_count; i++) { buf[0] = (relocs[i] >> 0) & 0xff; buf[1] = (relocs[i] >> 8) & 0xff; buf[2] = (relocs[i] >> 16) & 0xff; buf[3] = (relocs[i] >> 24) & 0xff; - printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + fwrite(buf, 4, 1, stdout); } } } @@ -598,6 +627,8 @@ int main(int argc, char **argv) FILE *fp; int i; + regex_init(); + show_absolute_syms = 0; show_absolute_relocs = 0; as_text = 0; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f3f2104408d..f92a0da608c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -319,9 +319,7 @@ SECTIONS __brk_limit = .; } - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = .; - } + _end = .; STABS_DEBUG DWARF_DEBUG From 186a25026c44d1bfa97671110ff14dcd0c99678e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Dec 2009 20:47:56 +0900 Subject: [PATCH 17/18] x86: Split swiotlb initialization into two stages The commit f4780ca005404166cc40af77ef0e86132ab98a81 moves swiotlb initialization before dma32_free_bootmem(). It's supposed to fix a bug that the commit 75f1cdf1dda92cae037ec848ae63690d91913eac introduced, we initialize SWIOTLB right after dma32_free_bootmem so we wrongly steal memory area allocated for GART with broken BIOS earlier. However, the above commit introduced another problem, which likely breaks machines with huge amount of memory. Such a box use the majority of DMA32_ZONE so there is no memory for swiotlb. With this patch, the x86 IOMMU initialization sequence are: 1. We set swiotlb to 1 in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). If swiotlb usage is forced by the boot option, we go to the step 3 and finish (we don't try to detect IOMMUs). 2. We call the detection functions of all the IOMMUs. The detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly). 3. We initialize swiotlb (and set dma_ops to swiotlb_dma_ops) if swiotlb is set to 1. 4. If the IOMMU initialization function doesn't need swiotlb (e.g. the initialization is sucessful) then sets swiotlb to zero. 5. If we find that swiotlb is set to zero, we free swiotlb resource. Reported-by: Yinghai Lu Reported-by: Roland Dreier Signed-off-by: FUJITA Tomonori LKML-Reference: <20091215204729A.fujita.tomonori@lab.ntt.co.jp> Tested-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/swiotlb.h | 8 ++++++-- arch/x86/kernel/pci-dma.c | 9 ++++----- arch/x86/kernel/pci-swiotlb.c | 11 +++++++---- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 87ffcb12a1b..8085277e1b8 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -5,13 +5,17 @@ #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern int pci_swiotlb_init(void); +extern int __init pci_swiotlb_detect(void); +extern void __init pci_swiotlb_init(void); #else #define swiotlb 0 -static inline int pci_swiotlb_init(void) +static inline int pci_swiotlb_detect(void) { return 0; } +static inline void pci_swiotlb_init(void) +{ +} #endif static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fcc2f2bfa39..75e14e21f61 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -120,15 +120,12 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { - int use_swiotlb; - - use_swiotlb = pci_swiotlb_init(); #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif - if (use_swiotlb) - return; + if (pci_swiotlb_detect()) + goto out; gart_iommu_hole_init(); @@ -138,6 +135,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); +out: + pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e3c0a66b9e7..7d2829dde20 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -43,12 +43,12 @@ static struct dma_map_ops swiotlb_dma_ops = { }; /* - * pci_swiotlb_init - initialize swiotlb if necessary + * pci_swiotlb_detect - set swiotlb to 1 if necessary * * This returns non-zero if we are forced to use swiotlb (by the boot * option). */ -int __init pci_swiotlb_init(void) +int __init pci_swiotlb_detect(void) { int use_swiotlb = swiotlb | swiotlb_force; @@ -60,10 +60,13 @@ int __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; + return use_swiotlb; +} + +void __init pci_swiotlb_init(void) +{ if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } - - return use_swiotlb; } From 23637568ad0c9b5ab0ad27d2f2f26d1e9282c527 Mon Sep 17 00:00:00 2001 From: Jonathan Nieder Date: Sun, 13 Dec 2009 16:04:38 -0600 Subject: [PATCH 18/18] x86: Fix kprobes build with non-gawk awk The instruction attribute table generator fails when run by mawk or original-awk: $ mawk -f arch/x86/tools/gen-insn-attr-x86.awk \ arch/x86/lib/x86-opcode-map.txt > /dev/null Semantic error at 240: Second IMM error $ echo $? 1 Line 240 contains "c8: ENTER Iw,Ib", which indicates that this instruction has two immediate operands, the second of which is one byte. The script loops through the immediate operands using a for loop. Unfortunately, there is no guarantee in awk that a for (variable in array) loop will return the indices in increasing order. Internally, both original-awk and mawk iterate over a hash table for this purpose, and both implementations happen to produce the index 2 before 1. The supposed second immediate operand is more than one byte wide, producing the error. So loop over the indices in increasing order instead. As a side-effect, with mawk this means the silly two-entry hash table never has to be built. Signed-off-by: Jonathan Nieder Acked-by Masami Hiramatsu Cc: Jim Keniston Cc: Frederic Weisbecker LKML-Reference: <20091213220437.GA27718@progeny.tock> Signed-off-by: Ingo Molnar --- arch/x86/tools/gen-insn-attr-x86.awk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e34e92a28eb..7a6850683c3 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -226,12 +226,12 @@ function add_flags(old,new) { } # convert operands to flags. -function convert_operands(opnd, i,imm,mod) +function convert_operands(count,opnd, i,j,imm,mod) { imm = null mod = null - for (i in opnd) { - i = opnd[i] + for (j = 1; j <= count; j++) { + i = opnd[j] if (match(i, imm_expr) == 1) { if (!imm_flag[i]) semantic_error("Unknown imm opnd: " i) @@ -282,8 +282,8 @@ function convert_operands(opnd, i,imm,mod) # parse one opcode if (match($i, opnd_expr)) { opnd = $i - split($(i++), opnds, ",") - flags = convert_operands(opnds) + count = split($(i++), opnds, ",") + flags = convert_operands(count, opnds) } if (match($i, ext_expr)) ext = $(i++)