Mirror of https://github.com/adulau/aha.git, synced 2025-01-01 05:36:24 +00:00
Merge branch 'for-rmk' of git://git.kernel.org/pub/scm/linux/kernel/git/nico/orion into fixes
This commit is contained in: commit 1f2ee6496b
323 changed files with 5599 additions and 3417 deletions
@@ -72,7 +72,7 @@
kgdb is a source level debugger for linux kernel. It is used along
with gdb to debug a linux kernel. The expectation is that gdb can
be used to "break in" to the kernel to inspect memory, variables
and look through a cal stack information similar to what an
and look through call stack information similar to what an
application developer would use gdb for. It is possible to place
breakpoints in kernel code and perform some limited execution
stepping.
@@ -93,8 +93,10 @@
<chapter id="CompilingAKernel">
<title>Compiling a kernel</title>
<para>
To enable <symbol>CONFIG_KGDB</symbol>, look under the "Kernel debugging"
and then select "KGDB: kernel debugging with remote gdb".
To enable <symbol>CONFIG_KGDB</symbol> you should first turn on
"Prompt for development and/or incomplete code/drivers"
(CONFIG_EXPERIMENTAL) in "General setup", then under the
"Kernel debugging" select "KGDB: kernel debugging with remote gdb".
</para>
<para>
Next you should choose one of more I/O drivers to interconnect debugging

@@ -92,7 +92,6 @@ prototypes:
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *);
int (*write_inode) (struct inode *, int);
void (*put_inode) (struct inode *);
void (*drop_inode) (struct inode *);
void (*delete_inode) (struct inode *);
void (*put_super) (struct super_block *);

@@ -115,7 +114,6 @@ alloc_inode: no no no
destroy_inode: no
dirty_inode: no (must not sleep)
write_inode: no
put_inode: no
drop_inode: no !!!inode_lock!!!
delete_inode: no
put_super: yes yes no

@@ -205,7 +205,6 @@ struct super_operations {

void (*dirty_inode) (struct inode *);
int (*write_inode) (struct inode *, int);
void (*put_inode) (struct inode *);
void (*drop_inode) (struct inode *);
void (*delete_inode) (struct inode *);
void (*put_super) (struct super_block *);

@@ -246,9 +245,6 @@ or bottom half).
inode to disc. The second parameter indicates whether the write
should be synchronous or not, not all filesystems check this flag.

put_inode: called when the VFS inode is removed from the inode
cache.

drop_inode: called when the last access to the inode is dropped,
with the inode_lock spinlock held.

@@ -377,27 +377,3 @@ config FOO

limits FOO to module (=m) or disabled (=n).


Build limited by a third config symbol which may be =y or =m
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A common idiom that we see (and sometimes have problems with) is this:

When option C in B (module or subsystem) uses interfaces from A (module
or subsystem), and both A and B are tristate (could be =y or =m if they
were independent of each other, but they aren't), then we need to limit
C such that it cannot be built statically if A is built as a loadable
module. (C already depends on B, so there is no dependency issue to
take care of here.)

If A is linked statically into the kernel image, C can be built
statically or as loadable module(s). However, if A is built as loadable
module(s), then C must be restricted to loadable module(s) also. This
can be expressed in kconfig language as:

config C
	depends on A = y || A = B

or for real examples, use this command in a kernel tree:

$ find . -name Kconfig\* | xargs grep -ns "depends on.*=.*||.*=" | grep -v orig

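For illustration only, a minimal, hypothetical Kconfig sketch of the idiom described in the removed text above; the symbols A, B and C are placeholders, not taken from this patch:

	config A
		tristate "Subsystem A"

	config B
		tristate "Subsystem B"

	config C
		tristate "Option C in B"
		depends on B
		# C may only be built-in when A is built-in; if A is modular,
		# C is forced to be modular (or disabled) as well.
		depends on A = y || A = B
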
@@ -1094,9 +1094,6 @@ and is between 256 and 4096 characters. It is defined in the file
mac5380= [HW,SCSI] Format:
<can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>

mac53c9x= [HW,SCSI] Format:
<num_esps>,<disconnect>,<nosync>,<can_queue>,<cmd_per_lun>,<sg_tablesize>,<hostid>,<use_tags>

machvec= [IA64] Force the use of a particular machine-vector
(machvec) in a generic kernel.
Example: machvec=hpzx1_swiotlb

@@ -1525,6 +1522,8 @@ and is between 256 and 4096 characters. It is defined in the file
This is normally done in pci_enable_device(),
so this option is a temporary workaround
for broken drivers that don't call it.
skip_isa_align [X86] do not align io start addr, so can
handle more pci cards
firmware [ARM] Do not re-enumerate the bus but instead
just use the configuration from the
bootloader. This is currently used on

@@ -8,17 +8,6 @@ Command line parameters

Enable logging of debug information in case of ccw device timeouts.


* cio_msg = yes | no

Determines whether information on found devices and sensed device
characteristics should be shown during startup or when new devices are
found, i. e. messages of the types "Detected device 0.0.4711 on subchannel
0.0.0042" and "SenseID: Device 0.0.4711 reports: ...".

Default is off.


* cio_ignore = {all} |
{<device> | <range of devices>} |
{!<device> | !<range of devices>}

@@ -1,165 +0,0 @@
Goals, Design and Implementation of the
new ultra-scalable O(1) scheduler


This is an edited version of an email Ingo Molnar sent to
lkml on 4 Jan 2002. It describes the goals, design, and
implementation of Ingo's new ultra-scalable O(1) scheduler.
Last Updated: 18 April 2002.


Goal
====

The main goal of the new scheduler is to keep all the good things we know
and love about the current Linux scheduler:

- good interactive performance even during high load: if the user
types or clicks then the system must react instantly and must execute
the user tasks smoothly, even during considerable background load.

- good scheduling/wakeup performance with 1-2 runnable processes.

- fairness: no process should stay without any timeslice for any
unreasonable amount of time. No process should get an unjustly high
amount of CPU time.

- priorities: less important tasks can be started with lower priority,
more important tasks with higher priority.

- SMP efficiency: no CPU should stay idle if there is work to do.

- SMP affinity: processes which run on one CPU should stay affine to
that CPU. Processes should not bounce between CPUs too frequently.

- plus additional scheduler features: RT scheduling, CPU binding.

and the goal is also to add a few new things:

- fully O(1) scheduling. Are you tired of the recalculation loop
blowing the L1 cache away every now and then? Do you think the goodness
loop is taking a bit too long to finish if there are lots of runnable
processes? This new scheduler takes no prisoners: wakeup(), schedule(),
the timer interrupt are all O(1) algorithms. There is no recalculation
loop. There is no goodness loop either.

- 'perfect' SMP scalability. With the new scheduler there is no 'big'
runqueue_lock anymore - it's all per-CPU runqueues and locks - two
tasks on two separate CPUs can wake up, schedule and context-switch
completely in parallel, without any interlocking. All
scheduling-relevant data is structured for maximum scalability.

- better SMP affinity. The old scheduler has a particular weakness that
causes the random bouncing of tasks between CPUs if/when higher
priority/interactive tasks, this was observed and reported by many
people. The reason is that the timeslice recalculation loop first needs
every currently running task to consume its timeslice. But when this
happens on eg. an 8-way system, then this property starves an
increasing number of CPUs from executing any process. Once the last
task that has a timeslice left has finished using up that timeslice,
the recalculation loop is triggered and other CPUs can start executing
tasks again - after having idled around for a number of timer ticks.
The more CPUs, the worse this effect.

Furthermore, this same effect causes the bouncing effect as well:
whenever there is such a 'timeslice squeeze' of the global runqueue,
idle processors start executing tasks which are not affine to that CPU.
(because the affine tasks have finished off their timeslices already.)

The new scheduler solves this problem by distributing timeslices on a
per-CPU basis, without having any global synchronization or
recalculation.

- batch scheduling. A significant proportion of computing-intensive tasks
benefit from batch-scheduling, where timeslices are long and processes
are roundrobin scheduled. The new scheduler does such batch-scheduling
of the lowest priority tasks - so nice +19 jobs will get
'batch-scheduled' automatically. With this scheduler, nice +19 jobs are
in essence SCHED_IDLE, from an interactiveness point of view.

- handle extreme loads more smoothly, without breakdown and scheduling
storms.

- O(1) RT scheduling. For those RT folks who are paranoid about the
O(nr_running) property of the goodness loop and the recalculation loop.

- run fork()ed children before the parent. Andrea has pointed out the
advantages of this a few months ago, but patches for this feature
do not work with the old scheduler as well as they should,
because idle processes often steal the new child before the fork()ing
CPU gets to execute it.


Design
======

The core of the new scheduler contains the following mechanisms:

- *two* priority-ordered 'priority arrays' per CPU. There is an 'active'
array and an 'expired' array. The active array contains all tasks that
are affine to this CPU and have timeslices left. The expired array
contains all tasks which have used up their timeslices - but this array
is kept sorted as well. The active and expired array is not accessed
directly, it's accessed through two pointers in the per-CPU runqueue
structure. If all active tasks are used up then we 'switch' the two
pointers and from now on the ready-to-go (former-) expired array is the
active array - and the empty active array serves as the new collector
for expired tasks.

- there is a 64-bit bitmap cache for array indices. Finding the highest
priority task is thus a matter of two x86 BSFL bit-search instructions.

the split-array solution enables us to have an arbitrary number of active
and expired tasks, and the recalculation of timeslices can be done
immediately when the timeslice expires. Because the arrays are always
access through the pointers in the runqueue, switching the two arrays can
be done very quickly.

this is a hybride priority-list approach coupled with roundrobin
scheduling and the array-switch method of distributing timeslices.

- there is a per-task 'load estimator'.

one of the toughest things to get right is good interactive feel during
heavy system load. While playing with various scheduler variants i found
that the best interactive feel is achieved not by 'boosting' interactive
tasks, but by 'punishing' tasks that want to use more CPU time than there
is available. This method is also much easier to do in an O(1) fashion.

to establish the actual 'load' the task contributes to the system, a
complex-looking but pretty accurate method is used: there is a 4-entry
'history' ringbuffer of the task's activities during the last 4 seconds.
This ringbuffer is operated without much overhead. The entries tell the
scheduler a pretty accurate load-history of the task: has it used up more
CPU time or less during the past N seconds. [the size '4' and the interval
of 4x 1 seconds was found by lots of experimentation - this part is
flexible and can be changed in both directions.]

the penalty a task gets for generating more load than the CPU can handle
is a priority decrease - there is a maximum amount to this penalty
relative to their static priority, so even fully CPU-bound tasks will
observe each other's priorities, and will share the CPU accordingly.

the SMP load-balancer can be extended/switched with additional parallel
computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs
can be supported easily by changing the load-balancer. Right now it's
tuned for my SMP systems.

i skipped the prev->mm == next->mm advantage - no workload i know of shows
any sensitivity to this. It can be added back by sacrificing O(1)
schedule() [the current and one-lower priority list can be searched for a
that->mm == current->mm condition], but costs a fair number of cycles
during a number of important workloads, so i wanted to avoid this as much
as possible.

- the SMP idle-task startup code was still racy and the new scheduler
triggered this. So i streamlined the idle-setup code a bit. We do not call
into schedule() before all processors have started up fully and all idle
threads are in place.

- the patch also cleans up a number of aspects of sched.c - moves code
into other areas of the kernel where it's appropriate, and simplifies
certain code paths and data constructs. As a result, the new scheduler's
code is smaller than the old one.

Ingo

@@ -2112,12 +2112,10 @@ L: netdev@vger.kernel.org
S: Maintained

INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/ixgb/ixgbe)
P: Auke Kok
M: auke-jan.h.kok@intel.com
P: Jesse Brandeburg
M: jesse.brandeburg@intel.com
P: Jeff Kirsher
M: jeffrey.t.kirsher@intel.com
P: Jesse Brandeburg
M: jesse.brandeburg@intel.com
P: Bruce Allan
M: bruce.w.allan@intel.com
P: John Ronciak

@ -34,23 +34,6 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long old_len,
|
|||
unsigned long new_len, unsigned long flags,
|
||||
unsigned long new_addr);
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage int sys_pipe(unsigned long __user *fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/* common code for old and new mmaps */
|
||||
inline long do_mmap2(
|
||||
unsigned long addr, unsigned long len,
|
||||
|
|
|
@ -34,11 +34,7 @@
|
|||
* Non-CPU Masters address decoding --
|
||||
* Unlike the CPU, we setup the access from Orion's master interfaces to DDR
|
||||
* banks only (the typical use case).
|
||||
* Setup access for each master to DDR is issued by common.c.
|
||||
*
|
||||
* Note: although orion_setbits() and orion_clrbits() are not atomic
|
||||
* no locking is necessary here since code in this file is only called
|
||||
* at boot time when there is no concurrency issues.
|
||||
* Setup access for each master to DDR is issued by platform device setup.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
@ -48,10 +44,6 @@
|
|||
#define TARGET_DEV_BUS 1
|
||||
#define TARGET_PCI 3
|
||||
#define TARGET_PCIE 4
|
||||
#define ATTR_DDR_CS(n) (((n) ==0) ? 0xe : \
|
||||
((n) == 1) ? 0xd : \
|
||||
((n) == 2) ? 0xb : \
|
||||
((n) == 3) ? 0x7 : 0xf)
|
||||
#define ATTR_PCIE_MEM 0x59
|
||||
#define ATTR_PCIE_IO 0x51
|
||||
#define ATTR_PCIE_WA 0x79
|
||||
|
@ -61,17 +53,12 @@
|
|||
#define ATTR_DEV_CS1 0x1d
|
||||
#define ATTR_DEV_CS2 0x1b
|
||||
#define ATTR_DEV_BOOT 0xf
|
||||
#define WIN_EN 1
|
||||
|
||||
/*
|
||||
* Helpers to get DDR bank info
|
||||
*/
|
||||
#define DDR_BASE_CS(n) ORION5X_DDR_REG(0x1500 + ((n) * 8))
|
||||
#define DDR_SIZE_CS(n) ORION5X_DDR_REG(0x1504 + ((n) * 8))
|
||||
#define DDR_MAX_CS 4
|
||||
#define DDR_REG_TO_SIZE(reg) (((reg) | 0xffffff) + 1)
|
||||
#define DDR_REG_TO_BASE(reg) ((reg) & 0xff000000)
|
||||
#define DDR_BANK_EN 1
|
||||
#define DDR_BASE_CS(n) ORION5X_DDR_REG(0x1500 + ((n) << 3))
|
||||
#define DDR_SIZE_CS(n) ORION5X_DDR_REG(0x1504 + ((n) << 3))
|
||||
|
||||
/*
|
||||
* CPU Address Decode Windows registers
|
||||
|
@ -81,17 +68,6 @@
|
|||
#define CPU_WIN_REMAP_LO(n) ORION5X_BRIDGE_REG(0x008 | ((n) << 4))
|
||||
#define CPU_WIN_REMAP_HI(n) ORION5X_BRIDGE_REG(0x00c | ((n) << 4))
|
||||
|
||||
/*
|
||||
* Gigabit Ethernet Address Decode Windows registers
|
||||
*/
|
||||
#define ETH_WIN_BASE(win) ORION5X_ETH_REG(0x200 + ((win) * 8))
|
||||
#define ETH_WIN_SIZE(win) ORION5X_ETH_REG(0x204 + ((win) * 8))
|
||||
#define ETH_WIN_REMAP(win) ORION5X_ETH_REG(0x280 + ((win) * 4))
|
||||
#define ETH_WIN_EN ORION5X_ETH_REG(0x290)
|
||||
#define ETH_WIN_PROT ORION5X_ETH_REG(0x294)
|
||||
#define ETH_MAX_WIN 6
|
||||
#define ETH_MAX_REMAP_WIN 4
|
||||
|
||||
|
||||
struct mbus_dram_target_info orion5x_mbus_dram_info;
|
||||
|
||||
|
@ -202,39 +178,3 @@ void __init orion5x_setup_pcie_wa_win(u32 base, u32 size)
|
|||
{
|
||||
setup_cpu_win(7, base, size, TARGET_PCIE, ATTR_PCIE_WA, -1);
|
||||
}
|
||||
|
||||
void __init orion5x_setup_eth_wins(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/*
|
||||
* First, disable and clear windows
|
||||
*/
|
||||
for (i = 0; i < ETH_MAX_WIN; i++) {
|
||||
orion5x_write(ETH_WIN_BASE(i), 0);
|
||||
orion5x_write(ETH_WIN_SIZE(i), 0);
|
||||
orion5x_setbits(ETH_WIN_EN, 1 << i);
|
||||
orion5x_clrbits(ETH_WIN_PROT, 0x3 << (i * 2));
|
||||
if (i < ETH_MAX_REMAP_WIN)
|
||||
orion5x_write(ETH_WIN_REMAP(i), 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup windows for DDR banks.
|
||||
*/
|
||||
for (i = 0; i < DDR_MAX_CS; i++) {
|
||||
u32 base, size;
|
||||
size = orion5x_read(DDR_SIZE_CS(i));
|
||||
base = orion5x_read(DDR_BASE_CS(i));
|
||||
if (size & DDR_BANK_EN) {
|
||||
base = DDR_REG_TO_BASE(base);
|
||||
size = DDR_REG_TO_SIZE(size);
|
||||
orion5x_write(ETH_WIN_SIZE(i), (size-1) & 0xffff0000);
|
||||
orion5x_write(ETH_WIN_BASE(i), (base & 0xffff0000) |
|
||||
(ATTR_DDR_CS(i) << 8) |
|
||||
TARGET_DDR);
|
||||
orion5x_clrbits(ETH_WIN_EN, 1 << i);
|
||||
orion5x_setbits(ETH_WIN_PROT, 0x3 << (i * 2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -190,6 +190,11 @@ static struct platform_device orion5x_ehci1 = {
|
|||
* (The Orion and Discovery (MV643xx) families use the same Ethernet driver)
|
||||
****************************************************************************/
|
||||
|
||||
struct mv643xx_eth_shared_platform_data orion5x_eth_shared_data = {
|
||||
.dram = &orion5x_mbus_dram_info,
|
||||
.t_clk = ORION5X_TCLK,
|
||||
};
|
||||
|
||||
static struct resource orion5x_eth_shared_resources[] = {
|
||||
{
|
||||
.start = ORION5X_ETH_PHYS_BASE + 0x2000,
|
||||
|
@ -201,6 +206,9 @@ static struct resource orion5x_eth_shared_resources[] = {
|
|||
static struct platform_device orion5x_eth_shared = {
|
||||
.name = MV643XX_ETH_SHARED_NAME,
|
||||
.id = 0,
|
||||
.dev = {
|
||||
.platform_data = &orion5x_eth_shared_data,
|
||||
},
|
||||
.num_resources = 1,
|
||||
.resource = orion5x_eth_shared_resources,
|
||||
};
|
||||
|
@ -223,7 +231,9 @@ static struct platform_device orion5x_eth = {
|
|||
|
||||
void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data)
|
||||
{
|
||||
eth_data->shared = &orion5x_eth_shared;
|
||||
orion5x_eth.dev.platform_data = eth_data;
|
||||
|
||||
platform_device_register(&orion5x_eth_shared);
|
||||
platform_device_register(&orion5x_eth);
|
||||
}
|
||||
|
@ -360,7 +370,6 @@ void __init orion5x_init(void)
|
|||
* Setup Orion address map
|
||||
*/
|
||||
orion5x_setup_cpu_mbus_bridge();
|
||||
orion5x_setup_eth_wins();
|
||||
|
||||
/*
|
||||
* Register devices.
|
||||
|
|
|
@ -22,7 +22,6 @@ void orion5x_setup_dev0_win(u32 base, u32 size);
|
|||
void orion5x_setup_dev1_win(u32 base, u32 size);
|
||||
void orion5x_setup_dev2_win(u32 base, u32 size);
|
||||
void orion5x_setup_pcie_wa_win(u32 base, u32 size);
|
||||
void orion5x_setup_eth_wins(void);
|
||||
|
||||
/*
|
||||
* Shared code used internally by other Orion core functions.
|
||||
|
|
|
@ -14,19 +14,6 @@
|
|||
#include <asm/mman.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
asmlinkage int sys_pipe(unsigned long __user *filedes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(filedes, fd, sizeof(fd)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
|
||||
unsigned long prot, unsigned long flags,
|
||||
unsigned long fd, off_t offset)
|
||||
|
|
|
@ -45,23 +45,6 @@
|
|||
#include <asm/cacheflush.h>
|
||||
#include <asm/dma.h>
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage int sys_pipe(unsigned long __user *fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2 * sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/* common code for old and new mmaps */
|
||||
static inline long
|
||||
do_mmap2(unsigned long addr, unsigned long len,
|
||||
|
|
|
@ -40,8 +40,11 @@ asmlinkage int sys_pipe(unsigned long __user * fildes)
|
|||
error = do_pipe(fd);
|
||||
unlock_kernel();
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int))) {
|
||||
sys_close(fd[0]);
|
||||
sys_close(fd[1]);
|
||||
error = -EFAULT;
|
||||
}
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
|
|
@ -28,23 +28,6 @@
|
|||
#include <asm/setup.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage long sys_pipe(unsigned long __user * fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
|
||||
unsigned long prot, unsigned long flags,
|
||||
unsigned long fd, unsigned long pgoff)
|
||||
|
|
|
@ -27,23 +27,6 @@
|
|||
#include <asm/traps.h>
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage int sys_pipe(unsigned long * fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/* common code for old and new mmaps */
|
||||
static inline long do_mmap2(
|
||||
unsigned long addr, unsigned long len,
|
||||
|
|
|
@ -90,8 +90,11 @@ sys_pipe(unsigned long r0, unsigned long r1, unsigned long r2,
|
|||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user((void __user *)r0, fd, 2*sizeof(int)))
|
||||
if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) {
|
||||
sys_close(fd[0]);
|
||||
sys_close(fd[1]);
|
||||
error = -EFAULT;
|
||||
}
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
|
|
@ -30,23 +30,6 @@
|
|||
#include <asm/page.h>
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage int sys_pipe(unsigned long __user * fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/* common code for old and new mmaps */
|
||||
static inline long do_mmap2(
|
||||
unsigned long addr, unsigned long len,
|
||||
|
|
|
@ -468,15 +468,26 @@ static inline void access_error040(struct frame *fp)
|
|||
* (if do_page_fault didn't fix the mapping,
|
||||
* the writeback won't do good)
|
||||
*/
|
||||
disable_wb:
|
||||
#ifdef DEBUG
|
||||
printk(".. disabling wb2\n");
|
||||
#endif
|
||||
if (fp->un.fmt7.wb2a == fp->un.fmt7.faddr)
|
||||
fp->un.fmt7.wb2s &= ~WBV_040;
|
||||
if (fp->un.fmt7.wb3a == fp->un.fmt7.faddr)
|
||||
fp->un.fmt7.wb3s &= ~WBV_040;
|
||||
}
|
||||
} else if (send_fault_sig(&fp->ptregs) > 0) {
|
||||
printk("68040 access error, ssw=%x\n", ssw);
|
||||
trap_c(fp);
|
||||
} else {
|
||||
/* In case of a bus error we either kill the process or expect
|
||||
* the kernel to catch the fault, which then is also responsible
|
||||
* for cleaning up the mess.
|
||||
*/
|
||||
current->thread.signo = SIGBUS;
|
||||
current->thread.faddr = fp->un.fmt7.faddr;
|
||||
if (send_fault_sig(&fp->ptregs) >= 0)
|
||||
printk("68040 bus error (ssw=%x, faddr=%lx)\n", ssw,
|
||||
fp->un.fmt7.faddr);
|
||||
goto disable_wb;
|
||||
}
|
||||
|
||||
do_040writebacks(fp);
|
||||
|
|
|
@ -48,9 +48,6 @@
|
|||
struct mac_booter_data mac_bi_data;
|
||||
int mac_bisize = sizeof mac_bi_data;
|
||||
|
||||
struct mac_hw_present mac_hw_present;
|
||||
EXPORT_SYMBOL(mac_hw_present);
|
||||
|
||||
/* New m68k bootinfo stuff and videobase */
|
||||
|
||||
extern int m68k_num_memory;
|
||||
|
@ -817,27 +814,6 @@ void __init mac_identify(void)
|
|||
m68k_ramdisk.addr, m68k_ramdisk.size);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* TODO: set the various fields in macintosh_config->hw_present here!
|
||||
*/
|
||||
switch (macintosh_config->scsi_type) {
|
||||
case MAC_SCSI_OLD:
|
||||
MACHW_SET(MAC_SCSI_80);
|
||||
break;
|
||||
case MAC_SCSI_QUADRA:
|
||||
case MAC_SCSI_QUADRA2:
|
||||
case MAC_SCSI_QUADRA3:
|
||||
MACHW_SET(MAC_SCSI_96);
|
||||
if ((macintosh_config->ident == MAC_MODEL_Q900) ||
|
||||
(macintosh_config->ident == MAC_MODEL_Q950))
|
||||
MACHW_SET(MAC_SCSI_96_2);
|
||||
break;
|
||||
default:
|
||||
printk(KERN_WARNING "config.c: wtf: unknown scsi, using 53c80\n");
|
||||
MACHW_SET(MAC_SCSI_80);
|
||||
break;
|
||||
}
|
||||
|
||||
iop_init();
|
||||
via_init();
|
||||
oss_init();
|
||||
|
|
|
@ -28,23 +28,6 @@
|
|||
#include <asm/cacheflush.h>
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage int sys_pipe(unsigned long * fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/* common code for old and new mmaps */
|
||||
static inline long do_mmap2(
|
||||
unsigned long addr, unsigned long len,
|
||||
|
|
|
@ -28,23 +28,6 @@
|
|||
|
||||
#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way Unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage long sys_pipe(unsigned long __user *fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2 * sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* memory mapping syscall
|
||||
*/
|
||||
|
|
|
@ -33,19 +33,6 @@
|
|||
#include <linux/utsname.h>
|
||||
#include <linux/personality.h>
|
||||
|
||||
int sys_pipe(int __user *fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
static unsigned long get_unshared_area(unsigned long addr, unsigned long len)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
|
|
|
@ -136,23 +136,6 @@ int sys_ipc(uint call, int first, unsigned long second, long third,
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way unix traditionally does this, though.
|
||||
*/
|
||||
int sys_pipe(int __user *fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
static inline unsigned long do_mmap2(unsigned long addr, size_t len,
|
||||
unsigned long prot, unsigned long flags,
|
||||
unsigned long fd, unsigned long off, int shift)
|
||||
|
|
|
@ -49,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
|
|||
{ "inst_emu", VCPU_STAT(emulated_inst_exits) },
|
||||
{ "dec", VCPU_STAT(dec_exits) },
|
||||
{ "ext_intr", VCPU_STAT(ext_intr_exits) },
|
||||
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
|
@ -338,6 +339,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|||
}
|
||||
break;
|
||||
|
||||
case BOOKE_INTERRUPT_FP_UNAVAIL:
|
||||
kvmppc_queue_exception(vcpu, exit_nr);
|
||||
r = RESUME_GUEST;
|
||||
break;
|
||||
|
||||
case BOOKE_INTERRUPT_DATA_STORAGE:
|
||||
vcpu->arch.dear = vcpu->arch.fault_dear;
|
||||
vcpu->arch.esr = vcpu->arch.fault_esr;
|
||||
|
|
|
@ -36,13 +36,12 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
|
|||
|
||||
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
|
||||
{
|
||||
/* XXX implement me */
|
||||
return 0;
|
||||
return !!(v->arch.pending_exceptions);
|
||||
}
|
||||
|
||||
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
|
||||
{
|
||||
return 1;
|
||||
return !(v->arch.msr & MSR_WE);
|
||||
}
|
||||
|
||||
|
||||
|
@ -214,6 +213,11 @@ static void kvmppc_decrementer_func(unsigned long data)
|
|||
struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
|
||||
|
||||
kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
|
||||
|
||||
if (waitqueue_active(&vcpu->wq)) {
|
||||
wake_up_interruptible(&vcpu->wq);
|
||||
vcpu->stat.halt_wakeup++;
|
||||
}
|
||||
}
|
||||
|
||||
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
|
||||
|
@ -339,6 +343,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
|
|||
int r;
|
||||
sigset_t sigsaved;
|
||||
|
||||
vcpu_load(vcpu);
|
||||
|
||||
if (vcpu->sigset_active)
|
||||
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
|
||||
|
||||
|
@ -363,12 +369,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
|
|||
if (vcpu->sigset_active)
|
||||
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
|
||||
|
||||
vcpu_put(vcpu);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
|
||||
{
|
||||
kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
|
||||
|
||||
if (waitqueue_active(&vcpu->wq)) {
|
||||
wake_up_interruptible(&vcpu->wq);
|
||||
vcpu->stat.halt_wakeup++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,3 +23,4 @@ obj-$(CONFIG_SMP) += locks.o
|
|||
endif
|
||||
|
||||
obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
|
||||
obj-$(CONFIG_HAS_IOMEM) += devres.o
|
||||
|
|
arch/powerpc/lib/devres.c (new file, 42 lines)
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Copyright (C) 2008 Freescale Semiconductor, Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/device.h> /* devres_*(), devm_ioremap_release() */
|
||||
#include <linux/io.h> /* ioremap_flags() */
|
||||
#include <linux/module.h> /* EXPORT_SYMBOL() */
|
||||
|
||||
/**
|
||||
* devm_ioremap_prot - Managed ioremap_flags()
|
||||
* @dev: Generic device to remap IO address for
|
||||
* @offset: BUS offset to map
|
||||
* @size: Size of map
|
||||
* @flags: Page flags
|
||||
*
|
||||
* Managed ioremap_prot(). Map is automatically unmapped on driver
|
||||
* detach.
|
||||
*/
|
||||
void __iomem *devm_ioremap_prot(struct device *dev, resource_size_t offset,
|
||||
size_t size, unsigned long flags)
|
||||
{
|
||||
void __iomem **ptr, *addr;
|
||||
|
||||
ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL);
|
||||
if (!ptr)
|
||||
return NULL;
|
||||
|
||||
addr = ioremap_flags(offset, size, flags);
|
||||
if (addr) {
|
||||
*ptr = addr;
|
||||
devres_add(dev, ptr);
|
||||
} else
|
||||
devres_free(ptr);
|
||||
|
||||
return addr;
|
||||
}
|
||||
EXPORT_SYMBOL(devm_ioremap_prot);
|
|
@ -58,7 +58,9 @@ static struct resource mv643xx_eth0_resources[] = {
|
|||
|
||||
|
||||
static struct mv643xx_eth_platform_data eth0_pd = {
|
||||
.shared = &mv643xx_eth_shared_device,
|
||||
.port_number = 0,
|
||||
|
||||
.tx_sram_addr = PEGASOS2_SRAM_BASE_ETH0,
|
||||
.tx_sram_size = PEGASOS2_SRAM_TXRING_SIZE,
|
||||
.tx_queue_size = PEGASOS2_SRAM_TXRING_SIZE/16,
|
||||
|
@ -88,7 +90,9 @@ static struct resource mv643xx_eth1_resources[] = {
|
|||
};
|
||||
|
||||
static struct mv643xx_eth_platform_data eth1_pd = {
|
||||
.shared = &mv643xx_eth_shared_device,
|
||||
.port_number = 1,
|
||||
|
||||
.tx_sram_addr = PEGASOS2_SRAM_BASE_ETH1,
|
||||
.tx_sram_size = PEGASOS2_SRAM_TXRING_SIZE,
|
||||
.tx_queue_size = PEGASOS2_SRAM_TXRING_SIZE/16,
|
||||
|
|
|
@ -55,11 +55,6 @@ static ssize_t scanlog_read(struct file *file, char __user *buf,
|
|||
dp = PDE(inode);
|
||||
data = (unsigned int *)dp->data;
|
||||
|
||||
if (!data) {
|
||||
printk(KERN_ERR "scanlog: read failed no data\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (count > RTAS_DATA_BUF_SIZE)
|
||||
count = RTAS_DATA_BUF_SIZE;
|
||||
|
||||
|
@ -146,11 +141,6 @@ static int scanlog_open(struct inode * inode, struct file * file)
|
|||
struct proc_dir_entry *dp = PDE(inode);
|
||||
unsigned int *data = (unsigned int *)dp->data;
|
||||
|
||||
if (!data) {
|
||||
printk(KERN_ERR "scanlog: open failed no data\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (data[0] != 0) {
|
||||
/* This imperfect test stops a second copy of the
|
||||
* data (or a reset while data is being copied)
|
||||
|
@ -168,10 +158,6 @@ static int scanlog_release(struct inode * inode, struct file * file)
|
|||
struct proc_dir_entry *dp = PDE(inode);
|
||||
unsigned int *data = (unsigned int *)dp->data;
|
||||
|
||||
if (!data) {
|
||||
printk(KERN_ERR "scanlog: release failed no data\n");
|
||||
return -EIO;
|
||||
}
|
||||
data[0] = 0;
|
||||
|
||||
return 0;
|
||||
|
@ -200,12 +186,11 @@ static int __init scanlog_init(void)
|
|||
if (!data)
|
||||
goto err;
|
||||
|
||||
ent = proc_create("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
|
||||
&scanlog_fops);
|
||||
ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
|
||||
&scanlog_fops, data);
|
||||
if (!ent)
|
||||
goto err;
|
||||
|
||||
ent->data = data;
|
||||
proc_ppc64_scan_log_dump = ent;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -239,6 +239,8 @@ static int __init mv64x60_eth_device_setup(struct device_node *np, int id,
|
|||
|
||||
memset(&pdata, 0, sizeof(pdata));
|
||||
|
||||
pdata.shared = shared_pdev;
|
||||
|
||||
prop = of_get_property(np, "reg", NULL);
|
||||
if (!prop)
|
||||
return -ENODEV;
|
||||
|
|
|
@ -341,6 +341,7 @@ static struct resource mv64x60_eth0_resources[] = {
|
|||
};
|
||||
|
||||
static struct mv643xx_eth_platform_data eth0_pd = {
|
||||
.shared = &mv64x60_eth_shared_device;
|
||||
.port_number = 0,
|
||||
};
|
||||
|
||||
|
@ -366,6 +367,7 @@ static struct resource mv64x60_eth1_resources[] = {
|
|||
};
|
||||
|
||||
static struct mv643xx_eth_platform_data eth1_pd = {
|
||||
.shared = &mv64x60_eth_shared_device;
|
||||
.port_number = 1,
|
||||
};
|
||||
|
||||
|
@ -391,6 +393,7 @@ static struct resource mv64x60_eth2_resources[] = {
|
|||
};
|
||||
|
||||
static struct mv643xx_eth_platform_data eth2_pd = {
|
||||
.shared = &mv64x60_eth_shared_device;
|
||||
.port_number = 2,
|
||||
};
|
||||
|
||||
|
|
|
@ -430,6 +430,13 @@ config CMM_IUCV
|
|||
Select this option to enable the special message interface to
|
||||
the cooperative memory management.
|
||||
|
||||
config PAGE_STATES
|
||||
bool "Unused page notification"
|
||||
help
|
||||
This enables the notification of unused pages to the
|
||||
hypervisor. The ESSA instruction is used to do the states
|
||||
changes between a page that has content and the unused state.
|
||||
|
||||
config VIRT_TIMER
|
||||
bool "Virtual CPU timer support"
|
||||
help
|
||||
|
|
|
@ -121,7 +121,7 @@ sys32_ptrace_wrapper:
|
|||
lgfr %r3,%r3 # long
|
||||
llgtr %r4,%r4 # long
|
||||
llgfr %r5,%r5 # long
|
||||
jg sys_ptrace # branch to system call
|
||||
jg compat_sys_ptrace # branch to system call
|
||||
|
||||
.globl sys32_alarm_wrapper
|
||||
sys32_alarm_wrapper:
|
||||
|
|
|
@ -279,8 +279,6 @@ sysc_do_restart:
|
|||
st %r2,SP_R2(%r15) # store return value (change R2 on stack)
|
||||
|
||||
sysc_return:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
bno BASED(sysc_restore)
|
||||
tm __TI_flags+3(%r9),_TIF_WORK_SVC
|
||||
bnz BASED(sysc_work) # there is work to do (signals etc.)
|
||||
sysc_restore:
|
||||
|
@ -312,6 +310,8 @@ sysc_work_loop:
|
|||
# One of the work bits is on. Find out which one.
|
||||
#
|
||||
sysc_work:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
bno BASED(sysc_restore)
|
||||
tm __TI_flags+3(%r9),_TIF_MCCK_PENDING
|
||||
bo BASED(sysc_mcck_pending)
|
||||
tm __TI_flags+3(%r9),_TIF_NEED_RESCHED
|
||||
|
@ -602,12 +602,6 @@ io_no_vtime:
|
|||
la %r2,SP_PTREGS(%r15) # address of register-save area
|
||||
basr %r14,%r1 # branch to standard irq handler
|
||||
io_return:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
#ifdef CONFIG_PREEMPT
|
||||
bno BASED(io_preempt) # no -> check for preemptive scheduling
|
||||
#else
|
||||
bno BASED(io_restore) # no-> skip resched & signal
|
||||
#endif
|
||||
tm __TI_flags+3(%r9),_TIF_WORK_INT
|
||||
bnz BASED(io_work) # there is work to do (signals etc.)
|
||||
io_restore:
|
||||
|
@ -629,10 +623,18 @@ io_restore_trace_psw:
|
|||
.long 0, io_restore_trace + 0x80000000
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PREEMPT
|
||||
io_preempt:
|
||||
#
|
||||
# switch to kernel stack, then check the TIF bits
|
||||
#
|
||||
io_work:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
#ifndef CONFIG_PREEMPT
|
||||
bno BASED(io_restore) # no-> skip resched & signal
|
||||
#else
|
||||
bnz BASED(io_work_user) # no -> check for preemptive scheduling
|
||||
# check for preemptive scheduling
|
||||
icm %r0,15,__TI_precount(%r9)
|
||||
bnz BASED(io_restore)
|
||||
bnz BASED(io_restore) # preemption disabled
|
||||
l %r1,SP_R15(%r15)
|
||||
s %r1,BASED(.Lc_spsize)
|
||||
mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15)
|
||||
|
@ -646,10 +648,7 @@ io_resume_loop:
|
|||
br %r1 # call schedule
|
||||
#endif
|
||||
|
||||
#
|
||||
# switch to kernel stack, then check the TIF bits
|
||||
#
|
||||
io_work:
|
||||
io_work_user:
|
||||
l %r1,__LC_KERNEL_STACK
|
||||
s %r1,BASED(.Lc_spsize)
|
||||
mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15)
|
||||
|
|
|
@ -271,8 +271,6 @@ sysc_noemu:
|
|||
stg %r2,SP_R2(%r15) # store return value (change R2 on stack)
|
||||
|
||||
sysc_return:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
jno sysc_restore
|
||||
tm __TI_flags+7(%r9),_TIF_WORK_SVC
|
||||
jnz sysc_work # there is work to do (signals etc.)
|
||||
sysc_restore:
|
||||
|
@ -304,6 +302,8 @@ sysc_work_loop:
|
|||
# One of the work bits is on. Find out which one.
|
||||
#
|
||||
sysc_work:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
jno sysc_restore
|
||||
tm __TI_flags+7(%r9),_TIF_MCCK_PENDING
|
||||
jo sysc_mcck_pending
|
||||
tm __TI_flags+7(%r9),_TIF_NEED_RESCHED
|
||||
|
@ -585,12 +585,6 @@ io_no_vtime:
|
|||
la %r2,SP_PTREGS(%r15) # address of register-save area
|
||||
brasl %r14,do_IRQ # call standard irq handler
|
||||
io_return:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
#ifdef CONFIG_PREEMPT
|
||||
jno io_preempt # no -> check for preemptive scheduling
|
||||
#else
|
||||
jno io_restore # no-> skip resched & signal
|
||||
#endif
|
||||
tm __TI_flags+7(%r9),_TIF_WORK_INT
|
||||
jnz io_work # there is work to do (signals etc.)
|
||||
io_restore:
|
||||
|
@ -612,10 +606,41 @@ io_restore_trace_psw:
|
|||
.quad 0, io_restore_trace
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PREEMPT
|
||||
io_preempt:
|
||||
#
|
||||
# There is work todo, we need to check if we return to userspace, then
|
||||
# check, if we are in SIE, if yes leave it
|
||||
#
|
||||
io_work:
|
||||
tm SP_PSW+1(%r15),0x01 # returning to user ?
|
||||
#ifndef CONFIG_PREEMPT
|
||||
#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
|
||||
jnz io_work_user # yes -> no need to check for SIE
|
||||
la %r1, BASED(sie_opcode) # we return to kernel here
|
||||
lg %r2, SP_PSW+8(%r15)
|
||||
clc 0(2,%r1), 0(%r2) # is current instruction = SIE?
|
||||
jne io_restore # no-> return to kernel
|
||||
lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE
|
||||
aghi %r1, 4
|
||||
stg %r1, SP_PSW+8(%r15)
|
||||
j io_restore # return to kernel
|
||||
#else
|
||||
jno io_restore # no-> skip resched & signal
|
||||
#endif
|
||||
#else
|
||||
jnz io_work_user # yes -> do resched & signal
|
||||
#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
|
||||
la %r1, BASED(sie_opcode)
|
||||
lg %r2, SP_PSW+8(%r15)
|
||||
clc 0(2,%r1), 0(%r2) # is current instruction = SIE?
|
||||
jne 0f # no -> leave PSW alone
|
||||
lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE
|
||||
aghi %r1, 4
|
||||
stg %r1, SP_PSW+8(%r15)
|
||||
0:
|
||||
#endif
|
||||
# check for preemptive scheduling
|
||||
icm %r0,15,__TI_precount(%r9)
|
||||
jnz io_restore
|
||||
jnz io_restore # preemption is disabled
|
||||
# switch to kernel stack
|
||||
lg %r1,SP_R15(%r15)
|
||||
aghi %r1,-SP_SIZE
|
||||
|
@ -629,10 +654,7 @@ io_resume_loop:
|
|||
jg preempt_schedule_irq
|
||||
#endif
|
||||
|
||||
#
|
||||
# switch to kernel stack, then check TIF bits
|
||||
#
|
||||
io_work:
|
||||
io_work_user:
|
||||
lg %r1,__LC_KERNEL_STACK
|
||||
aghi %r1,-SP_SIZE
|
||||
mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15)
|
||||
|
@ -653,6 +675,11 @@ io_work_loop:
|
|||
j io_restore
|
||||
io_work_done:
|
||||
|
||||
#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
|
||||
sie_opcode:
|
||||
.long 0xb2140000
|
||||
#endif
|
||||
|
||||
#
|
||||
# _TIF_MCCK_PENDING is set, call handler
|
||||
#
|
||||
|
|
|
@ -292,8 +292,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
|
||||
long arch_ptrace(struct task_struct *child, long request, long addr, long data)
|
||||
{
|
||||
ptrace_area parea;
|
||||
int copied, ret;
|
||||
|
@ -529,35 +528,19 @@ poke_user_emu31(struct task_struct *child, addr_t addr, addr_t data)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
do_ptrace_emu31(struct task_struct *child, long request, long addr, long data)
|
||||
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
|
||||
compat_ulong_t caddr, compat_ulong_t cdata)
|
||||
{
|
||||
unsigned int tmp; /* 4 bytes !! */
|
||||
unsigned long addr = caddr;
|
||||
unsigned long data = cdata;
|
||||
ptrace_area_emu31 parea;
|
||||
int copied, ret;
|
||||
|
||||
switch (request) {
|
||||
case PTRACE_PEEKTEXT:
|
||||
case PTRACE_PEEKDATA:
|
||||
/* read word at location addr. */
|
||||
copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
|
||||
if (copied != sizeof(tmp))
|
||||
return -EIO;
|
||||
return put_user(tmp, (unsigned int __force __user *) data);
|
||||
|
||||
case PTRACE_PEEKUSR:
|
||||
/* read the word at location addr in the USER area. */
|
||||
return peek_user_emu31(child, addr, data);
|
||||
|
||||
case PTRACE_POKETEXT:
|
||||
case PTRACE_POKEDATA:
|
||||
/* write the word at location addr. */
|
||||
tmp = data;
|
||||
copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 1);
|
||||
if (copied != sizeof(tmp))
|
||||
return -EIO;
|
||||
return 0;
|
||||
|
||||
case PTRACE_POKEUSR:
|
||||
/* write the word at location addr in the USER area */
|
||||
return poke_user_emu31(child, addr, data);
|
||||
|
@ -587,82 +570,11 @@ do_ptrace_emu31(struct task_struct *child, long request, long addr, long data)
|
|||
copied += sizeof(unsigned int);
|
||||
}
|
||||
return 0;
|
||||
case PTRACE_GETEVENTMSG:
|
||||
return put_user((__u32) child->ptrace_message,
|
||||
(unsigned int __force __user *) data);
|
||||
case PTRACE_GETSIGINFO:
|
||||
if (child->last_siginfo == NULL)
|
||||
return -EINVAL;
|
||||
return copy_siginfo_to_user32((compat_siginfo_t
|
||||
__force __user *) data,
|
||||
child->last_siginfo);
|
||||
case PTRACE_SETSIGINFO:
|
||||
if (child->last_siginfo == NULL)
|
||||
return -EINVAL;
|
||||
return copy_siginfo_from_user32(child->last_siginfo,
|
||||
(compat_siginfo_t
|
||||
__force __user *) data);
|
||||
}
|
||||
return ptrace_request(child, request, addr, data);
|
||||
return compat_ptrace_request(child, request, addr, data);
|
||||
}
|
||||
#endif
|
||||
|
||||
long arch_ptrace(struct task_struct *child, long request, long addr, long data)
|
||||
{
|
||||
switch (request) {
|
||||
case PTRACE_SYSCALL:
|
||||
/* continue and stop at next (return from) syscall */
|
||||
case PTRACE_CONT:
|
||||
/* restart after signal. */
|
||||
if (!valid_signal(data))
|
||||
return -EIO;
|
||||
if (request == PTRACE_SYSCALL)
|
||||
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
|
||||
else
|
||||
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
|
||||
child->exit_code = data;
|
||||
/* make sure the single step bit is not set. */
|
||||
user_disable_single_step(child);
|
||||
wake_up_process(child);
|
||||
return 0;
|
||||
|
||||
case PTRACE_KILL:
|
||||
/*
|
||||
* make the child exit. Best I can do is send it a sigkill.
|
||||
* perhaps it should be put in the status that it wants to
|
||||
* exit.
|
||||
*/
|
||||
if (child->exit_state == EXIT_ZOMBIE) /* already dead */
|
||||
return 0;
|
||||
child->exit_code = SIGKILL;
|
||||
/* make sure the single step bit is not set. */
|
||||
user_disable_single_step(child);
|
||||
wake_up_process(child);
|
||||
return 0;
|
||||
|
||||
case PTRACE_SINGLESTEP:
|
||||
/* set the trap flag. */
|
||||
if (!valid_signal(data))
|
||||
return -EIO;
|
||||
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
|
||||
child->exit_code = data;
|
||||
user_enable_single_step(child);
|
||||
/* give it a chance to run. */
|
||||
wake_up_process(child);
|
||||
return 0;
|
||||
|
||||
/* Do requests that differ for 31/64 bit */
|
||||
default:
|
||||
#ifdef CONFIG_COMPAT
|
||||
if (test_thread_flag(TIF_31BIT))
|
||||
return do_ptrace_emu31(child, request, addr, data);
|
||||
#endif
|
||||
return do_ptrace_normal(child, request, addr, data);
|
||||
}
|
||||
/* Not reached. */
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
asmlinkage void
|
||||
syscall_trace(struct pt_regs *regs, int entryexit)
|
||||
{
|
||||
|
|
|
@ -32,23 +32,6 @@
|
|||
#include <asm/uaccess.h>
|
||||
#include "entry.h"
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way Unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage long sys_pipe(unsigned long __user *fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/* common code for old and new mmaps */
|
||||
static inline long do_mmap2(
|
||||
unsigned long addr, unsigned long len,
|
||||
|
|
|
@ -22,7 +22,6 @@ config KVM
|
|||
select PREEMPT_NOTIFIERS
|
||||
select ANON_INODES
|
||||
select S390_SWITCH_AMODE
|
||||
select PREEMPT
|
||||
---help---
|
||||
Support hosting paravirtualized guest machines using the SIE
|
||||
virtualization capability on the mainframe. This should work
|
||||
|
|
|
@ -105,6 +105,9 @@ static intercept_handler_t instruction_handlers[256] = {
|
|||
static int handle_noop(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
switch (vcpu->arch.sie_block->icptcode) {
|
||||
case 0x0:
|
||||
vcpu->stat.exit_null++;
|
||||
break;
|
||||
case 0x10:
|
||||
vcpu->stat.exit_external_request++;
|
||||
break;
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
|
||||
struct kvm_stats_debugfs_item debugfs_entries[] = {
|
||||
{ "userspace_handled", VCPU_STAT(exit_userspace) },
|
||||
{ "exit_null", VCPU_STAT(exit_null) },
|
||||
{ "exit_validity", VCPU_STAT(exit_validity) },
|
||||
{ "exit_stop_request", VCPU_STAT(exit_stop_request) },
|
||||
{ "exit_external_request", VCPU_STAT(exit_external_request) },
|
||||
|
@ -221,10 +222,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
|||
vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK;
|
||||
restore_fp_regs(&vcpu->arch.guest_fpregs);
|
||||
restore_access_regs(vcpu->arch.guest_acrs);
|
||||
|
||||
if (signal_pending(current))
|
||||
atomic_set_mask(CPUSTAT_STOP_INT,
|
||||
&vcpu->arch.sie_block->cpuflags);
|
||||
}
|
||||
|
||||
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
|
|
|
@ -5,3 +5,4 @@
|
|||
obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o
|
||||
obj-$(CONFIG_CMM) += cmm.o
|
||||
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
|
||||
obj-$(CONFIG_PAGE_STATES) += page-states.o
|
||||
|
|
|
@ -126,6 +126,9 @@ void __init mem_init(void)
|
|||
/* clear the zero-page */
|
||||
memset(empty_zero_page, 0, PAGE_SIZE);
|
||||
|
||||
/* Setup guest page hinting */
|
||||
cmma_init();
|
||||
|
||||
/* this will put all low memory onto the freelists */
|
||||
totalram_pages += free_all_bootmem();
|
||||
|
||||
|
|
arch/s390/mm/page-states.c (new file, 79 lines)
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* arch/s390/mm/page-states.c
|
||||
*
|
||||
* Copyright IBM Corp. 2008
|
||||
*
|
||||
* Guest page hinting for unused pages.
|
||||
*
|
||||
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/init.h>
|
||||
|
||||
#define ESSA_SET_STABLE 1
|
||||
#define ESSA_SET_UNUSED 2
|
||||
|
||||
static int cmma_flag;
|
||||
|
||||
static int __init cmma(char *str)
|
||||
{
|
||||
char *parm;
|
||||
parm = strstrip(str);
|
||||
if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) {
|
||||
cmma_flag = 1;
|
||||
return 1;
|
||||
}
|
||||
cmma_flag = 0;
|
||||
if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
__setup("cmma=", cmma);
|
||||
|
||||
void __init cmma_init(void)
|
||||
{
|
||||
register unsigned long tmp asm("0") = 0;
|
||||
register int rc asm("1") = -EOPNOTSUPP;
|
||||
|
||||
if (!cmma_flag)
|
||||
return;
|
||||
asm volatile(
|
||||
" .insn rrf,0xb9ab0000,%1,%1,0,0\n"
|
||||
"0: la %0,0\n"
|
||||
"1:\n"
|
||||
EX_TABLE(0b,1b)
|
||||
: "+&d" (rc), "+&d" (tmp));
|
||||
if (rc)
|
||||
cmma_flag = 0;
|
||||
}
|
||||
|
||||
void arch_free_page(struct page *page, int order)
|
||||
{
|
||||
int i, rc;
|
||||
|
||||
if (!cmma_flag)
|
||||
return;
|
||||
for (i = 0; i < (1 << order); i++)
|
||||
asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
|
||||
: "=&d" (rc)
|
||||
: "a" ((page_to_pfn(page) + i) << PAGE_SHIFT),
|
||||
"i" (ESSA_SET_UNUSED));
|
||||
}
|
||||
|
||||
void arch_alloc_page(struct page *page, int order)
|
||||
{
|
||||
int i, rc;
|
||||
|
||||
if (!cmma_flag)
|
||||
return;
|
||||
for (i = 0; i < (1 << order); i++)
|
||||
asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
|
||||
: "=&d" (rc)
|
||||
: "a" ((page_to_pfn(page) + i) << PAGE_SHIFT),
|
||||
"i" (ESSA_SET_STABLE));
|
||||
}
|
|
@ -30,23 +30,6 @@
|
|||
#include <asm/ptrace.h>
|
||||
#include <asm/unistd.h>
|
||||
|
||||
/*
|
||||
* sys_pipe() is the normal C calling standard for creating
|
||||
* a pipe. It's not the way Unix traditionally does this, though.
|
||||
*/
|
||||
asmlinkage int sys_pipe(unsigned long * fildes)
|
||||
{
|
||||
int fd[2];
|
||||
int error;
|
||||
|
||||
error = do_pipe(fd);
|
||||
if (!error) {
|
||||
if (copy_to_user(fildes, fd, 2*sizeof(int)))
|
||||
error = -EFAULT;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do a system call from kernel instead of calling sys_execve so we
|
||||
* end up with proper pt_regs.
|
||||
|
|
|
@ -419,14 +419,26 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags,
|
|||
unsigned long stack_size)
|
||||
{
|
||||
unsigned long parent_tid_ptr, child_tid_ptr;
|
||||
unsigned long orig_i1 = regs->u_regs[UREG_I1];
|
||||
long ret;
|
||||
|
||||
parent_tid_ptr = regs->u_regs[UREG_I2];
|
||||
child_tid_ptr = regs->u_regs[UREG_I4];
|
||||
|
||||
return do_fork(clone_flags, stack_start,
|
||||
regs, stack_size,
|
||||
(int __user *) parent_tid_ptr,
|
||||
(int __user *) child_tid_ptr);
|
||||
ret = do_fork(clone_flags, stack_start,
|
||||
regs, stack_size,
|
||||
(int __user *) parent_tid_ptr,
|
||||
(int __user *) child_tid_ptr);
|
||||
|
||||
/* If we get an error and potentially restart the system
|
||||
* call, we're screwed because copy_thread() clobbered
|
||||
* the parent's %o1. So detect that case and restore it
|
||||
* here.
|
||||
*/
|
||||
if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK)
|
||||
regs->u_regs[UREG_I1] = orig_i1;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Copy a Sparc thread. The fork() return value conventions
|
||||
|
|
|
@ -245,15 +245,29 @@ static inline int invalid_frame_pointer(void __user *fp, int fplen)
|
|||
|
||||
static inline void __user *get_sigframe(struct sigaction *sa, struct pt_regs *regs, unsigned long framesize)
|
||||
{
|
||||
unsigned long sp;
|
||||
unsigned long sp = regs->u_regs[UREG_FP];
|
||||
|
||||
sp = regs->u_regs[UREG_FP];
|
||||
/*
|
||||
* If we are on the alternate signal stack and would overflow it, don't.
|
||||
* Return an always-bogus address instead so we will die with SIGSEGV.
|
||||
*/
|
||||
if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize)))
|
||||
return (void __user *) -1L;
|
||||
|
||||
/* This is the X/Open sanctioned signal stack switching. */
|
||||
if (sa->sa_flags & SA_ONSTACK) {
|
||||
if (!on_sig_stack(sp) && !((current->sas_ss_sp + current->sas_ss_size) & 7))
|
||||
if (sas_ss_flags(sp) == 0)
|
||||
sp = current->sas_ss_sp + current->sas_ss_size;
|
||||
}
|
||||
|
||||
/* Always align the stack frame. This handles two cases. First,
|
||||
* sigaltstack need not be mindful of platform specific stack
|
||||
* alignment. Second, if we took this signal because the stack
|
||||
* is not aligned properly, we'd like to take the signal cleanly
|
||||
* and report that.
|
||||
*/
|
||||
sp &= ~7UL;
|
||||
|
||||
return (void __user *)(sp - framesize);
|
||||
}
|
||||
|
||||
|
|
|
@ -223,8 +223,7 @@ int sparc_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
|
|||
{
|
||||
if (ARCH_SUN4C_SUN4 &&
|
||||
(len > 0x20000000 ||
|
||||
((flags & MAP_FIXED) &&
|
||||
addr < 0xe0000000 && addr + len > 0x20000000)))
|
||||
(addr < 0xe0000000 && addr + len > 0x20000000)))
|
||||
return -EINVAL;
|
||||
|
||||
/* See asm-sparc/uaccess.h */
|
||||
|
|
|
@@ -503,6 +503,8 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags,
                              unsigned long stack_size)
{
	int __user *parent_tid_ptr, *child_tid_ptr;
	unsigned long orig_i1 = regs->u_regs[UREG_I1];
	long ret;

#ifdef CONFIG_COMPAT
	if (test_thread_flag(TIF_32BIT)) {
@@ -515,9 +517,19 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags,
		child_tid_ptr = (int __user *) regs->u_regs[UREG_I4];
	}

	return do_fork(clone_flags, stack_start,
		       regs, stack_size,
		       parent_tid_ptr, child_tid_ptr);
	ret = do_fork(clone_flags, stack_start,
		      regs, stack_size,
		      parent_tid_ptr, child_tid_ptr);

	/* If we get an error and potentially restart the system
	 * call, we're screwed because copy_thread() clobbered
	 * the parent's %o1. So detect that case and restore it
	 * here.
	 */
	if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK)
		regs->u_regs[UREG_I1] = orig_i1;

	return ret;
}

/* Copy a Sparc thread. The fork() return value conventions
@@ -376,16 +376,29 @@ save_fpu_state(struct pt_regs *regs, __siginfo_fpu_t __user *fpu)

static inline void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, unsigned long framesize)
{
	unsigned long sp;
	unsigned long sp = regs->u_regs[UREG_FP] + STACK_BIAS;

	sp = regs->u_regs[UREG_FP] + STACK_BIAS;
	/*
	 * If we are on the alternate signal stack and would overflow it, don't.
	 * Return an always-bogus address instead so we will die with SIGSEGV.
	 */
	if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize)))
		return (void __user *) -1L;

	/* This is the X/Open sanctioned signal stack switching. */
	if (ka->sa.sa_flags & SA_ONSTACK) {
		if (!on_sig_stack(sp) &&
		    !((current->sas_ss_sp + current->sas_ss_size) & 7))
		if (sas_ss_flags(sp) == 0)
			sp = current->sas_ss_sp + current->sas_ss_size;
	}

	/* Always align the stack frame. This handles two cases. First,
	 * sigaltstack need not be mindful of platform specific stack
	 * alignment. Second, if we took this signal because the stack
	 * is not aligned properly, we'd like to take the signal cleanly
	 * and report that.
	 */
	sp &= ~7UL;

	return (void __user *)(sp - framesize);
}
@@ -406,11 +406,27 @@ static void __user *get_sigframe(struct sigaction *sa, struct pt_regs *regs, uns
		regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
	sp = regs->u_regs[UREG_FP];

	/*
	 * If we are on the alternate signal stack and would overflow it, don't.
	 * Return an always-bogus address instead so we will die with SIGSEGV.
	 */
	if (on_sig_stack(sp) && !likely(on_sig_stack(sp - framesize)))
		return (void __user *) -1L;

	/* This is the X/Open sanctioned signal stack switching. */
	if (sa->sa_flags & SA_ONSTACK) {
		if (!on_sig_stack(sp) && !((current->sas_ss_sp + current->sas_ss_size) & 7))
		if (sas_ss_flags(sp) == 0)
			sp = current->sas_ss_sp + current->sas_ss_size;
	}

	/* Always align the stack frame. This handles two cases. First,
	 * sigaltstack need not be mindful of platform specific stack
	 * alignment. Second, if we took this signal because the stack
	 * is not aligned properly, we'd like to take the signal cleanly
	 * and report that.
	 */
	sp &= ~7UL;

	return (void __user *)(sp - framesize);
}
@@ -865,21 +865,14 @@ void smp_call_function_client(int irq, struct pt_regs *regs)
	void *info = call_data->info;

	clear_softint(1 << irq);

	irq_enter();

	if (!call_data->wait) {
		/* let initiator proceed after getting data */
		atomic_inc(&call_data->finished);
	}

	func(info);

	irq_exit();

	if (call_data->wait) {
		/* let initiator proceed only after completion */
		func(info);
		atomic_inc(&call_data->finished);
	} else {
		/* let initiator proceed after getting data */
		atomic_inc(&call_data->finished);
		func(info);
	}
}
@@ -1041,9 +1034,7 @@ void smp_receive_signal(int cpu)

void smp_receive_signal_client(int irq, struct pt_regs *regs)
{
	irq_enter();
	clear_softint(1 << irq);
	irq_exit();
}

void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
@@ -1051,8 +1042,6 @@ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
	struct mm_struct *mm;
	unsigned long flags;

	irq_enter();

	clear_softint(1 << irq);

	/* See if we need to allocate a new TLB context because
@@ -1072,8 +1061,6 @@ void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
		load_secondary_context(mm);
		__flush_tlb_mm(CTX_HWBITS(mm->context),
			       SECONDARY_CONTEXT);

	irq_exit();
}

void smp_new_mmu_context_version(void)
@@ -1239,8 +1226,6 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs)
{
	clear_softint(1 << irq);

	irq_enter();

	preempt_disable();

	__asm__ __volatile__("flushw");
@@ -1253,8 +1238,6 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs)
	prom_world(0);

	preempt_enable();

	irq_exit();
}

/* /proc/profile writes can call this, don't __init it please. */
@@ -549,13 +549,13 @@ int sparc64_mmap_check(unsigned long addr, unsigned long len,
		if (len >= STACK_TOP32)
			return -EINVAL;

		if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len)
		if (addr > STACK_TOP32 - len)
			return -EINVAL;
	} else {
		if (len >= VA_EXCLUDE_START)
			return -EINVAL;

		if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len))
		if (invalid_64bit_range(addr, len))
			return -EINVAL;
	}
@@ -236,13 +236,6 @@ asmlinkage long sys32_getegid16(void)

/* 32-bit timeval and related flotsam. */

static long get_tv32(struct timeval *o, struct compat_timeval __user *i)
{
	return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
		(__get_user(o->tv_sec, &i->tv_sec) |
		 __get_user(o->tv_usec, &i->tv_usec)));
}

static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
{
	return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
@@ -757,30 +750,6 @@ asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
}

asmlinkage long sys32_utimes(char __user *filename,
			     struct compat_timeval __user *tvs)
{
	struct timespec tv[2];

	if (tvs) {
		struct timeval ktvs[2];
		if (get_tv32(&ktvs[0], tvs) ||
		    get_tv32(&ktvs[1], 1+tvs))
			return -EFAULT;

		if (ktvs[0].tv_usec < 0 || ktvs[0].tv_usec >= 1000000 ||
		    ktvs[1].tv_usec < 0 || ktvs[1].tv_usec >= 1000000)
			return -EINVAL;

		tv[0].tv_sec = ktvs[0].tv_sec;
		tv[0].tv_nsec = 1000 * ktvs[0].tv_usec;
		tv[1].tv_sec = ktvs[1].tv_sec;
		tv[1].tv_nsec = 1000 * ktvs[1].tv_usec;
	}

	return do_utimes(AT_FDCWD, filename, tvs ? tv : NULL, 0);
}

/* These are here just in case some old sparc32 binary calls it. */
asmlinkage long sys32_pause(void)
{
@@ -45,7 +45,7 @@ sys_call_table32:
/*120*/	.word compat_sys_readv, compat_sys_writev, sys32_settimeofday, sys32_fchown16, sys_fchmod
	.word sys_nis_syscall, sys32_setreuid16, sys32_setregid16, sys_rename, sys_truncate
/*130*/	.word sys_ftruncate, sys_flock, compat_sys_lstat64, sys_nis_syscall, sys_nis_syscall
	.word sys_nis_syscall, sys32_mkdir, sys_rmdir, sys32_utimes, compat_sys_stat64
	.word sys_nis_syscall, sys32_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64
/*140*/	.word sys32_sendfile64, sys_nis_syscall, sys32_futex, sys_gettid, compat_sys_getrlimit
	.word compat_sys_setrlimit, sys_pivot_root, sys32_prctl, sys_pciconfig_read, sys_pciconfig_write
/*150*/	.word sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64
@@ -771,6 +771,9 @@ static void __init find_ramdisk(unsigned long phys_base)
		initrd_end = ramdisk_image + sparc_ramdisk_size;

		lmb_reserve(initrd_start, initrd_end);

		initrd_start += PAGE_OFFSET;
		initrd_end += PAGE_OFFSET;
	}
#endif
}
@@ -2362,16 +2365,3 @@ void __flush_tlb_all(void)
	__asm__ __volatile__("wrpr %0, 0, %%pstate"
			     : : "r" (pstate));
}

#ifdef CONFIG_MEMORY_HOTPLUG

void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -191,9 +191,9 @@ void line_flush_chars(struct tty_struct *tty)
	line_flush_buffer(tty);
}

void line_put_char(struct tty_struct *tty, unsigned char ch)
int line_put_char(struct tty_struct *tty, unsigned char ch)
{
	line_write(tty, &ch, sizeof(ch));
	return line_write(tty, &ch, sizeof(ch));
}

int line_write(struct tty_struct *tty, const unsigned char *buf, int len)
@@ -71,7 +71,7 @@ extern int line_setup(struct line *lines, unsigned int sizeof_lines,
		      char *init, char **error_out);
extern int line_write(struct tty_struct *tty, const unsigned char *buf,
		      int len);
extern void line_put_char(struct tty_struct *tty, unsigned char ch);
extern int line_put_char(struct tty_struct *tty, unsigned char ch);
extern void line_set_termios(struct tty_struct *tty, struct ktermios * old);
extern int line_chars_in_buffer(struct tty_struct *tty);
extern void line_flush_buffer(struct tty_struct *tty);
@@ -73,23 +73,6 @@ long old_mmap(unsigned long addr, unsigned long len,
 out:
	return err;
}
/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way unix traditionally does this, though.
 */
long sys_pipe(unsigned long __user * fildes)
{
	int fd[2];
	long error;

	error = do_pipe(fd);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd)))
			error = -EFAULT;
	}
	return error;
}


long sys_uname(struct old_utsname __user * name)
{
@@ -132,23 +132,6 @@ sys_ipc (uint call, int first, int second, int third, void *ptr, long fifth)
	return ret;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way unix traditionally does this, though.
 */
int sys_pipe (int *fildes)
{
	int fd[2];
	int error;

	error = do_pipe (fd);
	if (!error) {
		if (copy_to_user (fildes, fd, 2*sizeof (int)))
			error = -EFAULT;
	}
	return error;
}

static inline unsigned long
do_mmap2 (unsigned long addr, size_t len,
	  unsigned long prot, unsigned long flags,
@@ -18,6 +18,7 @@ config X86_64
### Arch settings
config X86
	def_bool y
	select HAVE_UNSTABLE_SCHED_CLOCK
	select HAVE_IDE
	select HAVE_OPROFILE
	select HAVE_KPROBES
@@ -1661,6 +1662,7 @@ config GEODE_MFGPT_TIMER

config OLPC
	bool "One Laptop Per Child support"
	depends on MGEODE_LX
	default n
	help
	  Add support for detecting the unique features of the OLPC
@@ -191,7 +191,7 @@ static void read_ehdr(FILE *fp)
		die("Cannot read ELF header: %s\n",
			strerror(errno));
	}
	if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
		die("No ELF magic\n");
	}
	if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
@@ -83,9 +83,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o

ifdef CONFIG_INPUT_PCSPKR
obj-y += pcspeaker.o
endif
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o

obj-$(CONFIG_SCx200) += scx200.o
scx200-y += scx200_32.o
@@ -10,5 +10,5 @@ endif
$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin

$(obj)/realmode/wakeup.bin: FORCE
	$(Q)$(MAKE) $(build)=$(obj)/realmode $@
	$(Q)$(MAKE) $(build)=$(obj)/realmode
@@ -6,7 +6,8 @@
# for more details.
#

targets := wakeup.bin wakeup.elf
always := wakeup.bin
targets := wakeup.elf wakeup.lds

wakeup-y += wakeup.o wakemain.o video-mode.o copy.o

@@ -48,7 +49,7 @@ LDFLAGS_wakeup.elf := -T

CPPFLAGS_wakeup.lds += -P -C

$(obj)/wakeup.elf: $(src)/wakeup.lds $(WAKEUP_OBJS) FORCE
$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
	$(call if_changed,ld)

OBJCOPYFLAGS_wakeup.bin := -O binary
@@ -133,6 +133,7 @@ static int kvm_register_clock(void)
	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}

#ifdef CONFIG_X86_LOCAL_APIC
static void kvm_setup_secondary_clock(void)
{
	/*
@@ -143,6 +144,7 @@ static void kvm_setup_secondary_clock(void)
	/* ok, done with our trickery, call native */
	setup_secondary_APIC_clock();
}
#endif

/*
 * After the clock is registered, the host will keep writing to the
@@ -177,7 +179,9 @@ void __init kvmclock_init(void)
	pv_time_ops.get_wallclock = kvm_get_wallclock;
	pv_time_ops.set_wallclock = kvm_set_wallclock;
	pv_time_ops.sched_clock = kvm_clock_read;
#ifdef CONFIG_X86_LOCAL_APIC
	pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
	machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
	machine_ops.crash_shutdown = kvm_crash_shutdown;
@@ -794,6 +794,11 @@ void __init find_smp_config(void)
                            ACPI-based MP Configuration
   -------------------------------------------------------------------------- */

/*
 * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
 */
int es7000_plat;

#ifdef CONFIG_ACPI

#ifdef CONFIG_X86_IO_APIC
@@ -909,8 +914,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
	MP_intsrc_info(&intsrc);
}

int es7000_plat;

void __init mp_config_acpi_legacy_irqs(void)
{
	struct mpc_config_intsrc intsrc;
@@ -149,7 +149,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
			DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
		},
	},
	{ /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
@@ -95,7 +95,7 @@ void __init setup_per_cpu_areas(void)

	/* Copy section for each CPU (we discard the original) */
	size = PERCPU_ENOUGH_ROOM;
	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
			  size);

	for_each_possible_cpu(i) {
@@ -299,7 +299,7 @@ static void __cpuinit smp_callin(void)
/*
 * Activate a secondary processor.
 */
void __cpuinit start_secondary(void *unused)
static void __cpuinit start_secondary(void *unused)
{
	/*
	 * Don't put *anything* before cpu_init(), SMP booting is too
@@ -1306,7 +1306,7 @@ static void remove_siblinginfo(int cpu)
	cpu_clear(cpu, cpu_sibling_setup_map);
}

int additional_cpus __initdata = -1;
static int additional_cpus __initdata = -1;

static __init int setup_additional_cpus(char *s)
{
@@ -22,23 +22,6 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
asmlinkage int sys_pipe(unsigned long __user * fildes)
{
	int fd[2];
	int error;

	error = do_pipe(fd);
	if (!error) {
		if (copy_to_user(fildes, fd, 2*sizeof(int)))
			error = -EFAULT;
	}
	return error;
}

asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
			  unsigned long prot, unsigned long flags,
			  unsigned long fd, unsigned long pgoff)
@@ -17,23 +17,6 @@
#include <asm/uaccess.h>
#include <asm/ia32.h>

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
asmlinkage long sys_pipe(int __user *fildes)
{
	int fd[2];
	int error;

	error = do_pipe(fd);
	if (!error) {
		if (copy_to_user(fildes, fd, 2*sizeof(int)))
			error = -EFAULT;
	}
	return error;
}

asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
	unsigned long fd, unsigned long off)
{
@@ -288,6 +288,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
	switch (ps->channels[0].mode) {
	case 1:
        /* FIXME: enhance mode 4 precision */
	case 4:
		create_pit_timer(&ps->pit_timer, val, 0);
		break;
	case 2:
@ -79,36 +79,6 @@ static int dbg = 1;
|
|||
}
|
||||
#endif
|
||||
|
||||
#define PT64_PT_BITS 9
|
||||
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
|
||||
#define PT32_PT_BITS 10
|
||||
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
|
||||
|
||||
#define PT_WRITABLE_SHIFT 1
|
||||
|
||||
#define PT_PRESENT_MASK (1ULL << 0)
|
||||
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
|
||||
#define PT_USER_MASK (1ULL << 2)
|
||||
#define PT_PWT_MASK (1ULL << 3)
|
||||
#define PT_PCD_MASK (1ULL << 4)
|
||||
#define PT_ACCESSED_MASK (1ULL << 5)
|
||||
#define PT_DIRTY_MASK (1ULL << 6)
|
||||
#define PT_PAGE_SIZE_MASK (1ULL << 7)
|
||||
#define PT_PAT_MASK (1ULL << 7)
|
||||
#define PT_GLOBAL_MASK (1ULL << 8)
|
||||
#define PT64_NX_SHIFT 63
|
||||
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
|
||||
|
||||
#define PT_PAT_SHIFT 7
|
||||
#define PT_DIR_PAT_SHIFT 12
|
||||
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
|
||||
|
||||
#define PT32_DIR_PSE36_SIZE 4
|
||||
#define PT32_DIR_PSE36_SHIFT 13
|
||||
#define PT32_DIR_PSE36_MASK \
|
||||
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
|
||||
|
||||
|
||||
#define PT_FIRST_AVAIL_BITS_SHIFT 9
|
||||
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
|
||||
|
||||
|
@ -154,10 +124,6 @@ static int dbg = 1;
|
|||
#define PFERR_USER_MASK (1U << 2)
|
||||
#define PFERR_FETCH_MASK (1U << 4)
|
||||
|
||||
#define PT64_ROOT_LEVEL 4
|
||||
#define PT32_ROOT_LEVEL 2
|
||||
#define PT32E_ROOT_LEVEL 3
|
||||
|
||||
#define PT_DIRECTORY_LEVEL 2
|
||||
#define PT_PAGE_TABLE_LEVEL 1
|
||||
|
||||
|
@ -186,6 +152,12 @@ static struct kmem_cache *mmu_page_header_cache;
|
|||
|
||||
static u64 __read_mostly shadow_trap_nonpresent_pte;
|
||||
static u64 __read_mostly shadow_notrap_nonpresent_pte;
|
||||
static u64 __read_mostly shadow_base_present_pte;
|
||||
static u64 __read_mostly shadow_nx_mask;
|
||||
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
|
||||
static u64 __read_mostly shadow_user_mask;
|
||||
static u64 __read_mostly shadow_accessed_mask;
|
||||
static u64 __read_mostly shadow_dirty_mask;
|
||||
|
||||
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
|
||||
{
|
||||
|
@ -194,6 +166,23 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
|
||||
|
||||
void kvm_mmu_set_base_ptes(u64 base_pte)
|
||||
{
|
||||
shadow_base_present_pte = base_pte;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
|
||||
|
||||
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
|
||||
u64 dirty_mask, u64 nx_mask, u64 x_mask)
|
||||
{
|
||||
shadow_user_mask = user_mask;
|
||||
shadow_accessed_mask = accessed_mask;
|
||||
shadow_dirty_mask = dirty_mask;
|
||||
shadow_nx_mask = nx_mask;
|
||||
shadow_x_mask = x_mask;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
|
||||
|
||||
static int is_write_protection(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return vcpu->arch.cr0 & X86_CR0_WP;
|
||||
|
@ -232,7 +221,7 @@ static int is_writeble_pte(unsigned long pte)
|
|||
|
||||
static int is_dirty_pte(unsigned long pte)
|
||||
{
|
||||
return pte & PT_DIRTY_MASK;
|
||||
return pte & shadow_dirty_mask;
|
||||
}
|
||||
|
||||
static int is_rmap_pte(u64 pte)
|
||||
|
@ -387,7 +376,6 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
|
|||
|
||||
write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
|
||||
*write_count += 1;
|
||||
WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
|
||||
}
|
||||
|
||||
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
|
||||
|
@ -547,7 +535,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
|
|||
return;
|
||||
sp = page_header(__pa(spte));
|
||||
pfn = spte_to_pfn(*spte);
|
||||
if (*spte & PT_ACCESSED_MASK)
|
||||
if (*spte & shadow_accessed_mask)
|
||||
kvm_set_pfn_accessed(pfn);
|
||||
if (is_writeble_pte(*spte))
|
||||
kvm_release_pfn_dirty(pfn);
|
||||
|
@ -1073,17 +1061,17 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
|
|||
* whether the guest actually used the pte (in order to detect
|
||||
* demand paging).
|
||||
*/
|
||||
spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
|
||||
spte = shadow_base_present_pte | shadow_dirty_mask;
|
||||
if (!speculative)
|
||||
pte_access |= PT_ACCESSED_MASK;
|
||||
if (!dirty)
|
||||
pte_access &= ~ACC_WRITE_MASK;
|
||||
if (!(pte_access & ACC_EXEC_MASK))
|
||||
spte |= PT64_NX_MASK;
|
||||
|
||||
spte |= PT_PRESENT_MASK;
|
||||
if (pte_access & ACC_EXEC_MASK)
|
||||
spte |= shadow_x_mask;
|
||||
else
|
||||
spte |= shadow_nx_mask;
|
||||
if (pte_access & ACC_USER_MASK)
|
||||
spte |= PT_USER_MASK;
|
||||
spte |= shadow_user_mask;
|
||||
if (largepage)
|
||||
spte |= PT_PAGE_SIZE_MASK;
|
||||
|
||||
|
@ -1188,8 +1176,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
|
|||
return -ENOMEM;
|
||||
}
|
||||
|
||||
table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
|
||||
| PT_WRITABLE_MASK | PT_USER_MASK;
|
||||
table[index] = __pa(new_table->spt)
|
||||
| PT_PRESENT_MASK | PT_WRITABLE_MASK
|
||||
| shadow_user_mask | shadow_x_mask;
|
||||
}
|
||||
table_addr = table[index] & PT64_BASE_ADDR_MASK;
|
||||
}
|
||||
|
@ -1244,7 +1233,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
|
|||
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
|
||||
return;
|
||||
spin_lock(&vcpu->kvm->mmu_lock);
|
||||
#ifdef CONFIG_X86_64
|
||||
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
|
||||
hpa_t root = vcpu->arch.mmu.root_hpa;
|
||||
|
||||
|
@ -1256,7 +1244,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
|
|||
spin_unlock(&vcpu->kvm->mmu_lock);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
for (i = 0; i < 4; ++i) {
|
||||
hpa_t root = vcpu->arch.mmu.pae_root[i];
|
||||
|
||||
|
@ -1282,7 +1269,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
|
|||
|
||||
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
|
||||
hpa_t root = vcpu->arch.mmu.root_hpa;
|
||||
|
||||
|
@ -1297,7 +1283,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
|
|||
vcpu->arch.mmu.root_hpa = root;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
metaphysical = !is_paging(vcpu);
|
||||
if (tdp_enabled)
|
||||
metaphysical = 1;
|
||||
|
@ -1377,7 +1362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
|
|||
spin_lock(&vcpu->kvm->mmu_lock);
|
||||
kvm_mmu_free_some_pages(vcpu);
|
||||
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
|
||||
largepage, gfn, pfn, TDP_ROOT_LEVEL);
|
||||
largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
|
||||
spin_unlock(&vcpu->kvm->mmu_lock);
|
||||
|
||||
return r;
|
||||
|
@ -1484,7 +1469,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
|
|||
context->page_fault = tdp_page_fault;
|
||||
context->free = nonpaging_free;
|
||||
context->prefetch_page = nonpaging_prefetch_page;
|
||||
context->shadow_root_level = TDP_ROOT_LEVEL;
|
||||
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
|
||||
context->root_hpa = INVALID_PAGE;
|
||||
|
||||
if (!is_paging(vcpu)) {
|
||||
|
@ -1633,7 +1618,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
u64 *spte = vcpu->arch.last_pte_updated;
|
||||
|
||||
return !!(spte && (*spte & PT_ACCESSED_MASK));
|
||||
return !!(spte && (*spte & shadow_accessed_mask));
|
||||
}
|
||||
|
||||
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
|
||||
|
|
|
@ -3,11 +3,38 @@
|
|||
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
|
||||
#else
|
||||
#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
|
||||
#endif
|
||||
#define PT64_PT_BITS 9
|
||||
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
|
||||
#define PT32_PT_BITS 10
|
||||
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
|
||||
|
||||
#define PT_WRITABLE_SHIFT 1
|
||||
|
||||
#define PT_PRESENT_MASK (1ULL << 0)
|
||||
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
|
||||
#define PT_USER_MASK (1ULL << 2)
|
||||
#define PT_PWT_MASK (1ULL << 3)
|
||||
#define PT_PCD_MASK (1ULL << 4)
|
||||
#define PT_ACCESSED_MASK (1ULL << 5)
|
||||
#define PT_DIRTY_MASK (1ULL << 6)
|
||||
#define PT_PAGE_SIZE_MASK (1ULL << 7)
|
||||
#define PT_PAT_MASK (1ULL << 7)
|
||||
#define PT_GLOBAL_MASK (1ULL << 8)
|
||||
#define PT64_NX_SHIFT 63
|
||||
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
|
||||
|
||||
#define PT_PAT_SHIFT 7
|
||||
#define PT_DIR_PAT_SHIFT 12
|
||||
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
|
||||
|
||||
#define PT32_DIR_PSE36_SIZE 4
|
||||
#define PT32_DIR_PSE36_SHIFT 13
|
||||
#define PT32_DIR_PSE36_MASK \
|
||||
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
|
||||
|
||||
#define PT64_ROOT_LEVEL 4
|
||||
#define PT32_ROOT_LEVEL 2
|
||||
#define PT32E_ROOT_LEVEL 3
|
||||
|
||||
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
|
|
|
@ -1863,6 +1863,15 @@ static bool svm_cpu_has_accelerated_tpr(void)
|
|||
return false;
|
||||
}
|
||||
|
||||
static int get_npt_level(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
return PT64_ROOT_LEVEL;
|
||||
#else
|
||||
return PT32E_ROOT_LEVEL;
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct kvm_x86_ops svm_x86_ops = {
|
||||
.cpu_has_kvm_support = has_svm,
|
||||
.disabled_by_bios = is_disabled,
|
||||
|
@ -1920,6 +1929,7 @@ static struct kvm_x86_ops svm_x86_ops = {
|
|||
.inject_pending_vectors = do_interrupt_requests,
|
||||
|
||||
.set_tss_addr = svm_set_tss_addr,
|
||||
.get_tdp_level = get_npt_level,
|
||||
};
|
||||
|
||||
static int __init svm_init(void)
|
||||
|
|
|
@ -42,6 +42,9 @@ module_param(enable_vpid, bool, 0);
|
|||
static int flexpriority_enabled = 1;
|
||||
module_param(flexpriority_enabled, bool, 0);
|
||||
|
||||
static int enable_ept = 1;
|
||||
module_param(enable_ept, bool, 0);
|
||||
|
||||
struct vmcs {
|
||||
u32 revision_id;
|
||||
u32 abort;
|
||||
|
@ -84,7 +87,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
|
|||
return container_of(vcpu, struct vcpu_vmx, vcpu);
|
||||
}
|
||||
|
||||
static int init_rmode_tss(struct kvm *kvm);
|
||||
static int init_rmode(struct kvm *kvm);
|
||||
|
||||
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
|
||||
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
|
||||
|
@ -107,6 +110,11 @@ static struct vmcs_config {
|
|||
u32 vmentry_ctrl;
|
||||
} vmcs_config;
|
||||
|
||||
struct vmx_capability {
|
||||
u32 ept;
|
||||
u32 vpid;
|
||||
} vmx_capability;
|
||||
|
||||
#define VMX_SEGMENT_FIELD(seg) \
|
||||
[VCPU_SREG_##seg] = { \
|
||||
.selector = GUEST_##seg##_SELECTOR, \
|
||||
|
@ -214,6 +222,32 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
|
|||
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
|
||||
}
|
||||
|
||||
static inline int cpu_has_vmx_invept_individual_addr(void)
|
||||
{
|
||||
return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
|
||||
}
|
||||
|
||||
static inline int cpu_has_vmx_invept_context(void)
|
||||
{
|
||||
return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
|
||||
}
|
||||
|
||||
static inline int cpu_has_vmx_invept_global(void)
|
||||
{
|
||||
return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
|
||||
}
|
||||
|
||||
static inline int cpu_has_vmx_ept(void)
|
||||
{
|
||||
return (vmcs_config.cpu_based_2nd_exec_ctrl &
|
||||
SECONDARY_EXEC_ENABLE_EPT);
|
||||
}
|
||||
|
||||
static inline int vm_need_ept(void)
|
||||
{
|
||||
return (cpu_has_vmx_ept() && enable_ept);
|
||||
}
|
||||
|
||||
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
|
||||
{
|
||||
return ((cpu_has_vmx_virtualize_apic_accesses()) &&
|
||||
|
@ -250,6 +284,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
|
|||
: : "a"(&operand), "c"(ext) : "cc", "memory");
|
||||
}
|
||||
|
||||
static inline void __invept(int ext, u64 eptp, gpa_t gpa)
|
||||
{
|
||||
struct {
|
||||
u64 eptp, gpa;
|
||||
} operand = {eptp, gpa};
|
||||
|
||||
asm volatile (ASM_VMX_INVEPT
|
||||
/* CF==1 or ZF==1 --> rc = -1 */
|
||||
"; ja 1f ; ud2 ; 1:\n"
|
||||
: : "a" (&operand), "c" (ext) : "cc", "memory");
|
||||
}
|
||||
|
||||
static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
|
||||
{
|
||||
int i;
|
||||
|
@ -301,6 +347,33 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
|
|||
__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
|
||||
}
|
||||
|
||||
static inline void ept_sync_global(void)
|
||||
{
|
||||
if (cpu_has_vmx_invept_global())
|
||||
__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
|
||||
}
|
||||
|
||||
static inline void ept_sync_context(u64 eptp)
|
||||
{
|
||||
if (vm_need_ept()) {
|
||||
if (cpu_has_vmx_invept_context())
|
||||
__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
|
||||
else
|
||||
ept_sync_global();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
|
||||
{
|
||||
if (vm_need_ept()) {
|
||||
if (cpu_has_vmx_invept_individual_addr())
|
||||
__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
|
||||
eptp, gpa);
|
||||
else
|
||||
ept_sync_context(eptp);
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned long vmcs_readl(unsigned long field)
|
||||
{
|
||||
unsigned long value;
|
||||
|
@ -388,6 +461,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
|
|||
eb |= 1u << 1;
|
||||
if (vcpu->arch.rmode.active)
|
||||
eb = ~0;
|
||||
if (vm_need_ept())
|
||||
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
|
||||
vmcs_write32(EXCEPTION_BITMAP, eb);
|
||||
}
|
||||
|
||||
|
@ -985,7 +1060,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
|
|||
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
|
||||
{
|
||||
u32 vmx_msr_low, vmx_msr_high;
|
||||
u32 min, opt;
|
||||
u32 min, opt, min2, opt2;
|
||||
u32 _pin_based_exec_control = 0;
|
||||
u32 _cpu_based_exec_control = 0;
|
||||
u32 _cpu_based_2nd_exec_control = 0;
|
||||
|
@ -1003,6 +1078,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
|
|||
CPU_BASED_CR8_LOAD_EXITING |
|
||||
CPU_BASED_CR8_STORE_EXITING |
|
||||
#endif
|
||||
CPU_BASED_CR3_LOAD_EXITING |
|
||||
CPU_BASED_CR3_STORE_EXITING |
|
||||
CPU_BASED_USE_IO_BITMAPS |
|
||||
CPU_BASED_MOV_DR_EXITING |
|
||||
CPU_BASED_USE_TSC_OFFSETING;
|
||||
|
@ -1018,11 +1095,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
|
|||
~CPU_BASED_CR8_STORE_EXITING;
|
||||
#endif
|
||||
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
|
||||
min = 0;
|
||||
opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
|
||||
min2 = 0;
|
||||
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
|
||||
SECONDARY_EXEC_WBINVD_EXITING |
|
||||
SECONDARY_EXEC_ENABLE_VPID;
|
||||
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
|
||||
SECONDARY_EXEC_ENABLE_VPID |
|
||||
SECONDARY_EXEC_ENABLE_EPT;
|
||||
if (adjust_vmx_controls(min2, opt2,
|
||||
MSR_IA32_VMX_PROCBASED_CTLS2,
|
||||
&_cpu_based_2nd_exec_control) < 0)
|
||||
return -EIO;
|
||||
}
|
||||
|
@ -1031,6 +1110,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
|
|||
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
|
||||
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
|
||||
#endif
|
||||
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
|
||||
/* CR3 accesses don't need to cause VM Exits when EPT enabled */
|
||||
min &= ~(CPU_BASED_CR3_LOAD_EXITING |
|
||||
CPU_BASED_CR3_STORE_EXITING);
|
||||
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
|
||||
&_cpu_based_exec_control) < 0)
|
||||
return -EIO;
|
||||
rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
|
||||
vmx_capability.ept, vmx_capability.vpid);
|
||||
}
|
||||
|
||||
min = 0;
|
||||
#ifdef CONFIG_X86_64
|
||||
|
@ -1256,7 +1345,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
|
|||
fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
|
||||
|
||||
kvm_mmu_reset_context(vcpu);
|
||||
init_rmode_tss(vcpu->kvm);
|
||||
init_rmode(vcpu->kvm);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
|
@ -1304,8 +1393,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
|
|||
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
|
||||
}
|
||||
|
||||
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
|
||||
if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
|
||||
printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
|
||||
return;
|
||||
}
|
||||
vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
|
||||
vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
|
||||
vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
|
||||
vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
|
||||
}
|
||||
}
|
||||
|
||||
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
|
||||
|
||||
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
|
||||
unsigned long cr0,
|
||||
struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!(cr0 & X86_CR0_PG)) {
|
||||
/* From paging/starting to nonpaging */
|
||||
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
|
||||
vmcs_config.cpu_based_exec_ctrl |
|
||||
(CPU_BASED_CR3_LOAD_EXITING |
|
||||
CPU_BASED_CR3_STORE_EXITING));
|
||||
vcpu->arch.cr0 = cr0;
|
||||
vmx_set_cr4(vcpu, vcpu->arch.cr4);
|
||||
*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
|
||||
*hw_cr0 &= ~X86_CR0_WP;
|
||||
} else if (!is_paging(vcpu)) {
|
||||
/* From nonpaging to paging */
|
||||
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
|
||||
vmcs_config.cpu_based_exec_ctrl &
|
||||
~(CPU_BASED_CR3_LOAD_EXITING |
|
||||
CPU_BASED_CR3_STORE_EXITING));
|
||||
vcpu->arch.cr0 = cr0;
|
||||
vmx_set_cr4(vcpu, vcpu->arch.cr4);
|
||||
if (!(vcpu->arch.cr0 & X86_CR0_WP))
|
||||
*hw_cr0 &= ~X86_CR0_WP;
|
||||
}
|
||||
}
|
||||
|
||||
static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
|
||||
struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!is_paging(vcpu)) {
|
||||
*hw_cr4 &= ~X86_CR4_PAE;
|
||||
*hw_cr4 |= X86_CR4_PSE;
|
||||
} else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
|
||||
*hw_cr4 &= ~X86_CR4_PAE;
|
||||
}
|
||||
|
||||
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
|
||||
{
|
||||
unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
|
||||
KVM_VM_CR0_ALWAYS_ON;
|
||||
|
||||
vmx_fpu_deactivate(vcpu);
|
||||
|
||||
if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
|
||||
|
@ -1323,29 +1468,61 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
|
|||
}
|
||||
#endif
|
||||
|
||||
if (vm_need_ept())
|
||||
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
|
||||
|
||||
vmcs_writel(CR0_READ_SHADOW, cr0);
|
||||
vmcs_writel(GUEST_CR0,
|
||||
(cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
|
||||
vmcs_writel(GUEST_CR0, hw_cr0);
|
||||
vcpu->arch.cr0 = cr0;
|
||||
|
||||
if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
|
||||
vmx_fpu_activate(vcpu);
|
||||
}
|
||||
|
||||
static u64 construct_eptp(unsigned long root_hpa)
|
||||
{
|
||||
u64 eptp;
|
||||
|
||||
/* TODO write the value reading from MSR */
|
||||
eptp = VMX_EPT_DEFAULT_MT |
|
||||
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
|
||||
eptp |= (root_hpa & PAGE_MASK);
|
||||
|
||||
return eptp;
|
||||
}
|
||||
|
||||
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
|
||||
{
|
||||
unsigned long guest_cr3;
|
||||
u64 eptp;
|
||||
|
||||
guest_cr3 = cr3;
|
||||
if (vm_need_ept()) {
|
||||
eptp = construct_eptp(cr3);
|
||||
vmcs_write64(EPT_POINTER, eptp);
|
||||
ept_sync_context(eptp);
|
||||
ept_load_pdptrs(vcpu);
|
||||
guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
|
||||
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
|
||||
}
|
||||
|
||||
vmx_flush_tlb(vcpu);
|
||||
vmcs_writel(GUEST_CR3, cr3);
|
||||
vmcs_writel(GUEST_CR3, guest_cr3);
|
||||
if (vcpu->arch.cr0 & X86_CR0_PE)
|
||||
vmx_fpu_deactivate(vcpu);
|
||||
}
|
||||
|
||||
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
||||
{
|
||||
vmcs_writel(CR4_READ_SHADOW, cr4);
|
||||
vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
|
||||
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
|
||||
unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
|
||||
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
|
||||
|
||||
vcpu->arch.cr4 = cr4;
|
||||
if (vm_need_ept())
|
||||
ept_update_paging_mode_cr4(&hw_cr4, vcpu);
|
||||
|
||||
vmcs_writel(CR4_READ_SHADOW, cr4);
|
||||
vmcs_writel(GUEST_CR4, hw_cr4);
|
||||
}
|
||||
|
||||
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
|
||||
|
@ -1530,6 +1707,41 @@ out:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int init_rmode_identity_map(struct kvm *kvm)
|
||||
{
|
||||
int i, r, ret;
|
||||
pfn_t identity_map_pfn;
|
||||
u32 tmp;
|
||||
|
||||
if (!vm_need_ept())
|
||||
return 1;
|
||||
if (unlikely(!kvm->arch.ept_identity_pagetable)) {
|
||||
printk(KERN_ERR "EPT: identity-mapping pagetable "
|
||||
"haven't been allocated!\n");
|
||||
return 0;
|
||||
}
|
||||
if (likely(kvm->arch.ept_identity_pagetable_done))
|
||||
return 1;
|
||||
ret = 0;
|
||||
identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
|
||||
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
|
||||
if (r < 0)
|
||||
goto out;
|
||||
/* Set up identity-mapping pagetable for EPT in real mode */
|
||||
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
|
||||
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
|
||||
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
|
||||
r = kvm_write_guest_page(kvm, identity_map_pfn,
|
||||
&tmp, i * sizeof(tmp), sizeof(tmp));
|
||||
if (r < 0)
|
||||
goto out;
|
||||
}
|
||||
kvm->arch.ept_identity_pagetable_done = true;
|
||||
ret = 1;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void seg_setup(int seg)
|
||||
{
|
||||
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
|
||||
|
@ -1564,6 +1776,31 @@ out:
|
|||
return r;
|
||||
}
|
||||
|
||||
static int alloc_identity_pagetable(struct kvm *kvm)
|
||||
{
|
||||
struct kvm_userspace_memory_region kvm_userspace_mem;
|
||||
int r = 0;
|
||||
|
||||
down_write(&kvm->slots_lock);
|
||||
if (kvm->arch.ept_identity_pagetable)
|
||||
goto out;
|
||||
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
|
||||
kvm_userspace_mem.flags = 0;
|
||||
kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
|
||||
kvm_userspace_mem.memory_size = PAGE_SIZE;
|
||||
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
|
||||
if (r)
|
||||
goto out;
|
||||
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
|
||||
VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
out:
|
||||
up_write(&kvm->slots_lock);
|
||||
return r;
|
||||
}
|
||||
|
||||
static void allocate_vpid(struct vcpu_vmx *vmx)
|
||||
{
|
||||
int vpid;
|
||||
|
@ -1638,6 +1875,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
|
|||
CPU_BASED_CR8_LOAD_EXITING;
|
||||
#endif
|
||||
}
|
||||
if (!vm_need_ept())
|
||||
exec_control |= CPU_BASED_CR3_STORE_EXITING |
|
||||
CPU_BASED_CR3_LOAD_EXITING;
|
||||
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
|
||||
|
||||
if (cpu_has_secondary_exec_ctrls()) {
|
||||
|
@ -1647,6 +1887,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
|
|||
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
|
||||
if (vmx->vpid == 0)
|
||||
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
|
||||
if (!vm_need_ept())
|
||||
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
|
||||
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
|
||||
}
|
||||
|
||||
|
@ -1722,6 +1964,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int init_rmode(struct kvm *kvm)
|
||||
{
|
||||
if (!init_rmode_tss(kvm))
|
||||
return 0;
|
||||
if (!init_rmode_identity_map(kvm))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
|
@ -1729,7 +1980,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
|
|||
int ret;
|
||||
|
||||
down_read(&vcpu->kvm->slots_lock);
|
||||
if (!init_rmode_tss(vmx->vcpu.kvm)) {
|
||||
if (!init_rmode(vmx->vcpu.kvm)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
@ -1994,6 +2245,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
|
|||
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
|
||||
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
|
||||
if (is_page_fault(intr_info)) {
|
||||
/* EPT won't cause page fault directly */
|
||||
if (vm_need_ept())
|
||||
BUG();
|
||||
cr2 = vmcs_readl(EXIT_QUALIFICATION);
|
||||
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
|
||||
(u32)((u64)cr2 >> 32), handler);
|
||||
|
@ -2323,6 +2577,64 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
|
|||
return kvm_task_switch(vcpu, tss_selector, reason);
|
||||
}
|
||||
|
||||
static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
|
||||
{
|
||||
u64 exit_qualification;
|
||||
enum emulation_result er;
|
||||
gpa_t gpa;
|
||||
unsigned long hva;
|
||||
int gla_validity;
|
||||
int r;
|
||||
|
||||
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
|
||||
|
||||
if (exit_qualification & (1 << 6)) {
|
||||
printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
|
||||
return -ENOTSUPP;
|
||||
}
|
||||
|
||||
gla_validity = (exit_qualification >> 7) & 0x3;
|
||||
if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
|
||||
printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
|
||||
printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
|
||||
(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
|
||||
(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
|
||||
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
|
||||
(long unsigned int)exit_qualification);
|
||||
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
|
||||
kvm_run->hw.hardware_exit_reason = 0;
|
||||
return -ENOTSUPP;
|
||||
}
|
||||
|
||||
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
|
||||
hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
|
||||
if (!kvm_is_error_hva(hva)) {
|
||||
r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
|
||||
if (r < 0) {
|
||||
printk(KERN_ERR "EPT: Not enough memory!\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
return 1;
|
||||
} else {
|
||||
/* must be MMIO */
|
||||
er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
|
||||
|
||||
if (er == EMULATE_FAIL) {
|
||||
printk(KERN_ERR
|
||||
"EPT: Fail to handle EPT violation vmexit!er is %d\n",
|
||||
er);
|
||||
printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
|
||||
(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
|
||||
(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
|
||||
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
|
||||
(long unsigned int)exit_qualification);
|
||||
return -ENOTSUPP;
|
||||
} else if (er == EMULATE_DO_MMIO)
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* The exit handlers return 1 if the exit was handled fully and guest execution
|
||||
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
|
||||
|
@ -2346,6 +2658,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
|
|||
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
|
||||
[EXIT_REASON_WBINVD] = handle_wbinvd,
|
||||
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
|
||||
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
|
||||
};
|
||||
|
||||
static const int kvm_vmx_max_exit_handlers =
|
||||
|
@ -2364,6 +2677,13 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
|
|||
KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
|
||||
(u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
|
||||
|
||||
/* Access CR3 don't cause VMExit in paging mode, so we need
|
||||
* to sync with guest real CR3. */
|
||||
if (vm_need_ept() && is_paging(vcpu)) {
|
||||
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
|
||||
ept_load_pdptrs(vcpu);
|
||||
}
|
||||
|
||||
if (unlikely(vmx->fail)) {
|
||||
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
|
||||
kvm_run->fail_entry.hardware_entry_failure_reason
|
||||
|
@ -2372,7 +2692,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
|
||||
exit_reason != EXIT_REASON_EXCEPTION_NMI)
|
||||
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
|
||||
exit_reason != EXIT_REASON_EPT_VIOLATION))
|
||||
printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
|
||||
"exit reason is 0x%x\n", __func__, exit_reason);
|
||||
if (exit_reason < kvm_vmx_max_exit_handlers
|
||||
|
@ -2674,6 +2995,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
|
|||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
allocate_vpid(vmx);
|
||||
if (id == 0 && vm_need_ept()) {
|
||||
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
|
||||
VMX_EPT_WRITABLE_MASK |
|
||||
VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
|
||||
kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
|
||||
VMX_EPT_FAKE_DIRTY_MASK, 0ull,
|
||||
VMX_EPT_EXECUTABLE_MASK);
|
||||
kvm_enable_tdp();
|
||||
}
|
||||
|
||||
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
|
||||
if (err)
|
||||
|
@ -2706,6 +3036,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
|
|||
if (alloc_apic_access_page(kvm) != 0)
|
||||
goto free_vmcs;
|
||||
|
||||
if (vm_need_ept())
|
||||
if (alloc_identity_pagetable(kvm) != 0)
|
||||
goto free_vmcs;
|
||||
|
||||
return &vmx->vcpu;
|
||||
|
||||
free_vmcs:
|
||||
|
@ -2735,6 +3069,11 @@ static void __init vmx_check_processor_compat(void *rtn)
|
|||
}
|
||||
}
|
||||
|
||||
static int get_ept_level(void)
|
||||
{
|
||||
return VMX_EPT_DEFAULT_GAW + 1;
|
||||
}
|
||||
|
||||
static struct kvm_x86_ops vmx_x86_ops = {
|
||||
.cpu_has_kvm_support = cpu_has_kvm_support,
|
||||
.disabled_by_bios = vmx_disabled_by_bios,
|
||||
|
@ -2791,6 +3130,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
|
|||
.inject_pending_vectors = do_interrupt_requests,
|
||||
|
||||
.set_tss_addr = vmx_set_tss_addr,
|
||||
.get_tdp_level = get_ept_level,
|
||||
};
|
||||
|
||||
static int __init vmx_init(void)
|
||||
|
@ -2843,9 +3183,14 @@ static int __init vmx_init(void)
|
|||
vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
|
||||
vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
|
||||
|
||||
if (cpu_has_vmx_ept())
|
||||
bypass_guest_pf = 0;
|
||||
|
||||
if (bypass_guest_pf)
|
||||
kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
|
||||
|
||||
ept_sync_global();
|
||||
|
||||
return 0;
|
||||
|
||||
out2:
|
||||
|
|
|
@ -35,6 +35,8 @@
|
|||
#define CPU_BASED_MWAIT_EXITING 0x00000400
|
||||
#define CPU_BASED_RDPMC_EXITING 0x00000800
|
||||
#define CPU_BASED_RDTSC_EXITING 0x00001000
|
||||
#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
|
||||
#define CPU_BASED_CR3_STORE_EXITING 0x00010000
|
||||
#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
|
||||
#define CPU_BASED_CR8_STORE_EXITING 0x00100000
|
||||
#define CPU_BASED_TPR_SHADOW 0x00200000
|
||||
|
@ -49,6 +51,7 @@
|
|||
* Definitions of Secondary Processor-Based VM-Execution Controls.
|
||||
*/
|
||||
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
|
||||
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
|
||||
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
|
||||
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
|
||||
|
||||
|
@ -100,10 +103,22 @@ enum vmcs_field {
|
|||
VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
|
||||
APIC_ACCESS_ADDR = 0x00002014,
|
||||
APIC_ACCESS_ADDR_HIGH = 0x00002015,
|
||||
EPT_POINTER = 0x0000201a,
|
||||
EPT_POINTER_HIGH = 0x0000201b,
|
||||
GUEST_PHYSICAL_ADDRESS = 0x00002400,
|
||||
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
|
||||
VMCS_LINK_POINTER = 0x00002800,
|
||||
VMCS_LINK_POINTER_HIGH = 0x00002801,
|
||||
GUEST_IA32_DEBUGCTL = 0x00002802,
|
||||
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
|
||||
GUEST_PDPTR0 = 0x0000280a,
|
||||
GUEST_PDPTR0_HIGH = 0x0000280b,
|
||||
GUEST_PDPTR1 = 0x0000280c,
|
||||
GUEST_PDPTR1_HIGH = 0x0000280d,
|
||||
GUEST_PDPTR2 = 0x0000280e,
|
||||
GUEST_PDPTR2_HIGH = 0x0000280f,
|
||||
GUEST_PDPTR3 = 0x00002810,
|
||||
GUEST_PDPTR3_HIGH = 0x00002811,
|
||||
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
|
||||
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
|
||||
EXCEPTION_BITMAP = 0x00004004,
|
||||
|
@ -226,6 +241,8 @@ enum vmcs_field {
|
|||
#define EXIT_REASON_MWAIT_INSTRUCTION 36
|
||||
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
|
||||
#define EXIT_REASON_APIC_ACCESS 44
|
||||
#define EXIT_REASON_EPT_VIOLATION 48
|
||||
#define EXIT_REASON_EPT_MISCONFIG 49
|
||||
#define EXIT_REASON_WBINVD 54
|
||||
|
||||
/*
|
||||
|
@ -316,15 +333,36 @@ enum vmcs_field {
|
|||
#define MSR_IA32_VMX_CR4_FIXED1 0x489
|
||||
#define MSR_IA32_VMX_VMCS_ENUM 0x48a
|
||||
#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
|
||||
#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
|
||||
|
||||
#define MSR_IA32_FEATURE_CONTROL 0x3a
|
||||
#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
|
||||
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
|
||||
|
||||
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
|
||||
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
|
||||
|
||||
#define VMX_NR_VPIDS (1 << 16)
|
||||
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
|
||||
#define VMX_VPID_EXTENT_ALL_CONTEXT 2
|
||||
|
||||
#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
|
||||
#define VMX_EPT_EXTENT_CONTEXT 1
|
||||
#define VMX_EPT_EXTENT_GLOBAL 2
|
||||
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
|
||||
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
|
||||
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
|
||||
#define VMX_EPT_DEFAULT_GAW 3
|
||||
#define VMX_EPT_MAX_GAW 0x4
|
||||
#define VMX_EPT_MT_EPTE_SHIFT 3
|
||||
#define VMX_EPT_GAW_EPTP_SHIFT 3
|
||||
#define VMX_EPT_DEFAULT_MT 0x6ull
|
||||
#define VMX_EPT_READABLE_MASK 0x1ull
|
||||
#define VMX_EPT_WRITABLE_MASK 0x2ull
|
||||
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
|
||||
#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
|
||||
#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
|
||||
|
||||
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
|
||||
|
||||
#endif
|
||||
|
|
|
@ -2417,6 +2417,9 @@ int kvm_arch_init(void *opaque)
|
|||
|
||||
kvm_x86_ops = ops;
|
||||
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
|
||||
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
|
||||
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
|
||||
PT_DIRTY_MASK, PT64_NX_MASK, 0);
|
||||
return 0;
|
||||
|
||||
out:
|
||||
|
@ -3019,6 +3022,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
|
|||
|
||||
kvm_x86_ops->decache_regs(vcpu);
|
||||
|
||||
vcpu->arch.exception.pending = false;
|
||||
|
||||
vcpu_put(vcpu);
|
||||
|
||||
return 0;
|
||||
|
@ -3481,7 +3486,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
|
|||
}
|
||||
|
||||
if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
|
||||
cseg_desc.type &= ~(1 << 8); //clear the B flag
|
||||
cseg_desc.type &= ~(1 << 1); //clear the B flag
|
||||
save_guest_segment_descriptor(vcpu, tr_seg.selector,
|
||||
&cseg_desc);
|
||||
}
|
||||
|
@ -3507,7 +3512,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
|
|||
}
|
||||
|
||||
if (reason != TASK_SWITCH_IRET) {
|
||||
nseg_desc.type |= (1 << 8);
|
||||
nseg_desc.type |= (1 << 1);
|
||||
save_guest_segment_descriptor(vcpu, tss_selector,
|
||||
&nseg_desc);
|
||||
}
|
||||
|
@ -3698,10 +3703,19 @@ void fx_init(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
unsigned after_mxcsr_mask;
|
||||
|
||||
/*
|
||||
* Touch the fpu the first time in non atomic context as if
|
||||
* this is the first fpu instruction the exception handler
|
||||
* will fire before the instruction returns and it'll have to
|
||||
* allocate ram with GFP_KERNEL.
|
||||
*/
|
||||
if (!used_math())
|
||||
fx_save(&vcpu->arch.host_fx_image);
|
||||
|
||||
/* Initialize guest FPU by resetting ours and saving into guest's */
|
||||
preempt_disable();
|
||||
fx_save(&vcpu->arch.host_fx_image);
|
||||
fpu_init();
|
||||
fx_finit();
|
||||
fx_save(&vcpu->arch.guest_fx_image);
|
||||
fx_restore(&vcpu->arch.host_fx_image);
|
||||
preempt_enable();
|
||||
|
@ -3906,6 +3920,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
|
|||
kvm_free_physmem(kvm);
|
||||
if (kvm->arch.apic_access_page)
|
||||
put_page(kvm->arch.apic_access_page);
|
||||
if (kvm->arch.ept_identity_pagetable)
|
||||
put_page(kvm->arch.ept_identity_pagetable);
|
||||
kfree(kvm);
|
||||
}
|
||||
|
||||
|
|
|
@@ -1761,6 +1761,7 @@ twobyte_insn:
		case 6: /* lmsw */
			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
				      &ctxt->eflags);
			c->dst.type = OP_NONE;
			break;
		case 7: /* invlpg*/
			emulate_invlpg(ctxt->vcpu, memop);
@@ -476,29 +476,3 @@ int memory_add_physaddr_to_nid(u64 addr)

EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
/*
 * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
 *
 * These stub functions are needed to compile 32-bit NUMA when SRAT is
 * not set. There are functions in srat_64.c for parsing this table
 * and it may be possible to make them common functions.
 */
void acpi_numa_slit_init (struct acpi_table_slit *slit)
{
	printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
}

void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
{
}

void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
{
}

void acpi_numa_arch_fixup(void)
{
}
#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
@@ -172,10 +172,3 @@ void reserve_top_address(unsigned long reserve)
	__FIXADDR_TOP = -reserve - PAGE_SIZE;
	__VMALLOC_RESERVE += reserve;
}

int pmd_bad(pmd_t pmd)
{
	WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));

	return pmd_bad_v1(pmd);
}
@@ -6,11 +6,19 @@ obj-$(CONFIG_PCI_DIRECT) += direct.o
obj-$(CONFIG_PCI_OLPC) += olpc.o

pci-y := fixup.o

# Do not change the ordering here. There is a nasty init function
# ordering dependency which breaks when you move acpi.o below
# legacy/irq.o
pci-$(CONFIG_ACPI) += acpi.o
pci-y += legacy.o irq.o

pci-$(CONFIG_X86_VISWS) += visws.o fixup.o
pci-$(CONFIG_X86_NUMAQ) += numa.o irq.o
# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are
# therefor correct. This needs a proper fix by distangling the code.
pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o

# Necessary for NUMAQ as well
pci-$(CONFIG_NUMA) += mp_bus_to_node.o

obj-y += $(pci-y) common.o early.o
@@ -6,45 +6,6 @@
 #include <asm/numa.h>
 #include "pci.h"

-static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
-{
-	pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
-	printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
-	return 0;
-}
-
-static struct dmi_system_id acpi_pciprobe_dmi_table[] __devinitdata = {
-/*
- * Systems where PCI IO resource ISA alignment can be skipped
- * when the ISA enable bit in the bridge control is not set
- */
-	{
-		.callback = can_skip_ioresource_align,
-		.ident = "IBM System x3800",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
-		},
-	},
-	{
-		.callback = can_skip_ioresource_align,
-		.ident = "IBM System x3850",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "x3850"),
-		},
-	},
-	{
-		.callback = can_skip_ioresource_align,
-		.ident = "IBM System x3950",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "x3950"),
-		},
-	},
-	{}
-};
-
 struct pci_root_info {
 	char *name;
 	unsigned int res_num;

@@ -196,8 +157,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
 	int pxm;
 #endif

-	dmi_check_system(acpi_pciprobe_dmi_table);
-
 	if (domain && !pci_domains_supported) {
 		printk(KERN_WARNING "PCI: Multiple domains not supported "
 		       "(dom %d, bus %d)\n", domain, busnum);

@@ -77,17 +77,48 @@ int pcibios_scanned;
  */
 DEFINE_SPINLOCK(pci_config_lock);

-static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
+static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
 {
-	struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
+	pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+	printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
+	return 0;
+}

-	if (rom_r->parent)
-		return;
-	if (rom_r->start)
-		/* we deal with BIOS assigned ROM later */
-		return;
-	if (!(pci_probe & PCI_ASSIGN_ROMS))
-		rom_r->start = rom_r->end = rom_r->flags = 0;
+static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = {
+/*
+ * Systems where PCI IO resource ISA alignment can be skipped
+ * when the ISA enable bit in the bridge control is not set
+ */
+	{
+		.callback = can_skip_ioresource_align,
+		.ident = "IBM System x3800",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
+		},
+	},
+	{
+		.callback = can_skip_ioresource_align,
+		.ident = "IBM System x3850",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "x3850"),
+		},
+	},
+	{
+		.callback = can_skip_ioresource_align,
+		.ident = "IBM System x3950",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "x3950"),
+		},
+	},
+	{}
+};
+
+void __init dmi_check_skip_isa_align(void)
+{
+	dmi_check_system(can_skip_pciprobe_dmi_table);
+}

 /*

@@ -97,11 +128,7 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)

 void __devinit pcibios_fixup_bus(struct pci_bus *b)
 {
-	struct pci_dev *dev;
-
 	pci_read_bridge_bases(b);
-	list_for_each_entry(dev, &b->devices, bus_list)
-		pcibios_fixup_device_resources(dev);
 }

 /*

@@ -318,13 +345,16 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
 	{}
 };

+void __init dmi_check_pciprobe(void)
+{
+	dmi_check_system(pciprobe_dmi_table);
+}
+
 struct pci_bus * __devinit pcibios_scan_root(int busnum)
 {
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;

-	dmi_check_system(pciprobe_dmi_table);
-
 	while ((bus = pci_find_next_bus(bus)) != NULL) {
 		if (bus->number == busnum) {
 			/* Already scanned */

@@ -462,6 +492,9 @@ char * __devinit pcibios_setup(char *str)
 	} else if (!strcmp(str, "routeirq")) {
 		pci_routeirq = 1;
 		return NULL;
+	} else if (!strcmp(str, "skip_isa_align")) {
+		pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+		return NULL;
 	}
 	return str;
 }

@@ -489,7 +522,7 @@ void pcibios_disable_device (struct pci_dev *dev)
 		pcibios_disable_irq(dev);
 }

-struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
+struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
 {
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;

@@ -512,7 +545,7 @@ struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
 	return bus;
 }

-struct pci_bus *pci_scan_bus_with_sysdata(int busno)
+struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
 {
 	return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
 }

@@ -502,7 +502,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
  */
 static void fam10h_pci_cfg_space_size(struct pci_dev *dev)
 {
-	dev->cfg_size = pci_cfg_space_size_ext(dev, 0);
+	dev->cfg_size = pci_cfg_space_size_ext(dev);
 }

 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size);

@@ -33,6 +33,10 @@ static __init int pci_access_init(void)
 		printk(KERN_ERR
 		"PCI: Fatal: No config space access function found\n");

+	dmi_check_pciprobe();
+
+	dmi_check_skip_isa_align();
+
 	return 0;
 }
 arch_initcall(pci_access_init);

@@ -38,6 +38,9 @@ enum pci_bf_sort_state {
 	pci_dmi_bf,
 };

+extern void __init dmi_check_pciprobe(void);
+extern void __init dmi_check_skip_isa_align(void);
+
 /* pci-i386.c */

 extern unsigned int pcibios_max_latency;

@@ -162,7 +162,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
 	Elf32_Shdr *shdr;
 	int i;

-	BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+	BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
 	       !elf_check_arch_ia32(ehdr) ||
 	       ehdr->e_type != ET_DYN);

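For reference, SELFMAG is simply the length of the ELF magic string ELFMAG ("\177ELF"), so the new BUG_ON() is equivalent to the old one but avoids the bare constant 4. A tiny userspace check, illustrative only:

#include <assert.h>
#include <elf.h>
#include <string.h>

int main(void)
{
	/* ELFMAG is "\177ELF" and SELFMAG its length (4) in <elf.h> */
	unsigned char ident[EI_NIDENT] = { 0x7f, 'E', 'L', 'F' };

	assert(SELFMAG == 4);
	assert(memcmp(ident, ELFMAG, SELFMAG) == 0);
	return 0;
}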
@@ -1,5 +1,4 @@
 /*
- *
  * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
  *
  * This file is subject to the terms and conditions of the GNU General Public

@@ -29,3 +28,4 @@ int fb_is_primary_device(struct fb_info *info)
 	return retval;
 }
 EXPORT_SYMBOL(fb_is_primary_device);
+MODULE_LICENSE("GPL");

@@ -54,15 +54,16 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done);

 static void drive_stat_acct(struct request *rq, int new_io)
 {
+	struct hd_struct *part;
 	int rw = rq_data_dir(rq);

 	if (!blk_fs_request(rq) || !rq->rq_disk)
 		return;

-	if (!new_io) {
-		__all_stat_inc(rq->rq_disk, merges[rw], rq->sector);
-	} else {
-		struct hd_struct *part = get_part(rq->rq_disk, rq->sector);
+	part = get_part(rq->rq_disk, rq->sector);
+	if (!new_io)
+		__all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
+	else {
 		disk_round_stats(rq->rq_disk);
 		rq->rq_disk->in_flight++;
 		if (part) {

@@ -253,9 +254,11 @@ EXPORT_SYMBOL(__generic_unplug_device);
 **/
 void generic_unplug_device(struct request_queue *q)
 {
-	spin_lock_irq(q->queue_lock);
-	__generic_unplug_device(q);
-	spin_unlock_irq(q->queue_lock);
+	if (blk_queue_plugged(q)) {
+		spin_lock_irq(q->queue_lock);
+		__generic_unplug_device(q);
+		spin_unlock_irq(q->queue_lock);
+	}
 }
 EXPORT_SYMBOL(generic_unplug_device);

@@ -1536,10 +1539,11 @@ static int __end_that_request_first(struct request *req, int error,
 	}

 	if (blk_fs_request(req) && req->rq_disk) {
+		struct hd_struct *part = get_part(req->rq_disk, req->sector);
 		const int rw = rq_data_dir(req);

-		all_stat_add(req->rq_disk, sectors[rw],
-			     nr_bytes >> 9, req->sector);
+		all_stat_add(req->rq_disk, part, sectors[rw],
+			     nr_bytes >> 9, req->sector);
 	}

 	total_bytes = bio_nbytes = 0;

@@ -1725,8 +1729,8 @@ static void end_that_request_last(struct request *req, int error)
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part = get_part(disk, req->sector);

-		__all_stat_inc(disk, ios[rw], req->sector);
-		__all_stat_add(disk, ticks[rw], duration, req->sector);
+		__all_stat_inc(disk, part, ios[rw], req->sector);
+		__all_stat_add(disk, part, ticks[rw], duration, req->sector);
 		disk_round_stats(disk);
 		disk->in_flight--;
 		if (part) {

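The generic_unplug_device() hunk above is a lock-avoidance optimization: test the plugged state without the queue lock and only take the lock when there may be work to do, relying on the locked helper to re-check the state. A hedged userspace analogue of that pattern (a pthread mutex standing in for the queue lock; all names are invented for the example, none are kernel APIs):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int plugged;

/* caller must hold "lock"; re-checks the flag so a lost race is harmless */
static void __unplug_locked(void)
{
	if (plugged) {
		plugged = 0;
		printf("queue unplugged\n");
	}
}

static void unplug(void)
{
	if (plugged) {			/* cheap unlocked test first */
		pthread_mutex_lock(&lock);
		__unplug_locked();
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	plugged = 1;
	unplug();			/* takes the lock and does the work */
	unplug();			/* skips the lock entirely */
	return 0;
}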
@@ -41,8 +41,8 @@ int put_io_context(struct io_context *ioc)
 		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
-		rcu_read_unlock();
 		cfq_dtor(ioc);
+		rcu_read_unlock();

 		kmem_cache_free(iocontext_cachep, ioc);
 		return 1;

@@ -149,9 +149,9 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
 				 struct bio *nxt)
 {
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+	if (!bio_flagged(bio, BIO_SEG_VALID))
 		blk_recount_segments(q, bio);
-	if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
+	if (!bio_flagged(nxt, BIO_SEG_VALID))
 		blk_recount_segments(q, nxt);
 	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
 	    BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))

@@ -312,9 +312,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		q->last_merge = NULL;
 		return 0;
 	}
-	if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
+	if (!bio_flagged(req->biotail, BIO_SEG_VALID))
 		blk_recount_segments(q, req->biotail);
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+	if (!bio_flagged(bio, BIO_SEG_VALID))
 		blk_recount_segments(q, bio);
 	len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
 	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))

@@ -352,9 +352,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		return 0;
 	}
 	len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+	if (!bio_flagged(bio, BIO_SEG_VALID))
 		blk_recount_segments(q, bio);
-	if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
+	if (!bio_flagged(req->bio, BIO_SEG_VALID))
 		blk_recount_segments(q, req->bio);
 	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
 	    !BIOVEC_VIRT_OVERSIZE(len)) {

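The three hunks above only drop the unlikely() annotations around the BIO_SEG_VALID tests. unlikely() is a branch-prediction hint built on __builtin_expect() and never changes what the condition evaluates to, so removing it where the branch is not actually rare is behavior-neutral. A small standalone sketch of how such annotations are commonly defined, much like the kernel's own compiler helpers:

#include <stdio.h>

/* hint-only macros: they affect code layout, not the truth value */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int main(void)
{
	int flag = 0;

	if (unlikely(!flag))		/* same truth value as plain (!flag) */
		printf("took the 'unlikely' path anyway\n");
	return 0;
}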
@@ -146,11 +146,13 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 	unsigned long nm;
 	ssize_t ret = queue_var_store(&nm, page, count);

+	spin_lock_irq(q->queue_lock);
 	if (nm)
-		set_bit(QUEUE_FLAG_NOMERGES, &q->queue_flags);
+		queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	else
-		clear_bit(QUEUE_FLAG_NOMERGES, &q->queue_flags);
+		queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+	spin_unlock_irq(q->queue_lock);

 	return ret;
 }

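The queue_nomerges_store() hunk replaces atomic set_bit()/clear_bit() on q->queue_flags with queue_flag_set()/queue_flag_clear() done under q->queue_lock: once every writer of the flags word holds the same lock, a plain non-atomic read-modify-write is sufficient. A small userspace sketch of that idea (a pthread mutex in place of the queue lock; the flag value and names are made up for the example):

#include <pthread.h>
#include <stdio.h>

#define FLAG_NOMERGES	(1u << 3)	/* example value only */

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int queue_flags;

static void set_nomerges(int on)
{
	pthread_mutex_lock(&queue_lock);
	if (on)
		queue_flags |= FLAG_NOMERGES;	/* non-atomic RMW is safe under the lock */
	else
		queue_flags &= ~FLAG_NOMERGES;
	pthread_mutex_unlock(&queue_lock);
}

int main(void)
{
	set_nomerges(1);
	printf("flags = %#x\n", queue_flags);
	set_nomerges(0);
	printf("flags = %#x\n", queue_flags);
	return 0;
}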
@@ -70,7 +70,7 @@ void __blk_queue_free_tags(struct request_queue *q)
 	__blk_free_tags(bqt);

 	q->queue_tags = NULL;
-	queue_flag_clear(QUEUE_FLAG_QUEUED, q);
+	queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
 }

 /**

@@ -98,7 +98,7 @@ EXPORT_SYMBOL(blk_free_tags);
 **/
 void blk_queue_free_tags(struct request_queue *q)
 {
-	queue_flag_clear(QUEUE_FLAG_QUEUED, q);
+	queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
 }
 EXPORT_SYMBOL(blk_queue_free_tags);

@@ -171,6 +171,9 @@ EXPORT_SYMBOL(blk_init_tags);
  * @q:  the request queue for the device
  * @depth:  the maximum queue depth supported
  * @tags: the tag to use
+ *
+ * Queue lock must be held here if the function is called to resize an
+ * existing map.
 **/
 int blk_queue_init_tags(struct request_queue *q, int depth,
 			struct blk_queue_tag *tags)

@@ -197,7 +200,7 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
 	 * assign it, all done
 	 */
 	q->queue_tags = tags;
-	queue_flag_set(QUEUE_FLAG_QUEUED, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
 	INIT_LIST_HEAD(&q->tag_busy_list);
 	return 0;
 fail:

@@ -1142,6 +1142,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	kmem_cache_free(cfq_pool, cfqq);
 }

+static void
+__call_for_each_cic(struct io_context *ioc,
+		    void (*func)(struct io_context *, struct cfq_io_context *))
+{
+	struct cfq_io_context *cic;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
+		func(ioc, cic);
+}
+
 /*
  * Call func for each cic attached to this ioc.
  */

@@ -1149,12 +1160,8 @@ static void
 call_for_each_cic(struct io_context *ioc,
 		  void (*func)(struct io_context *, struct cfq_io_context *))
 {
-	struct cfq_io_context *cic;
-	struct hlist_node *n;
-
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
-		func(ioc, cic);
+	__call_for_each_cic(ioc, func);
 	rcu_read_unlock();
 }

@@ -1198,7 +1205,7 @@ static void cfq_free_io_context(struct io_context *ioc)
 	 * should be ok to iterate over the known list, we will see all cic's
 	 * since no new ones are added.
 	 */
-	call_for_each_cic(ioc, cic_free_func);
+	__call_for_each_cic(ioc, cic_free_func);
 }

 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)

@@ -1296,10 +1303,10 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
 	case IOPRIO_CLASS_NONE:
 		/*
-		 * no prio set, place us in the middle of the BE classes
+		 * no prio set, inherit CPU scheduling settings
 		 */
 		cfqq->ioprio = task_nice_ioprio(tsk);
-		cfqq->ioprio_class = IOPRIO_CLASS_BE;
+		cfqq->ioprio_class = task_nice_ioclass(tsk);
 		break;
 	case IOPRIO_CLASS_RT:
 		cfqq->ioprio = task_ioprio(ioc);

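The first two cfq hunks introduce the common locked-wrapper/raw-helper split: call_for_each_cic() takes rcu_read_lock() and delegates to __call_for_each_cic(), while cfq_free_io_context(), running on a teardown path where no new cic can appear, calls the raw helper directly. A rough userspace sketch of the naming convention only (a mutex instead of RCU; every name here is invented for the example):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* raw helper: walks the "list", expects the caller to handle locking */
static void __for_each_item(void (*func)(int))
{
	for (int i = 0; i < 3; i++)
		func(i);
}

/* normal entry point: wraps the walk in the lock */
static void for_each_item(void (*func)(int))
{
	pthread_mutex_lock(&list_lock);
	__for_each_item(func);
	pthread_mutex_unlock(&list_lock);
}

static void show(int i)
{
	printf("item %d\n", i);
}

int main(void)
{
	for_each_item(show);	/* ordinary, locked path */
	__for_each_item(show);	/* teardown-style path: no locking needed */
	return 0;
}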
@@ -1,7 +1,17 @@
 menuconfig ACCESSIBILITY
 	bool "Accessibility support"
 	---help---
-	  Enable a submenu where accessibility items may be enabled.
+	  Accessibility handles all special kinds of hardware devices or
+	  software adapters which help people with disabilities (e.g.
+	  blindness) to use computers.
+
+	  That includes braille devices, speech synthesis, keyboard
+	  remapping, etc.
+
+	  Say Y here to get to see options for accessibility.
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and disabled.

 	  If unsure, say N.

@@ -205,8 +205,8 @@ config SATA_VITESSE
 	  If unsure, say N.

 config SATA_INIC162X
-	tristate "Initio 162x SATA support (HIGHLY EXPERIMENTAL)"
-	depends on PCI && EXPERIMENTAL
+	tristate "Initio 162x SATA support"
+	depends on PCI
 	help
 	  This option enables support for Initio 162x Serial ATA.

@@ -697,6 +697,15 @@ config PATA_SCC

 	  If unsure, say N.

+config PATA_SCH
+	tristate "Intel SCH PATA support"
+	depends on PCI
+	help
+	  This option enables support for Intel SCH PATA on the Intel
+	  SCH (US15W, US15L, UL11L) series host controllers.
+
+	  If unsure, say N.
+
 config PATA_BF54X
 	tristate "Blackfin 54x ATAPI support"
 	depends on BF542 || BF548 || BF549

@@ -67,6 +67,7 @@ obj-$(CONFIG_PATA_SIS) += pata_sis.o
 obj-$(CONFIG_PATA_TRIFLEX)	+= pata_triflex.o
 obj-$(CONFIG_PATA_IXP4XX_CF)	+= pata_ixp4xx_cf.o
 obj-$(CONFIG_PATA_SCC)		+= pata_scc.o
+obj-$(CONFIG_PATA_SCH)		+= pata_sch.o
 obj-$(CONFIG_PATA_BF54X)	+= pata_bf54x.o
 obj-$(CONFIG_PATA_PLATFORM)	+= pata_platform.o
 obj-$(CONFIG_PATA_OF_PLATFORM)	+= pata_of_platform.o

Some files were not shown because too many files have changed in this diff.