x86/paravirt: add sysret/sysexit pvops for returning to 32-bit compatibility userspace

In a 64-bit system, we need separate sysret/sysexit operations to
return to a 32-bit userspace.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Jeremy Fitzhardinge 2008-06-25 00:19:28 -04:00 committed by Ingo Molnar
parent c7245da6ae
commit 2be29982a0
7 changed files with 89 additions and 31 deletions

View File

@ -61,6 +61,19 @@
CFI_UNDEFINED r15 CFI_UNDEFINED r15
.endm .endm
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret32)
swapgs
sysretl
ENDPROC(native_usergs_sysret32)
ENTRY(native_irq_enable_sysexit)
swapgs
sti
sysexit
ENDPROC(native_irq_enable_sysexit)
#endif
/* /*
* 32bit SYSENTER instruction entry. * 32bit SYSENTER instruction entry.
* *
@ -151,10 +164,7 @@ sysenter_do_call:
CFI_ADJUST_CFA_OFFSET -8 CFI_ADJUST_CFA_OFFSET -8
CFI_REGISTER rsp,rcx CFI_REGISTER rsp,rcx
TRACE_IRQS_ON TRACE_IRQS_ON
swapgs ENABLE_INTERRUPTS_SYSEXIT32
sti /* sti only takes effect after the next instruction */
/* sysexit */
.byte 0xf, 0x35
sysenter_tracesys: sysenter_tracesys:
CFI_RESTORE_STATE CFI_RESTORE_STATE
@ -254,8 +264,7 @@ cstar_do_call:
TRACE_IRQS_ON TRACE_IRQS_ON
movl RSP-ARGOFFSET(%rsp),%esp movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp CFI_RESTORE rsp
swapgs USERGS_SYSRET32
sysretl
cstar_tracesys: cstar_tracesys:
CFI_RESTORE_STATE CFI_RESTORE_STATE

View File

@ -62,7 +62,9 @@ int main(void)
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret); OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
OFFSET(PV_CPU_usergs_sysret, pv_cpu_ops, usergs_sysret); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif #endif

View File

@ -59,7 +59,7 @@
#endif #endif
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret) ENTRY(native_usergs_sysret64)
swapgs swapgs
sysretq sysretq
#endif /* CONFIG_PARAVIRT */ #endif /* CONFIG_PARAVIRT */
@ -275,7 +275,7 @@ sysret_check:
RESTORE_ARGS 0,-ARG_SKIP,1 RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER rflags,r11*/ /*CFI_REGISTER rflags,r11*/
movq %gs:pda_oldrsp, %rsp movq %gs:pda_oldrsp, %rsp
USERGS_SYSRET USERGS_SYSRET64
CFI_RESTORE_STATE CFI_RESTORE_STATE
/* Handle reschedules */ /* Handle reschedules */

View File

@ -141,7 +141,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_nop(); ret = paravirt_patch_nop();
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret)) type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
/* If operation requires a jmp, then jmp */ /* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
else else
@ -193,7 +194,8 @@ static void native_flush_tlb_single(unsigned long addr)
/* These are in entry.S */ /* These are in entry.S */
extern void native_iret(void); extern void native_iret(void);
extern void native_irq_enable_sysexit(void); extern void native_irq_enable_sysexit(void);
extern void native_usergs_sysret(void); extern void native_usergs_sysret32(void);
extern void native_usergs_sysret64(void);
static int __init print_banner(void) static int __init print_banner(void)
{ {
@ -329,10 +331,10 @@ struct pv_cpu_ops pv_cpu_ops = {
.write_idt_entry = native_write_idt_entry, .write_idt_entry = native_write_idt_entry,
.load_sp0 = native_load_sp0, .load_sp0 = native_load_sp0,
#ifdef CONFIG_X86_32
.irq_enable_sysexit = native_irq_enable_sysexit, .irq_enable_sysexit = native_irq_enable_sysexit,
#else #ifdef CONFIG_X86_64
.usergs_sysret = native_usergs_sysret, .usergs_sysret32 = native_usergs_sysret32,
.usergs_sysret64 = native_usergs_sysret64,
#endif #endif
.iret = native_iret, .iret = native_iret,
.swapgs = native_swapgs, .swapgs = native_swapgs,

View File

@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
/* the three commands give us more control to how to return from a syscall */ DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
DEF_NATIVE(pv_cpu_ops, usergs_sysret, "swapgs; sysretq;"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_cpu_ops, iret); PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, usergs_sysret); PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_cpu_ops, swapgs);
PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, read_cr3);

View File

@ -112,9 +112,17 @@ static inline unsigned long __raw_local_irq_save(void)
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
#define INTERRUPT_RETURN iretq #define INTERRUPT_RETURN iretq
#define USERGS_SYSRET \ #define USERGS_SYSRET64 \
swapgs; \ swapgs; \
sysretq; sysretq;
#define USERGS_SYSRET32 \
swapgs; \
sysretl
#define ENABLE_INTERRUPTS_SYSEXIT32 \
swapgs; \
sti; \
sysexit
#else #else
#define INTERRUPT_RETURN iret #define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit

View File

@ -141,9 +141,32 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter); u64 (*read_pmc)(int counter);
unsigned long long (*read_tscp)(unsigned int *aux); unsigned long long (*read_tscp)(unsigned int *aux);
/* These three are jmp to, not actually called. */ /*
* Atomically enable interrupts and return to userspace. This
* is only ever used to return to 32-bit processes; in a
* 64-bit kernel, it's used for 32-on-64 compat processes, but
* never native 64-bit processes. (Jump, not call.)
*/
void (*irq_enable_sysexit)(void); void (*irq_enable_sysexit)(void);
void (*usergs_sysret)(void);
/*
* Switch to usermode gs and return to 64-bit usermode using
* sysret. Only used in 64-bit kernels to return to 64-bit
* processes. Usermode register state, including %rsp, must
* already be restored.
*/
void (*usergs_sysret64)(void);
/*
* Switch to usermode gs and return to 32-bit usermode using
* sysret. Used to return to 32-on-64 compat processes.
* Other usermode register state, including %esp, must already
* be restored.
*/
void (*usergs_sysret32)(void);
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void); void (*iret)(void);
void (*swapgs)(void); void (*swapgs)(void);
@ -1481,18 +1504,24 @@ static inline unsigned long __raw_local_irq_save(void)
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
PV_RESTORE_REGS;) PV_RESTORE_REGS;)
#define ENABLE_INTERRUPTS_SYSEXIT \ #define USERGS_SYSRET32 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
CLBR_NONE, \ CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
#define GET_CR0_INTO_EAX \ #define GET_CR0_INTO_EAX \
push %ecx; push %edx; \ push %ecx; push %edx; \
call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
pop %edx; pop %ecx pop %edx; pop %ecx
#else
#define ENABLE_INTERRUPTS_SYSEXIT \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
#else /* !CONFIG_X86_32 */
#define SWAPGS \ #define SWAPGS \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
PV_SAVE_REGS; \ PV_SAVE_REGS; \
@ -1505,11 +1534,16 @@ static inline unsigned long __raw_local_irq_save(void)
movq %rax, %rcx; \ movq %rax, %rcx; \
xorq %rax, %rax; xorq %rax, %rax;
#define USERGS_SYSRET \ #define USERGS_SYSRET64 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret), \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \ CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret)) jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
#endif
#define ENABLE_INTERRUPTS_SYSEXIT32 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */ #endif /* CONFIG_PARAVIRT */