The actual "emulate NX with segment limits" hack. Pretty nasty stuff. Unlikely to ever go upstream. Quoting Linus.. "That said, for all the other reasons, I'm still not convinced we want that horrid CS limit thing. I also suspect that if we _do_ want it, then we could just as well track the exec range _unconditionally_, because it really shouldn't be very expensive at all if you do it well - even if you then never even end up using it." diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index dc27705..34ed3a2 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -6,6 +6,7 @@ #include #include #include +#include static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) @@ -95,6 +96,9 @@ static inline int desc_empty(const void *ptr) #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt +#ifdef CONFIG_X86_32 +#define load_user_cs_desc native_load_user_cs_desc +#endif /*CONFIG_X86_32*/ #define write_ldt_entry(dt, entry, desc) \ native_write_ldt_entry(dt, entry, desc) @@ -379,6 +383,27 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } +#ifdef CONFIG_X86_32 +static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) +{ + limit = (limit - 1) / PAGE_SIZE; + desc->a = limit & 0xffff; + desc->b = (limit & 0xf0000) | 0x00c0fb00; +} + +static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; +} + +#define arch_add_exec_range arch_add_exec_range +#define arch_remove_exec_range arch_remove_exec_range +#define arch_flush_exec_range arch_flush_exec_range +extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_flush_exec_range(struct mm_struct *mm); +#endif /* CONFIG_X86_32 */ + #else /* * GET_DESC_BASE reads the descriptor base of the specified segment. diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 80a1dee..8314c66 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -7,12 +7,19 @@ /* * The x86 doesn't have a mmu context, but * we put the segment information here. + * + * exec_limit is used to track the range PROT_EXEC + * mappings span. */ typedef struct { void *ldt; int size; struct mutex lock; void *vdso; +#ifdef CONFIG_X86_32 + struct desc_struct user_cs; + unsigned long exec_limit; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index e299287..aaa8a35 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -113,6 +113,9 @@ struct pv_cpu_ops { void (*store_gdt)(struct desc_ptr *); void (*store_idt)(struct desc_ptr *); void (*set_ldt)(const void *desc, unsigned entries); +#ifdef CONFIG_X86_32 + void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); +#endif /*CONFIG_X86_32*/ unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); #ifdef CONFIG_X86_64 @@ -860,6 +863,12 @@ static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } +#ifdef CONFIG_X86_32 +static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) +{ + PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); +} +#endif /*CONFIG_X86_32*/ static inline void store_gdt(struct desc_ptr *dtr) { PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3bfd523..99c8119 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -158,6 +158,9 @@ static inline int hlt_works(int cpu) #define cache_line_size() (boot_cpu_data.x86_cache_alignment) +#define __HAVE_ARCH_ALIGN_STACK +extern unsigned long arch_align_stack(unsigned long sp); + extern void cpu_detect(struct cpuinfo_x86 *c); extern struct pt_regs *idle_regs(struct pt_regs *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 83492b1..a84c787 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -708,6 +708,18 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) * we do "generic changes." */ + /* + * emulation of NX with segment limits unfortunately means + * we have to disable the fast system calls, due to the way that + * sysexit clears the segment limits on return. + * If we have NX, then we don't need to do this. + */ +#ifdef CONFIG_X86_PAE + if (!test_cpu_cap(c, X86_FEATURE_NX)) +#endif + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { char *p; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c6520a4..2066aa1 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -352,6 +352,9 @@ struct pv_cpu_ops pv_cpu_ops = { .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = native_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = native_load_gdt, .load_idt = native_load_idt, .store_gdt = native_store_gdt, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd4da2a..60823d4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -343,6 +343,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { + int cpu; + __asm__("movl %0, %%gs" : : "r"(0)); regs->fs = 0; set_fs(USER_DS); @@ -352,6 +354,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; + + cpu = get_cpu(); + load_user_cs_desc(cpu, current->mm); + put_cpu(); + /* * Free the old FP and other extended state */ @@ -519,7 +526,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ __unlazy_fpu(prev_p); - + if (next_p->mm) + load_user_cs_desc(cpu, next_p->mm); /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter > 5) @@ -692,3 +700,41 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long range_end = mm->brk + 0x02000000; return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } + +static void modify_cs(struct mm_struct *mm, unsigned long limit) +{ + mm->context.exec_limit = limit; + set_user_cs(&mm->context.user_cs, limit); + if (mm == current->mm) { + int cpu; + + cpu = get_cpu(); + load_user_cs_desc(cpu, mm); + put_cpu(); + } +} + +void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) +{ + if (limit > mm->context.exec_limit) + modify_cs(mm, limit); +} + +void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) +{ + struct vm_area_struct *vma; + unsigned long limit = PAGE_SIZE; + + if (old_end == mm->context.exec_limit) { + for (vma = mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + modify_cs(mm, limit); + } +} + +void arch_flush_exec_range(struct mm_struct *mm) +{ + mm->context.exec_limit = 0; + set_user_cs(&mm->context.user_cs, 0); +} diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index ce50546..bd6593a 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -2,6 +2,7 @@ #include #include +#include #include DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) @@ -91,6 +92,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) unsigned long cpu; cpu = get_cpu(); + if (current->active_mm) + load_user_cs_desc(cpu, current->active_mm); if (!cpu_isset(cpu, flush_cpumask)) goto out; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a9e7548..af0f8f0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -160,6 +160,67 @@ static int lazy_iobitmap_copy(void) return 0; } + +static inline int +__compare_user_cs_desc(const struct desc_struct *desc1, + const struct desc_struct *desc2) +{ + return ((desc1->limit0 != desc2->limit0) || + (desc1->limit != desc2->limit) || + (desc1->base0 != desc2->base0) || + (desc1->base1 != desc2->base1) || + (desc1->base2 != desc2->base2)); +} + +/* + * lazy-check for CS validity on exec-shield binaries: + * + * the original non-exec stack patch was written by + * Solar Designer . Thanks! + */ +static int +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) +{ + struct desc_struct *desc1, *desc2; + struct vm_area_struct *vma; + unsigned long limit; + + if (current->mm == NULL) + return 0; + + limit = -1UL; + if (current->mm->context.exec_limit != -1UL) { + limit = PAGE_SIZE; + spin_lock(¤t->mm->page_table_lock); + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + vma = get_gate_vma(current); + if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + spin_unlock(¤t->mm->page_table_lock); + if (limit >= TASK_SIZE) + limit = -1UL; + current->mm->context.exec_limit = limit; + } + set_user_cs(¤t->mm->context.user_cs, limit); + + desc1 = ¤t->mm->context.user_cs; + desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; + + if (__compare_user_cs_desc(desc1, desc2)) { + /* + * The CS was not in sync - reload it and retry the + * instruction. If the instruction still faults then + * we won't hit this branch next time around. + */ + load_user_cs_desc(cpu, current->mm); + + return 1; + } + + return 0; +} #endif static void __kprobes @@ -323,6 +393,20 @@ do_general_protection(struct pt_regs *regs, long error_code) if (!user_mode(regs)) goto gp_in_kernel; +#ifdef CONFIG_X86_32 +{ + int cpu; + int ok; + + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + + if (ok) + return; +} +#endif + tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; @@ -934,19 +1027,37 @@ dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) } #ifdef CONFIG_X86_32 +/* + * The fixup code for errors in iret jumps to here (iret_exc). It loses + * the original trap number and erorr code. The bogus trap 32 and error + * code 0 are what the vanilla kernel delivers via: + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) + * + * NOTE: Because of the final "1" in the macro we need to enable interrupts. + * + * In case of a general protection fault in the iret instruction, we + * need to check for a lazy CS update for exec-shield. + */ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { - siginfo_t info; + int ok; + int cpu; + local_irq_enable(); - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = 0; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) - return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + + if (!ok && notify_die(DIE_TRAP, "iret exception", regs, + error_code, 32, SIGSEGV) != NOTIFY_STOP) { + siginfo_t info; + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + do_trap(32, SIGSEGV, "iret exception", 0, error_code, &info); + } } #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef050..a18ae07 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -892,7 +890,10 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); + else #endif + printk(KERN_INFO "Using x86 segment limits to approximate " + "NX protection\n"); /* Enable PSE if available */ if (cpu_has_pse) diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 1241f11..3f2c44c 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (compat) addr = VDSO_HIGH_BASE; else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b58e963..cdc83ce 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -316,6 +316,24 @@ static void xen_set_ldt(const void *addr, unsigned entries) xen_mc_issue(PARAVIRT_LAZY_CPU); } +#ifdef CONFIG_X86_32 +static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + void *gdt; + xmaddr_t mgdt; + u64 descriptor; + struct desc_struct user_cs; + + gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; + mgdt = virt_to_machine(gdt); + + user_cs = mm->context.user_cs; + descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; + + HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); +} +#endif /*CONFIG_X86_32*/ + static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long *frames; @@ -1232,6 +1250,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = xen_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 33b7235..ce1f044 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -754,6 +759,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) goto out_free_dentry; +#ifdef CONFIG_X86_32 + /* + * Turn off the CS limit completely if NX active: + */ + if (executable_stack != EXSTACK_DISABLE_X || nx_enabled) + arch_add_exec_range(current->mm, -1); +#endif + /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; diff --git a/include/linux/resource.h b/include/linux/resource.h index 40fc7e6..68c2549 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -59,6 +59,7 @@ struct rlimit { * (2MB more to cover randomization effects.) */ #define _STK_LIM (10*1024*1024) +#define EXEC_STACK_BIAS (2*1024*1024) /* * GPG2 wants 64kB of mlocked memory, to make sure pass phrases diff --git a/mm/mmap.c b/mm/mmap.c index 00ced3e..931bc3b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -43,6 +44,18 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +/* No sane architecture will #define these to anything else */ +#ifndef arch_add_exec_range +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#endif +#ifndef arch_flush_exec_range +#define arch_flush_exec_range(mm) do { ; } while (0) +#endif +#ifndef arch_remove_exec_range +#define arch_remove_exec_range(mm, limit) do { ; } while (0) +#endif + + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -391,6 +404,8 @@ static inline void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { + if (vma->vm_flags & VM_EXEC) + arch_add_exec_range(mm, vma->vm_end); if (prev) { vma->vm_next = prev->vm_next; prev->vm_next = vma; @@ -493,6 +508,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; + if (vma->vm_flags & VM_EXEC) + arch_remove_exec_range(mm, vma->vm_end); } /* @@ -802,6 +819,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } else /* cases 2, 5, 7 */ vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL); + if (prev->vm_flags & VM_EXEC) + arch_add_exec_range(mm, prev->vm_end); return prev; } @@ -956,7 +975,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); + addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, + prot & PROT_EXEC); if (addr & ~PAGE_MASK) return addr; @@ -1532,6 +1624,14 @@ out: return prev ? prev->vm_next : vma; } +static int over_stack_limit(unsigned long sz) +{ + if (sz < EXEC_STACK_BIAS) + return 0; + return (sz - EXEC_STACK_BIAS) > + current->signal->rlim[RLIMIT_STACK].rlim_cur; +} + /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the @@ -1548,7 +1648,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns return -ENOMEM; /* Stack limit test */ - if (size > rlim[RLIMIT_STACK].rlim_cur) + if (over_stack_limit(size)) return -ENOMEM; /* mlock limit tests */ @@ -1858,10 +1958,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (new_below) + if (new_below) { + unsigned long old_end = vma->vm_end; + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); - else + if (vma->vm_flags & VM_EXEC) + arch_remove_exec_range(mm, old_end); + } else vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); return 0; @@ -2110,6 +2214,7 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); + arch_flush_exec_range(mm); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mprotect.c b/mm/mprotect.c index 258197b..9af91a3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -25,9 +25,14 @@ #include #include #include +#include #include #include +#ifndef arch_remove_exec_range +#define arch_remove_exec_range(mm, limit) do { ; } while (0) +#endif + #ifndef pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { @@ -138,7 +143,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; - unsigned long charged = 0; + unsigned long charged = 0, old_end = vma->vm_end; pgoff_t pgoff; int error; int dirty_accountable = 0; @@ -203,6 +208,9 @@ success: dirty_accountable = 1; } + if (oldflags & VM_EXEC) + arch_remove_exec_range(current->mm, old_end); + mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); diff --git a/mm/mremap.c b/mm/mremap.c index a39b7b9..6bebfde 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) map_flags |= MAP_SHARED; - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, - vma->vm_pgoff, map_flags); + new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, + vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); if (new_addr & ~PAGE_MASK) { ret = new_addr; goto out;