From 7648b1330c335601b7c09c25f77a03cda128fcab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:17:10 +0200 Subject: x86_64: move vdso Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/vdso/.gitignore | 1 + arch/x86/vdso/Makefile | 49 +++++++++++++++ arch/x86/vdso/vclock_gettime.c | 121 +++++++++++++++++++++++++++++++++++ arch/x86/vdso/vdso-note.S | 12 ++++ arch/x86/vdso/vdso-start.S | 2 + arch/x86/vdso/vdso.S | 2 + arch/x86/vdso/vdso.lds.S | 77 +++++++++++++++++++++++ arch/x86/vdso/vextern.h | 16 +++++ arch/x86/vdso/vgetcpu.c | 50 +++++++++++++++ arch/x86/vdso/vma.c | 140 +++++++++++++++++++++++++++++++++++++++++ arch/x86/vdso/voffset.h | 1 + arch/x86/vdso/vvar.c | 12 ++++ 12 files changed, 483 insertions(+) create mode 100644 arch/x86/vdso/.gitignore create mode 100644 arch/x86/vdso/Makefile create mode 100644 arch/x86/vdso/vclock_gettime.c create mode 100644 arch/x86/vdso/vdso-note.S create mode 100644 arch/x86/vdso/vdso-start.S create mode 100644 arch/x86/vdso/vdso.S create mode 100644 arch/x86/vdso/vdso.lds.S create mode 100644 arch/x86/vdso/vextern.h create mode 100644 arch/x86/vdso/vgetcpu.c create mode 100644 arch/x86/vdso/vma.c create mode 100644 arch/x86/vdso/voffset.h create mode 100644 arch/x86/vdso/vvar.c (limited to 'arch/x86/vdso') diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore new file mode 100644 index 00000000000..f8b69d84238 --- /dev/null +++ b/arch/x86/vdso/.gitignore @@ -0,0 +1 @@ +vdso.lds diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile new file mode 100644 index 00000000000..8d03de029d9 --- /dev/null +++ b/arch/x86/vdso/Makefile @@ -0,0 +1,49 @@ +# +# x86-64 vDSO. 
+# + +# files to link into the vdso +# vdso-start.o has to be first +vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o + +# files to link into kernel +obj-y := vma.o vdso.o vdso-syms.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o + +# The DSO images are built using a special linker script. +quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@ + +export CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 +SYSCFLAGS_vdso.so = $(vdso-flags) + +$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so + +$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,syscall) + +CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 + +$(obj)/vclock_gettime.o: CFLAGS = $(CFL) +$(obj)/vgetcpu.o: CFLAGS = $(CFL) + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. +extra-y += vdso-syms.o +$(obj)/built-in.o: $(obj)/vdso-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o + +SYSCFLAGS_vdso-syms.o = -r -d +$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,syscall) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c new file mode 100644 index 00000000000..5b54cdfb2b0 --- /dev/null +++ b/arch/x86/vdso/vclock_gettime.c @@ -0,0 +1,121 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime and gettimeofday. + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. 
+ * Also alternative() doesn't work. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vextern.h" + +#define gtod vdso_vsyscall_gtod_data + +static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); + return ret; +} + +static inline long vgetns(void) +{ + long v; + cycles_t (*vread)(void); + vread = gtod->clock.vread; + v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; + return (v * gtod->clock.mult) >> gtod->clock.shift; +} + +static noinline int do_realtime(struct timespec *ts) +{ + unsigned long seq, ns; + do { + seq = read_seqbegin(&gtod->lock); + ts->tv_sec = gtod->wall_time_sec; + ts->tv_nsec = gtod->wall_time_nsec; + ns = vgetns(); + } while (unlikely(read_seqretry(&gtod->lock, seq))); + timespec_add_ns(ts, ns); + return 0; +} + +/* Copy of the version in kernel/time.c which we cannot directly access */ +static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static noinline int do_monotonic(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(&gtod->lock); + secs = gtod->wall_time_sec; + ns = gtod->wall_time_nsec + vgetns(); + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(&gtod->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + +int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + switch (clock) { + case CLOCK_REALTIME: + return do_realtime(ts); + case CLOCK_MONOTONIC: + return do_monotonic(ts); + } + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, 
struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + long ret; + if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { + BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != + offsetof(struct timespec, tv_nsec) || + sizeof(*tv) != sizeof(struct timespec)); + do_realtime((struct timespec *)tv); + tv->tv_usec /= 1000; + if (unlikely(tz != NULL)) { + /* This relies on gcc inlining the memcpy. We'll notice + if it ever fails to do so. */ + memcpy(tz, &gtod->sys_tz, sizeof(struct timezone)); + } + return 0; + } + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/vdso/vdso-note.S new file mode 100644 index 00000000000..79a071e4357 --- /dev/null +++ b/arch/x86/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S new file mode 100644 index 00000000000..2dc2cdb84d6 --- /dev/null +++ b/arch/x86/vdso/vdso-start.S @@ -0,0 +1,2 @@ + .globl vdso_kernel_start +vdso_kernel_start: diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S new file mode 100644 index 00000000000..4b1620a1529 --- /dev/null +++ b/arch/x86/vdso/vdso.S @@ -0,0 +1,2 @@ + .section ".vdso","a" + .incbin "arch/x86/vdso/vdso.so" diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S new file mode 100644 index 00000000000..b9a60e665d0 --- /dev/null +++ b/arch/x86/vdso/vdso.lds.S @@ -0,0 +1,77 @@ +/* + * Linker script for vsyscall DSO. 
The vsyscall page is an ELF shared + * object prelinked to its virtual address, and with only one read-only + * segment (that fits in one page). This script controls its layout. + */ +#include +#include "voffset.h" + +#define VDSO_PRELINK 0xffffffffff700000 + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + /* This linker script is used both with -r and with -shared. + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ + . = VDSO_PRELINK + VDSO_TEXT_OFFSET; + + .text : { *(.text) } :text + .text.ptr : { *(.text.ptr) } :text + . = VDSO_PRELINK + 0x900; + .data : { *(.data) } :text + .bss : { *(.bss) } :text + + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + .note : { *(.note.*) } :text :note + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .dynamic : { *(.dynamic) } :text :dynamic + .useless : { + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.dynbss) + *(.gnu.linkonce.b.*) + } :text +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. 
+ */ +VERSION +{ + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + local: *; + }; +} diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h new file mode 100644 index 00000000000..1683ba2ae3e --- /dev/null +++ b/arch/x86/vdso/vextern.h @@ -0,0 +1,16 @@ +#ifndef VEXTERN +#include +#define VEXTERN(x) \ + extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); +#endif + +#define VMAGIC 0xfeedbabeabcdefabUL + +/* Any kernel variables used in the vDSO must be exported in the main + kernel's vmlinux.lds.S/vsyscall.h/proper __section and + put into vextern.h and be referenced as a pointer with vdso prefix. + The main kernel later fills in the values. */ + +VEXTERN(jiffies) +VEXTERN(vgetcpu_mode) +VEXTERN(vsyscall_gtod_data) diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c new file mode 100644 index 00000000000..91f6e85d0fc --- /dev/null +++ b/arch/x86/vdso/vgetcpu.c @@ -0,0 +1,50 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include +#include +#include +#include +#include +#include +#include "vextern.h" + +long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +{ + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. 
*/ + if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) { + p = tcache->blob[1]; + } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->blob[0] = j; + tcache->blob[1] = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c new file mode 100644 index 00000000000..ff9333e5fb0 --- /dev/null +++ b/arch/x86/vdso/vma.c @@ -0,0 +1,140 @@ +/* + * Set up the VMAs to tell the VM about the vDSO. + * Copyright 2007 Andi Kleen, SUSE Labs. + * Subject to the GPL, v.2 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "voffset.h" + +int vdso_enabled = 1; + +#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; +#include "vextern.h" +#undef VEXTERN + +extern char vdso_kernel_start[], vdso_start[], vdso_end[]; +extern unsigned short vdso_sync_cpuid; + +struct page **vdso_pages; + +static inline void *var_ref(void *vbase, char *var, char *name) +{ + unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; + void *p = vbase + offset; + if (*(void **)p != (void *)VMAGIC) { + printk("VDSO: variable %s broken\n", name); + vdso_enabled = 0; + } + return p; +} + +static int __init init_vdso_vars(void) +{ + int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + char *vbase; + + vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); + if (!vdso_pages) + goto oom; + for (i = 0; i < npages; i++) { + struct page *p; + p = alloc_page(GFP_KERNEL); + if (!p) + goto oom; + vdso_pages[i] = p; + copy_page(page_address(p), vdso_start + i*PAGE_SIZE); + } + + vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); + if 
(!vbase) + goto oom; + + if (memcmp(vbase, "\177ELF", 4)) { + printk("VDSO: I'm broken; not ELF\n"); + vdso_enabled = 0; + } + +#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) +#define VEXTERN(x) \ + V(vdso_ ## x) = &__ ## x; +#include "vextern.h" +#undef VEXTERN + return 0; + + oom: + printk("Cannot allocate vdso\n"); + vdso_enabled = 0; + return -ENOMEM; +} +__initcall(init_vdso_vars); + +struct linux_binprm; + +/* Put the vdso above the (randomized) stack with another randomized offset. + This way there is no hole in the middle of address space. + To save memory make sure it is still in the same PTE as the stack top. + This doesn't give that many random bits */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ + unsigned long addr, end; + unsigned offset; + end = (start + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE64) + end = TASK_SIZE64; + end -= len; + /* This loses some more bits than a modulo, but is cheaper */ + offset = get_random_int() & (PTRS_PER_PTE - 1); + addr = start + (offset << PAGE_SHIFT); + if (addr >= end) + addr = end; + return addr; +} + +/* Setup a VMA at program startup for the vsyscall page. 
+ Not called for compat tasks */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); + + if (!vdso_enabled) + return 0; + + down_write(&mm->mmap_sem); + addr = vdso_addr(mm->start_stack, len); + addr = get_unmapped_area(NULL, addr, len, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + ret = install_special_mapping(mm, addr, len, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdso_pages); + if (ret) + goto up_fail; + + current->mm->context.vdso = (void *)addr; +up_fail: + up_write(&mm->mmap_sem); + return ret; +} + +static __init int vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("vdso=", vdso_setup); diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h new file mode 100644 index 00000000000..4af67c79085 --- /dev/null +++ b/arch/x86/vdso/voffset.h @@ -0,0 +1 @@ +#define VDSO_TEXT_OFFSET 0x600 diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c new file mode 100644 index 00000000000..6fc22219a47 --- /dev/null +++ b/arch/x86/vdso/vvar.c @@ -0,0 +1,12 @@ +/* Define pointer to external vDSO variables. + These are part of the vDSO. The kernel fills in the real addresses + at boot time. This is done because when the vdso is linked the + kernel isn't yet and we don't know the final addresses. */ +#include +#include +#include +#include +#include + +#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC; +#include "vextern.h" -- cgit v1.2.3