From: mpi
Date: Fri, 28 Apr 2017 13:50:54 +0000 (+0000)
Subject: Add futex(2) syscall based on a sane subset of its Linux equivalent.
X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=004d4497806b353140df0424d766c0d83fcf31de;p=openbsd

Add futex(2) syscall based on a sane subset of its Linux equivalent.

The syscall is marked NOLOCK and only FUTEX_WAIT grabs the KERNEL_LOCK()
because of PCATCH and the signal nightmare.

Serialization of threads is currently done with a global & exclusive
rwlock.

Note that the current implementation still use copyin(9) which is not
guaranteed to be atomic.  Committing now such that remaining issues can
be addressed in-tree.

With inputs from guenther@, kettenis@ and visa@.

ok deraadt@, visa@
---

diff --git a/sys/conf/files b/sys/conf/files
index cc86def23f2..e9d35078631 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,4 +1,4 @@
-#	$OpenBSD: files,v 1.641 2017/04/20 13:57:30 visa Exp $
+#	$OpenBSD: files,v 1.642 2017/04/28 13:50:54 mpi Exp $
 #	$NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $
 
 #	@(#)files.newconf	7.5 (Berkeley) 5/10/93
@@ -701,6 +701,7 @@ file kern/subr_prof.c
 file	kern/subr_userconf.c		boot_config
 file	kern/subr_witness.c		witness
 file	kern/subr_xxx.c
+file	kern/sys_futex.c
 file	kern/sys_generic.c
 file	kern/sys_pipe.c
 file	kern/sys_process.c		ptrace
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 9b6291d4352..83550d8c55c 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: init_main.c,v 1.268 2017/04/20 12:59:36 visa Exp $	*/
+/*	$OpenBSD: init_main.c,v 1.269 2017/04/28 13:50:55 mpi Exp $	*/
 /*	$NetBSD: init_main.c,v 1.84.4.1 1996/06/02 09:08:06 mrg Exp $	*/
 
 /*
@@ -144,6 +144,7 @@ void db_ctf_init(void);
 void	prof_init(void);
 void	init_exec(void);
 void	kqueue_init(void);
+void	futex_init(void);
 void	taskq_init(void);
 void	timeout_proc_init(void);
 void	pool_gc_pages(void *);
@@ -264,6 +265,11 @@ main(void *framep)
 	 */
 	kqueue_init();
 
+	/*
+	 * Initialize futexes.
+	 */
+	futex_init();
+
 	/* Create credentials. */
 	p->p_ucred = crget();
 	p->p_ucred->cr_ngroups = 1;	/* group 0 */
diff --git a/sys/kern/kern_pledge.c b/sys/kern/kern_pledge.c
index d99a60d2a1e..4bea704b4cf 100644
--- a/sys/kern/kern_pledge.c
+++ b/sys/kern/kern_pledge.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_pledge.c,v 1.205 2017/04/20 15:21:53 deraadt Exp $	*/
+/*	$OpenBSD: kern_pledge.c,v 1.206 2017/04/28 13:50:55 mpi Exp $	*/
 
 /*
  * Copyright (c) 2015 Nicholas Marriott
@@ -259,6 +259,7 @@ const uint64_t pledge_syscalls[SYS_MAXSYSCALL] = {
 	[SYS___tfork] = PLEDGE_STDIO,
 	[SYS_sched_yield] = PLEDGE_STDIO,
 	[SYS___thrsleep] = PLEDGE_STDIO,
+	[SYS_futex] = PLEDGE_STDIO,
 	[SYS___thrwakeup] = PLEDGE_STDIO,
 	[SYS___threxit] = PLEDGE_STDIO,
 	[SYS___thrsigdivert] = PLEDGE_STDIO,
diff --git a/sys/kern/sys_futex.c b/sys/kern/sys_futex.c
new file mode 100644
index 00000000000..0db6a10c7f3
--- /dev/null
+++ b/sys/kern/sys_futex.c
@@ -0,0 +1,303 @@
+/*	$OpenBSD: sys_futex.c,v 1.1 2017/04/28 13:50:55 mpi Exp $ */
+
+/*
+ * Copyright (c) 2016-2017 Martin Pieuchot
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/syscallargs.h>
+#include <sys/pool.h>
+#include <sys/time.h>
+#include <sys/rwlock.h>
+#include <sys/futex.h>
+
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+/*
+ * Kernel representation of a futex.
+ */
+struct futex {
+	LIST_ENTRY(futex)	 ft_list;	/* list of all futexes */
+	TAILQ_HEAD(, proc)	 ft_threads;	/* sleeping queue */
+	uint32_t		*ft_uaddr;	/* userspace address */
+	pid_t			 ft_pid;	/* process identifier */
+	unsigned int		 ft_refcnt;	/* # of references */
+};
+
+/* Syscall helpers. */
+int	 futex_wait(uint32_t *, uint32_t, const struct timespec *);
+int	 futex_wake(uint32_t *, uint32_t);
+int	 futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t);
+
+/* Flags for futex_get(). */
+#define FT_CREATE	0x1	/* Create a futex if it doesn't exist. */
+
+struct futex *futex_get(uint32_t *, int);
+void	 futex_put(struct futex *);
+
+/*
+ * The global futex lock serializes futex(2) calls such that no wakeup
+ * events are lost, and protects the global list of all futexes and
+ * their states.
+ */
+struct rwlock			ftlock = RWLOCK_INITIALIZER("futex");
+static LIST_HEAD(, futex)	ftlist;
+struct pool			ftpool;
+
+
+void
+futex_init(void)
+{
+	pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE, 0, "futexpl",
+	    NULL);
+}
+
+/*
+ * futex(2) system call: dispatch ``op'' to its helper while holding the
+ * global futex lock.  The helper's result, an errno for FUTEX_WAIT or a
+ * thread count otherwise, is passed back through ``retval''.
+ *
+ * Return 0, or ENOSYS for an unsupported ``op''.  The errno is returned
+ * as is because -1 is ERESTART in the kernel and would restart the call.
+ */
+int
+sys_futex(struct proc *p, void *v, register_t *retval)
+{
+	struct sys_futex_args /* {
+		syscallarg(uint32_t *) f;
+		syscallarg(int) op;
+		syscallarg(int) val;
+		syscallarg(const struct timespec *) timeout;
+		syscallarg(uint32_t *) g;
+	} */ *uap = v;
+	uint32_t *uaddr = SCARG(uap, f);
+	int op = SCARG(uap, op);
+	uint32_t val = SCARG(uap, val);
+	const struct timespec *timeout = SCARG(uap, timeout);
+	void *g = SCARG(uap, g);
+	int error = 0;
+
+	switch (op) {
+	case FUTEX_WAIT:
+		KERNEL_LOCK();
+		rw_enter_write(&ftlock);
+		*retval = futex_wait(uaddr, val, timeout);
+		rw_exit_write(&ftlock);
+		KERNEL_UNLOCK();
+		break;
+	case FUTEX_WAKE:
+		rw_enter_write(&ftlock);
+		*retval = futex_wake(uaddr, val);
+		rw_exit_write(&ftlock);
+		break;
+	case FUTEX_REQUEUE:
+		rw_enter_write(&ftlock);
+		*retval = futex_requeue(uaddr, val, g, (unsigned long)timeout);
+		rw_exit_write(&ftlock);
+		break;
+	default:
+		error = ENOSYS;
+		break;
+	}
+
+	return error;
+}
+
+/*
+ * Return an existing futex matching userspace address ``uaddr''.
+ *
+ * If such futex does not exist and FT_CREATE is given, create it.
+ */
+struct futex *
+futex_get(uint32_t *uaddr, int flag)
+{
+	struct futex *f;
+
+	rw_assert_wrlock(&ftlock);
+
+	LIST_FOREACH(f, &ftlist, ft_list) {
+		if (f->ft_uaddr == uaddr && f->ft_pid == curproc->p_p->ps_pid) {
+			f->ft_refcnt++;
+			break;
+		}
+	}
+
+	if ((f == NULL) && (flag & FT_CREATE)) {
+		/*
+		 * We rely on the rwlock to ensure that no other thread
+		 * creates the same futex.
+		 */
+		f = pool_get(&ftpool, PR_WAITOK);
+		TAILQ_INIT(&f->ft_threads);
+		f->ft_uaddr = uaddr;
+		f->ft_pid = curproc->p_p->ps_pid;
+		f->ft_refcnt = 1;
+		LIST_INSERT_HEAD(&ftlist, f, ft_list);
+	}
+
+	return f;
+}
+
+/*
+ * Release a given futex.
+ */
+void
+futex_put(struct futex *f)
+{
+	rw_assert_wrlock(&ftlock);
+
+	KASSERT(f->ft_refcnt > 0);
+
+	--f->ft_refcnt;
+	if (f->ft_refcnt == 0) {
+		KASSERT(TAILQ_EMPTY(&f->ft_threads));
+		LIST_REMOVE(f, ft_list);
+		pool_put(&ftpool, f);
+	}
+}
+
+/*
+ * Put the current thread on the sleep queue of the futex at address
+ * ``uaddr''.  Let it sleep for the specified ``timeout'' time, or
+ * indefinitely if the argument is NULL.
+ *
+ * Return 0 once woken up, EAGAIN if the futex no longer holds ``val'',
+ * EINVAL for a malformed ``timeout'', ETIMEDOUT when it expires, EINTR
+ * when the sleep is interrupted, or the error reported by copyin(9).
+ */
+int
+futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout)
+{
+	struct proc *p = curproc;
+	struct futex *f;
+	uint64_t to_ticks = 0;
+	uint32_t cval;
+	int error;
+
+	/*
+	 * After reading the value a race is still possible but
+	 * we deal with it by serializing all futex syscalls.
+	 */
+	rw_assert_wrlock(&ftlock);
+
+	/*
+	 * Read user space futex value
+	 *
+	 * XXX copyin(9) is not guaranteed to be atomic.
+	 */
+	if ((error = copyin(uaddr, &cval, sizeof(cval))))
+		return error;
+
+	/* If the value changed, stop here. */
+	if (cval != val)
+		return EAGAIN;
+
+	if (timeout != NULL) {
+		struct timespec ts;
+
+		if ((error = copyin(timeout, &ts, sizeof(ts))))
+			return error;
+#ifdef KTRACE
+		if (KTRPOINT(p, KTR_STRUCT))
+			ktrabstimespec(p, &ts);
+#endif
+		/* A negative or malformed timeout would yield bogus ticks. */
+		if (ts.tv_sec < 0 || ts.tv_nsec < 0 ||
+		    ts.tv_nsec >= 1000000000L)
+			return EINVAL;
+		to_ticks = (uint64_t)hz * ts.tv_sec +
+		    (ts.tv_nsec + tick * 1000 - 1) / (tick * 1000) + 1;
+		if (to_ticks > INT_MAX)
+			to_ticks = INT_MAX;
+	}
+
+	f = futex_get(uaddr, FT_CREATE);
+	TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
+	p->p_futex = f;
+
+	error = rwsleep(p, &ftlock, PUSER|PCATCH, "fsleep", (int)to_ticks);
+	if (error == ERESTART)
+		error = EINTR;
+	else if (error == EWOULDBLOCK) {
+		/* A race occurred between a wakeup and a timeout. */
+		if (p->p_futex == NULL)
+			error = 0;
+		else
+			error = ETIMEDOUT;
+	}
+
+	/* Remove ourselves if we haven't been awakened. */
+	if ((f = p->p_futex) != NULL) {
+		p->p_futex = NULL;
+		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
+		futex_put(f);
+	}
+
+	return error;
+}
+
+/*
+ * Wakeup at most ``n'' sibling threads sleeping on a futex at address
+ * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
+ * address ``uaddr2''.  Return the number of threads dequeued.
+ */
+int
+futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m)
+{
+	struct futex *f, *g;
+	struct proc *p;
+	uint32_t count = 0;
+
+	rw_assert_wrlock(&ftlock);
+
+	f = futex_get(uaddr, 0);
+	if (f == NULL)
+		return 0;
+
+	while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
+		p->p_futex = NULL;
+		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
+		futex_put(f);
+
+		if (count < n) {
+			wakeup_one(p);
+		} else if (uaddr2 != NULL) {
+			g = futex_get(uaddr2, FT_CREATE);
+			TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
+			p->p_futex = g;
+		}
+		count++;
+	}
+
+	futex_put(f);
+
+	return count;
+}
+
+/*
+ * Wakeup at most ``n'' sibling threads sleeping on a futex at address
+ * ``uaddr''.  Return the number of threads woken up.
+ */
+int
+futex_wake(uint32_t *uaddr, uint32_t n)
+{
+	return futex_requeue(uaddr, n, NULL, 0);
+}
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 0ce9ba5238a..15cf55de433 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -1,4 +1,4 @@
-;	$OpenBSD: syscalls.master,v 1.175 2017/04/13 04:06:46 guenther Exp $
+;	$OpenBSD: syscalls.master,v 1.176 2017/04/28 13:50:55 mpi Exp $
 ;	$NetBSD: syscalls.master,v 1.32 1996/04/23 10:24:21 mycroft Exp $
 
 ;	@(#)syscalls.master	8.2 (Berkeley) 1/13/94
@@ -187,7 +187,8 @@
 			    const gid_t *gidset); }
 81	STD		{ int sys_getpgrp(void); }
 82	STD		{ int sys_setpgid(pid_t pid, pid_t pgid); }
-83	OBSOL		osendsyslog
+83	STD NOLOCK	{ int sys_futex(uint32_t *f, int op, int val, \
+			    const struct timespec *timeout, uint32_t *g); }
 84	STD		{ int sys_utimensat(int fd, const char *path, \
 			    const struct timespec *times, int flag); }
 85	STD		{ int sys_futimens(int fd, \
diff --git a/sys/sys/futex.h b/sys/sys/futex.h
new file mode 100644
index 00000000000..59b9a65205d
--- /dev/null
+++ b/sys/sys/futex.h
@@ -0,0 +1,35 @@
+/*	$OpenBSD: futex.h,v 1.1 2017/04/28 13:50:55 mpi Exp $ */
+
+/*
+ * Copyright (c) 2016 Martin Pieuchot
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef	_SYS_FUTEX_H_
+#define	_SYS_FUTEX_H_
+
+#ifndef _KERNEL
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+int futex(volatile uint32_t *, int, int, const struct timespec *,
+    volatile uint32_t *);
+__END_DECLS
+#endif /* ! _KERNEL */
+
+#define	FUTEX_WAIT		1
+#define	FUTEX_WAKE		2
+#define	FUTEX_REQUEUE		3
+
+#endif	/* _SYS_FUTEX_H_ */
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 5cb1cc1b9de..1558ebd03ea 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: proc.h,v 1.238 2017/04/20 12:59:36 visa Exp $	*/
+/*	$OpenBSD: proc.h,v 1.239 2017/04/28 13:50:55 mpi Exp $	*/
 /*	$NetBSD: proc.h,v 1.44 1996/04/22 01:23:21 christos Exp $	*/
 
 /*-
@@ -280,7 +280,10 @@ struct proc {
 	LIST_ENTRY(proc) p_list;	/* List of all threads. */
 
 	struct	process *p_p;		/* The process of this thread. */
-	TAILQ_ENTRY(proc) p_thr_link;/* Threads in a process linkage. */
+	TAILQ_ENTRY(proc) p_thr_link;	/* Threads in a process linkage. */
+
+	TAILQ_ENTRY(proc) p_fut_link;	/* Threads in a futex linkage. */
+	struct	futex	*p_futex;	/* Current sleeping futex. */
 
 	/* substructures: */
 	struct	filedesc *p_fd;		/* copy of p_p->ps_fd */