From ace67ae868bba84038a6fddc4bb673d7637c8652 Mon Sep 17 00:00:00 2001
From: cheloha
Date: Sun, 26 Feb 2023 23:00:42 +0000
Subject: [PATCH] clockintr: add a kernel-facing API

We need an API for creating, scheduling, and rescheduling clock
interrupts.

- Add struct clockintr, a schedulable clock interrupt callback.

- Add clockintr_establish(). Allocates a new struct clockintr and
  binds it to the given clockintr_queue.

- Add clockintr_expiration(). Returns the clockintr's absolute
  expiration uptime.

- Add clockintr_nsecuptime(). Returns the clockintr's parent queue's
  cached uptime. Using a cached timestamp is cheaper than calling
  nsecuptime(9) repeatedly when we don't absolutely need a fresh value.

- Add clockintr_schedule(). Schedules the clock interrupt to run at
  or after the given absolute uptime.

- Add clockintr_advance(). Advances the clock interrupt's expiration
  in increments of the given period until it lands in the future,
  relative to the parent queue's cached uptime.

With the above pieces in place we can push most of the scheduling code
for hardclock()/statclock()/schedclock() from clockintr_dispatch() into
the wrapper functions clockintr_hardclock(), clockintr_statclock(), and
clockintr_schedclock(). These wrappers are temporary. I don't want to
muck up the wrapped functions while things are still moving around.

For the moment these interfaces are internal to kern_clockintr.c. In a
later patch we will move the prototypes into <sys/clockintr.h> so
anyone can use them. We first need to add a data structure for sorting
the clockintr structs. We also need to add a mutex to clockintr_queue
to allow arbitrary threads to safely manipulate clock interrupts
established on other CPUs.

Shown on hackers@. Tweaked by mlarkin@.

ok mlarkin@, "no objections" kettenis@
---
 sys/kern/kern_clockintr.c | 243 +++++++++++++++++++++++++++-----------
 sys/sys/clockintr.h       |  24 +++-
 2 files changed, 192 insertions(+), 75 deletions(-)

diff --git a/sys/kern/kern_clockintr.c b/sys/kern/kern_clockintr.c
index 3a15e2b67f3..13dbb4e928e 100644
--- a/sys/kern/kern_clockintr.c
+++ b/sys/kern/kern_clockintr.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_clockintr.c,v 1.2 2022/12/31 00:48:53 cheloha Exp $ */
+/*	$OpenBSD: kern_clockintr.c,v 1.3 2023/02/26 23:00:42 cheloha Exp $ */
 /*
  * Copyright (c) 2003 Dale Rahn
  * Copyright (c) 2020 Mark Kettenis
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -51,6 +52,15 @@ uint32_t prof_avg;	/* [I] average profhz period (ns) */
 uint32_t prof_min;	/* [I] minimum profhz period (ns) */
 uint32_t prof_mask;	/* [I] set of allowed offsets */
 
+uint64_t clockintr_advance(struct clockintr *, uint64_t);
+struct clockintr *clockintr_establish(struct clockintr_queue *,
+    void (*)(struct clockintr *, void *));
+uint64_t clockintr_expiration(const struct clockintr *);
+void clockintr_hardclock(struct clockintr *, void *);
+uint64_t clockintr_nsecuptime(const struct clockintr *);
+void clockintr_schedclock(struct clockintr *, void *);
+void clockintr_schedule(struct clockintr *, uint64_t);
+void clockintr_statclock(struct clockintr *, void *);
 void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
 uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);
 
@@ -92,13 +102,28 @@ clockintr_init(u_int flags)
 void
 clockintr_cpu_init(const struct intrclock *ic)
 {
-	uint64_t multiplier, now;
+	uint64_t multiplier = 0, offset;
 	struct cpu_info *ci = curcpu();
 	struct clockintr_queue *cq = &ci->ci_queue;
 
 	KASSERT(ISSET(clockintr_flags, CL_INIT));
 
 	if (!ISSET(cq->cq_flags, CL_CPU_INIT)) {
+		cq->cq_next = 0;
+		cq->cq_hardclock = clockintr_establish(cq, clockintr_hardclock);
+		if (cq->cq_hardclock == NULL)
+			panic("%s: failed to establish hardclock", __func__);
+		cq->cq_statclock = clockintr_establish(cq, clockintr_statclock);
+		if (cq->cq_statclock == NULL)
+			panic("%s: failed to establish statclock", __func__);
+		if (schedhz != 0) {
+			cq->cq_schedclock = clockintr_establish(cq,
+			    clockintr_schedclock);
+			if (cq->cq_schedclock == NULL) {
+				panic("%s: failed to establish schedclock",
+				    __func__);
+			}
+		}
 		if (ic != NULL) {
 			cq->cq_intrclock = *ic;
 			SET(cq->cq_flags, CL_CPU_INTRCLOCK);
@@ -111,14 +136,12 @@ clockintr_cpu_init(const struct intrclock *ic)
 	 * the hardclock and statclock so they don't all happen at once.
 	 * If we have no intrclock it doesn't matter, we have no control
 	 * anyway. The primary CPU's starting offset is always zero, so
-	 * set multiplier to zero.
+	 * leave the multiplier zero.
 	 */
 	if (!CPU_IS_PRIMARY(ci) && ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
 		multiplier = CPU_INFO_UNIT(ci);
-	else
-		multiplier = 0;
 
-	now = nsecuptime();
+	cq->cq_uptime = nsecuptime();
 
 	/*
 	 * The first time we do this, the primary CPU cannot skip any
@@ -126,19 +149,21 @@ clockintr_cpu_init(const struct intrclock *ic)
 	 * the global tick value is advanced during inittodr(9) on our
 	 * behalf.
 	 */
-	if (!CPU_IS_PRIMARY(ci) || ISSET(cq->cq_flags, CL_CPU_INIT)) {
-		cq->cq_next_hardclock = hardclock_period / ncpus * multiplier;
-		nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
-	}
+	offset = hardclock_period / ncpus * multiplier;
+	clockintr_schedule(cq->cq_hardclock, offset);
+	if (!CPU_IS_PRIMARY(ci) || ISSET(cq->cq_flags, CL_CPU_INIT))
+		clockintr_advance(cq->cq_hardclock, hardclock_period);
 	/*
 	 * We can always advance the statclock and schedclock.
 	 */
-	cq->cq_next_statclock = statclock_avg / ncpus * multiplier;
-	nsec_advance(&cq->cq_next_statclock, statclock_avg, now);
+	offset = statclock_avg / ncpus * multiplier;
+	clockintr_schedule(cq->cq_statclock, offset);
+	clockintr_advance(cq->cq_statclock, statclock_avg);
 
 	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
-		cq->cq_next_schedclock = schedclock_period / ncpus * multiplier;
-		nsec_advance(&cq->cq_next_schedclock, schedclock_period, now);
+		offset = schedclock_period / ncpus * multiplier;
+		clockintr_schedule(cq->cq_schedclock, offset);
+		clockintr_advance(cq->cq_schedclock, schedclock_period);
 	}
 
 	SET(cq->cq_flags, CL_CPU_INIT);
@@ -164,12 +189,10 @@ clockintr_trigger(void)
 int
 clockintr_dispatch(void *frame)
 {
-	uint64_t count, i, lateness, now, run = 0, start;
+	uint64_t lateness, run = 0, start;
 	struct cpu_info *ci = curcpu();
 	struct clockintr_queue *cq = &ci->ci_queue;
-	struct proc *p = curproc;
-	uint32_t mask, min, off;
-	u_int gen, ogen;
+	u_int ogen;
 
 	if (cq->cq_dispatch != 0)
 		panic("%s: recursive dispatch", __func__);
@@ -182,66 +205,45 @@ clockintr_dispatch(void *frame)
 	 * If we arrived too early we have nothing to do.
 	 */
 	start = nsecuptime();
-	now = start;
-	if (now < cq->cq_next)
+	cq->cq_uptime = start;
+	if (cq->cq_uptime < cq->cq_next)
 		goto done;
-	lateness = now - cq->cq_next;
+	lateness = start - cq->cq_next;
 
 	/*
 	 * Dispatch expired events.
 	 */
 again:
 	/* hardclock */
-	count = nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
-	for (i = 0; i < count; i++)
-		hardclock(frame);
-	run += count;
+	if (cq->cq_hardclock->cl_expiration <= cq->cq_uptime) {
+		cq->cq_hardclock->cl_func(cq->cq_hardclock, frame);
+		run++;
+	}
 
 	/* statclock */
-	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
-		do {
-			gen = statclock_gen;
-			membar_consumer();
-			min = statclock_min;
-			mask = statclock_mask;
-			membar_consumer();
-		} while (gen == 0 || gen != statclock_gen);
-		count = 0;
-		while (cq->cq_next_statclock <= now) {
-			while ((off = (random() & mask)) == 0)
-				continue;
-			cq->cq_next_statclock += min + off;
-			count++;
-		}
-	} else {
-		count = nsec_advance(&cq->cq_next_statclock, statclock_avg,
-		    now);
+	if (cq->cq_statclock->cl_expiration <= cq->cq_uptime) {
+		cq->cq_statclock->cl_func(cq->cq_statclock, frame);
+		run++;
 	}
-	for (i = 0; i < count; i++)
-		statclock(frame);
-	run += count;
 
 	/* schedclock */
 	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
-		count = nsec_advance(&cq->cq_next_schedclock,
-		    schedclock_period, now);
-		if (p != NULL) {
-			for (i = 0; i < count; i++)
-				schedclock(p);
+		if (cq->cq_schedclock->cl_expiration <= cq->cq_uptime) {
+			cq->cq_schedclock->cl_func(cq->cq_schedclock, frame);
+			run++;
 		}
-		run += count;
 	}
 
 	/* Run the dispatch again if the next event has already expired. */
-	cq->cq_next = cq->cq_next_hardclock;
-	if (cq->cq_next_statclock < cq->cq_next)
-		cq->cq_next = cq->cq_next_statclock;
+	cq->cq_next = cq->cq_hardclock->cl_expiration;
+	if (cq->cq_statclock->cl_expiration < cq->cq_next)
+		cq->cq_next = cq->cq_statclock->cl_expiration;
 	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
-		if (cq->cq_next_schedclock < cq->cq_next)
-			cq->cq_next = cq->cq_next_schedclock;
+		if (cq->cq_schedclock->cl_expiration < cq->cq_next)
+			cq->cq_next = cq->cq_schedclock->cl_expiration;
 	}
-	now = nsecuptime();
-	if (cq->cq_next <= now)
+	cq->cq_uptime = nsecuptime();
+	if (cq->cq_next <= cq->cq_uptime)
 		goto again;
 
 	/*
@@ -250,20 +252,20 @@ done:
 	/* Rearm the interrupt clock if we have one. */
 	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
-		intrclock_rearm(&cq->cq_intrclock, cq->cq_next - now);
+		intrclock_rearm(&cq->cq_intrclock, cq->cq_next - cq->cq_uptime);
 
 	/* Update our stats. */
 	ogen = cq->cq_gen;
 	cq->cq_gen = 0;
 	membar_producer();
-	cq->cq_stat.cs_dispatched += now - start;
+	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
 	if (run > 0) {
 		cq->cq_stat.cs_lateness += lateness;
 		cq->cq_stat.cs_prompt++;
 		cq->cq_stat.cs_run += run;
 	} else {
 		cq->cq_stat.cs_early++;
-		cq->cq_stat.cs_earliness += cq->cq_next - now;
+		cq->cq_stat.cs_earliness += cq->cq_next - cq->cq_uptime;
 	}
 	membar_producer();
 	cq->cq_gen = MAX(1, ogen + 1);
@@ -275,6 +277,39 @@ done:
 	return run > 0;
 }
 
+uint64_t
+clockintr_advance(struct clockintr *cl, uint64_t period)
+{
+	return nsec_advance(&cl->cl_expiration, period,
+	    cl->cl_queue->cq_uptime);
+}
+
+struct clockintr *
+clockintr_establish(struct clockintr_queue *cq,
+    void (*func)(struct clockintr *, void *))
+{
+	struct clockintr *cl;
+
+	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (cl == NULL)
+		return NULL;
+	cl->cl_func = func;
+	cl->cl_queue = cq;
+	return cl;
+}
+
+uint64_t
+clockintr_expiration(const struct clockintr *cl)
+{
+	return cl->cl_expiration;
+}
+
+void
+clockintr_schedule(struct clockintr *cl, uint64_t expiration)
+{
+	cl->cl_expiration = expiration;
+}
+
 /*
  * Compute the period (avg) for the given frequency and a range around
  * that period. The range is [min + 1, min + mask]. The range is used
@@ -339,6 +374,67 @@ clockintr_setstatclockrate(int freq)
 	mtx_leave(&clockintr_mtx);
 }
 
+uint64_t
+clockintr_nsecuptime(const struct clockintr *cl)
+{
+	return cl->cl_queue->cq_uptime;
+}
+
+void
+clockintr_hardclock(struct clockintr *cl, void *frame)
+{
+	uint64_t count, i;
+
+	count = clockintr_advance(cl, hardclock_period);
+	for (i = 0; i < count; i++)
+		hardclock(frame);
+}
+
+void
+clockintr_schedclock(struct clockintr *cl, void *unused)
+{
+	uint64_t count, i;
+	struct proc *p = curproc;
+
+	count = clockintr_advance(cl, schedclock_period);
+	if (p != NULL) {
+		for (i = 0; i < count; i++)
+			schedclock(p);
+	}
+}
+
+void
+clockintr_statclock(struct clockintr *cl, void *frame)
+{
+	uint64_t count, expiration, i, uptime;
+	uint32_t mask, min, off;
+	u_int gen;
+
+	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
+		do {
+			gen = statclock_gen;
+			membar_consumer();
+			min = statclock_min;
+			mask = statclock_mask;
+			membar_consumer();
+		} while (gen == 0 || gen != statclock_gen);
+		count = 0;
+		expiration = clockintr_expiration(cl);
+		uptime = clockintr_nsecuptime(cl);
+		while (expiration <= uptime) {
+			while ((off = (random() & mask)) == 0)
+				continue;
+			expiration += min + off;
+			count++;
+		}
+		clockintr_schedule(cl, expiration);
+	} else {
+		count = clockintr_advance(cl, statclock_avg);
+	}
+	for (i = 0; i < count; i++)
+		statclock(frame);
+}
+
 /*
  * Advance *next in increments of period until it exceeds now.
  * Returns the number of increments *next was advanced.
@@ -413,7 +509,7 @@ sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
 #include
 #include
 
-void db_show_clockintr(uint64_t, u_int, const char *);
+void db_show_clockintr(const struct clockintr *, u_int);
 void db_show_clockintr_cpu(struct cpu_info *);
 
 void
@@ -440,19 +536,24 @@ db_show_clockintr_cpu(struct cpu_info *ci)
 	struct clockintr_queue *cq = &ci->ci_queue;
 	u_int cpu = CPU_INFO_UNIT(ci);
 
-	db_show_clockintr(cq->cq_next_hardclock, cpu, "hardclock");
-	db_show_clockintr(cq->cq_next_statclock, cpu, "statclock");
-	if (ISSET(clockintr_flags, CL_SCHEDCLOCK))
-		db_show_clockintr(cq->cq_next_schedclock, cpu, "schedclock");
+	db_show_clockintr(cq->cq_hardclock, cpu);
+	db_show_clockintr(cq->cq_statclock, cpu);
+	if (cq->cq_schedclock != NULL)
+		db_show_clockintr(cq->cq_schedclock, cpu);
 }
 
 void
-db_show_clockintr(uint64_t expiration, u_int cpu, const char *name)
+db_show_clockintr(const struct clockintr *cl, u_int cpu)
 {
 	struct timespec ts;
-
-	NSEC_TO_TIMESPEC(expiration, &ts);
-	db_printf("%10lld.%09ld %3u %s\n", ts.tv_sec, ts.tv_nsec, cpu, name);
+	char *name;
+	db_expr_t offset;
+
+	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
+	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
+	if (name == NULL)
+		name = "?";
+	db_printf("%10lld.%09ld %3u %s\n", ts.tv_sec, ts.tv_nsec, cpu, name);
 }
 
 #endif /* DDB */
diff --git a/sys/sys/clockintr.h b/sys/sys/clockintr.h
index 8021d6c0afd..09a8a8b60f0 100644
--- a/sys/sys/clockintr.h
+++ b/sys/sys/clockintr.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: clockintr.h,v 1.1 2022/11/05 19:29:46 cheloha Exp $ */
+/*	$OpenBSD: clockintr.h,v 1.2 2023/02/26 23:00:42 cheloha Exp $ */
 /*
  * Copyright (c) 2020-2022 Scott Cheloha
  *
@@ -53,6 +53,21 @@ intrclock_trigger(struct intrclock *ic)
 	ic->ic_trigger(ic->ic_cookie);
 }
 
+/*
+ * Schedulable clock interrupt callback.
+ *
+ * Struct member protections:
+ *
+ *	I	Immutable after initialization.
+ *	o	Owned by a single CPU.
+ */
+struct clockintr_queue;
+struct clockintr {
+	uint64_t cl_expiration;		/* [o] dispatch time */
+	void (*cl_func)(struct clockintr *, void *);	/* [I] callback */
+	struct clockintr_queue *cl_queue;	/* [I] parent queue */
+};
+
 /*
  * Per-CPU clock interrupt state.
  *
@@ -62,10 +77,11 @@ intrclock_trigger(struct intrclock *ic)
  *	o	Owned by a single CPU.
  */
 struct clockintr_queue {
+	uint64_t cq_uptime;		/* [o] cached uptime */
 	uint64_t cq_next;		/* [o] next event expiration */
-	uint64_t cq_next_hardclock;	/* [o] next hardclock expiration */
-	uint64_t cq_next_schedclock;	/* [o] next schedclock expiration */
-	uint64_t cq_next_statclock;	/* [o] next statclock expiration */
+	struct clockintr *cq_hardclock;	/* [o] hardclock handle */
+	struct clockintr *cq_schedclock;/* [o] schedclock handle, if any */
+	struct clockintr *cq_statclock;	/* [o] statclock handle */
 	struct intrclock cq_intrclock;	/* [I] local interrupt clock */
 	struct clockintr_stat cq_stat;	/* [o] dispatch statistics */
 	volatile u_int cq_gen;		/* [o] cq_stat update generation */
-- 
2.20.1
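
Editor's note: to make the kernel-facing API described in the commit message
concrete, here is a minimal usage sketch modeled on the wrappers in the patch
(clockintr_hardclock() and the scheduling done in clockintr_cpu_init()). The
names example_period, example_func, and example_init are invented for
illustration; only the clockintr_* calls come from this patch. At this point
in the series the interface is still private to kern_clockintr.c and
clockintr_dispatch() only runs the three wrapped clocks, so a caller like this
only becomes possible once the prototypes move into <sys/clockintr.h> and the
sorted queue lands.

	/*
	 * Hypothetical example, not part of the patch: a periodic callback
	 * built on the API added above.
	 */
	uint64_t example_period = 100000000;	/* 100ms, in nanoseconds */

	void
	example_func(struct clockintr *cl, void *frame)
	{
		uint64_t count, i;

		/*
		 * Reschedule first: roll cl_expiration forward in
		 * example_period steps until it passes the parent queue's
		 * cached uptime.  The return value is the number of
		 * periods that have elapsed.
		 */
		count = clockintr_advance(cl, example_period);
		for (i = 0; i < count; i++) {
			/* ... do the periodic work once per elapsed period ... */
		}
	}

	void
	example_init(struct clockintr_queue *cq, uint64_t offset)
	{
		struct clockintr *cl;

		cl = clockintr_establish(cq, example_func);
		if (cl == NULL)
			panic("%s: failed to establish clock interrupt", __func__);

		/* Set the first expiration to an absolute uptime... */
		clockintr_schedule(cl, offset);

		/* ...then advance it onto the period, as clockintr_cpu_init() does. */
		clockintr_advance(cl, example_period);
	}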