From dbc0d5073cdfa0d7591e0ef6300f51201a8d6afa Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@cloudius-systems.com>
Date: Tue, 26 Nov 2013 11:39:18 +0200
Subject: [PATCH] sched: New scheduler algorithm

This patch replaces the algorithm which the scheduler uses to keep track
of threads' runtime, and to choose which thread to run next and for how
long.

The previous algorithm used the raw cumulative runtime of a thread as its
runtime measure. But comparing these numbers directly was impossible:
e.g., should a thread that slept for an hour now get an hour of
uninterrupted CPU time? This resulted in a hodgepodge of heuristics which
"modified" and "fixed" the runtime. These heuristics did work quite well
in our test cases, but we were forced to add more and more unjustified
heuristics and constants to fix scheduling bugs as they were discovered.
The existing scheduler was especially problematic with thread migration
(moving a thread from one CPU to another), as the runtime measure on one
CPU was meaningless on another. This bug, if not corrected (e.g., by the
patch which I sent a month ago), can cause crucial threads to acquire
exceedingly high runtimes by mistake, and resulted in the tst-loadbalance
test using only one CPU on a two-CPU guest.

The new scheduling algorithm follows a much more rigorous design,
proposed by Avi Kivity in:
https://docs.google.com/document/d/1W7KCxOxP-1Fy5EyF2lbJGE2WuKmu5v0suYqoHas1jRM/edit?usp=sharing

To make a long story short (read the document if you want all the
details), the new algorithm is based on a runtime measure R which is the
running decaying average of the thread's running time. It is a decaying
average in the sense that the thread's act of running or sleeping in
recent history is given more weight than its behavior a long time ago.
This measure R can tell us which of the runnable threads to run next (the
one with the lowest R), and using some high-school-level mathematics, we
can calculate for how long to run this thread until it should be
preempted by the next one. R carries the same meaning on all CPUs, so CPU
migration becomes trivial.

The actual implementation uses a normalized version of R, called R''
(Rtt in the code), which is also explained in detail in the document.
This Rtt allows updating just the running thread's runtime - not all
threads' runtime - as time passes, making the whole calculation much
more tractable.

The benefits of the new scheduler code over the existing one are:

1. A more rigorous design with fewer unjustified heuristics.

2. A thread's runtime measurement correctly survives a migration to a
   different CPU, unlike the existing code (which sometimes botches it
   up, leading to threads hanging). In particular, tst-loadbalance now
   gives good results for the "intermittent thread" test, unlike the
   previous code which in 50% of the runs caused one CPU to be
   completely wasted (when the load-balancing thread hung).

3. The new algorithm can look at a much longer runtime history than the
   previous algorithm did. With the default tau=200ms, the one-cpu
   intermittent thread test of tst-scheduler now provides good fairness
   for sleep durations of 1ms-32ms. The previous algorithm was never
   fair in any of those tests.

4. The new algorithm is more deterministic in its use of timers (with
   thyst=2_ms: up to 500 timers a second), resulting in less varied
   performance in high-context-switch benchmarks like tst-ctxsw.

This scheduler does very well on the fairness tests tst-scheduler and
fairly well on tst-loadbalance. Even better performance on that second
test will require an additional patch for the idle thread to wake other
cpus' load-balancing threads.

As expected, the new scheduler is somewhat slower than the existing one
(as we now do some relatively complex calculations instead of trivial
integer operations), but thanks to using approximations when possible and
to various other optimizations, the difference is relatively small: on my
laptop, tst-ctxsw.so, which measures "context switch" time (actually also
including the time to use the mutex and condvar which this test uses to
cause context switching), on the "colocated" test I measured 355 ns with
the old scheduler and 382 ns with the new scheduler - meaning that the
new scheduler adds 27 ns of overhead to every context switch. To see that
this penalty is minor, consider that tst-ctxsw is an extreme example,
doing 3 million context switches a second, and even there it only slows
down the workload by 7%.
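For reference, here is a minimal, self-contained sketch of the arithmetic
involved (an illustration only, with made-up names - it is not code from
this patch). It applies the normalized update rule, in which the per-CPU
constant c grows by e^(dt/tau) while only the running thread is charged
R'' += priority * (c_new - c_old), and then uses the "time until" formula
t = tau * ln(1 + (R''_target - R'') / (priority * c)). With one thread
monopolizing the CPU and another waking up fresh, it reproduces the
ln2*tau figure mentioned above:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double tau = 200e-3; // 200ms of history, as in this patch
        double c = 1e-9;           // per-CPU normalization constant

        // Thread B monopolizes the CPU for 10 seconds, in 1ms ticks.
        // Only the running thread is charged: R'' += p * (c_new - c_old).
        const double p = 1.0;      // default priority
        double rtt_b = 0;          // B's normalized runtime, R''
        for (int i = 0; i < 10000; i++) {
            double cnew = c * std::exp(1e-3 / tau);
            rtt_b += p * (cnew - c);
            c = cnew;
        }

        // Thread A wakes up with R'' = 0. How long may it run before its
        // runtime catches up with B's?
        double rtt_a = 0;
        double t = tau * std::log(1 + (rtt_b - rtt_a) / (p * c));
        std::printf("A runs for %.1f ms before B's turn (ln2*tau = %.1f ms)\n",
                    t * 1e3, std::log(2.0) * tau * 1e3);
    }

Both printed values come out at roughly 138.6 ms, i.e., ln2*tau for
tau=200ms.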
Signed-off-by: Nadav Har'El <nyh@cloudius-systems.com>
Signed-off-by: Avi Kivity <avi@cloudius-systems.com>
---
 core/sched.cc    | 404 ++++++++++++++++++++++++++++++++++++++---------
 include/sched.hh |  97 +++++++++++-
 2 files changed, 421 insertions(+), 80 deletions(-)

diff --git a/core/sched.cc b/core/sched.cc
index eb1b36b87..d3175d9b1 100644
--- a/core/sched.cc
+++ b/core/sched.cc
@@ -28,7 +28,7 @@ extern char _percpu_start[], _percpu_end[];
 
 namespace sched {
 
-TRACEPOINT(trace_sched_switch, "to %p vold=%d vnew=%d", thread*, s64, s64);
+TRACEPOINT(trace_sched_switch, "to %p vold=%g vnew=%g", thread*, float, float);
 TRACEPOINT(trace_sched_wait, "");
 TRACEPOINT(trace_sched_wake, "wake %p", thread*);
 TRACEPOINT(trace_sched_migrate, "thread=%p cpu=%d", thread*, unsigned);
@@ -50,10 +50,65 @@ elf::tls_data tls;
 
 inter_processor_interrupt wakeup_ipi{[] {}};
 
-constexpr s64 vruntime_bias = 4_ms;
-constexpr s64 max_slice = 10_ms;
+// "tau" controls the length of the history we consider for scheduling,
+// or more accurately the rate of decay of an exponential moving average.
+// In particular, it can be seen that if a thread has been monopolizing the
+// CPU, and a long-sleeping thread wakes up (or a new thread is created),
+// the new thread will get to run for ln2*tau (ln2 is roughly 0.7).
+constexpr s64 tau = 200_ms;
+
+// "thyst" controls the hysteresis algorithm which temporarily gives a
+// running thread some extra runtime before preempting it. We subtract thyst
+// when the thread is switched in, and add it back when the thread is switched
+// out. In particular, it can be shown that when two cpu-busy threads at equal
+// priority compete, they will alternate at time-slices of 2*thyst; also,
+// the distance between two preemption interrupts cannot be lower than thyst.
+constexpr s64 thyst = 2_ms;
+
 constexpr s64 context_switch_penalty = 10_us;
 
+constexpr float cmax = 0x1P63;
+constexpr float cinitial = 0x1P-63;
+
+static inline float exp_tau(s64 t) {
+    // return expf((float)t/(float)tau);
+    // Approximate e^x by the much faster 1+x for x<0.001 (the error is O(x^2)).
+    // Further speed up by comparing and adding integers as much as we can:
+    static constexpr int m = tau / 1000;
+    static constexpr float invtau = 1.0f / tau;
+    if (t < m && t > -m)
+        return (tau + t) * invtau;
+    else
+        return expf(t * invtau);
+}
+
+// fastlog2() is an approximation of log2, designed for speed over accuracy
+// (it is accurate to roughly 5 digits).
+// The function is copyright (C) 2012 Paul Mineiro, released under the
+// BSD license. See https://code.google.com/p/fastapprox/.
+static inline float
+fastlog2 (float x)
+{
+    union { float f; u32 i; } vx = { x };
+    union { u32 i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
+    float y = vx.i;
+    y *= 1.1920928955078125e-7f;
+    return y - 124.22551499f - 1.498030302f * mx.f
+             - 1.72587999f / (0.3520887068f + mx.f);
+}
+
+static inline float taulog(float f) {
+    //return tau * logf(f);
+    // We don't need the full accuracy of logf - we use this in time_until(),
+    // where it's fine to overshoot, even significantly, the correct time
+    // because a thread running a bit too much will "pay" in runtime.
+    // We multiply by 1.01 to ensure overshoot, not undershoot.
+    static constexpr float tau2 = tau * 0.69314718f * 1.01;
+    return tau2 * fastlog2(f);
+}
+
+static constexpr runtime_t inf = std::numeric_limits<runtime_t>::infinity();
+
 mutex cpu::notifier::_mtx;
 std::list<cpu::notifier*> cpu::notifier::_notifiers
     __attribute__((init_priority((int)init_prio::notifiers)));
@@ -80,6 +135,8 @@ cpu::cpu(unsigned _id)
     , idle_thread()
     , terminating_thread(nullptr)
     , running_since(clock::get()->time())
+    , c(cinitial)
+    , renormalize_count(0)
 {
     auto pcpu_size = _percpu_end - _percpu_start;
     // We want the want the per-cpu area to be aligned as the most strictly
@@ -96,7 +153,7 @@ cpu::cpu(unsigned _id)
 void cpu::init_idle_thread()
 {
     idle_thread = new thread([this] { idle(); }, thread::attr(this));
-    idle_thread->_vruntime = std::numeric_limits<s64>::max();
+    idle_thread->set_priority(thread::priority_idle);
 }
 
 void cpu::schedule()
@@ -106,76 +163,120 @@ void cpu::schedule()
     }
 }
 
+// In the x86 ABI, the FPU state is callee-saved, meaning that a program must
+// not call a function in the middle of an FPU calculation. But if we get a
+// preemption, i.e., the scheduler is called by an interrupt, the currently
+// running thread might be in the middle of a floating point calculation,
+// which we must not trample over.
+// When the scheduler code itself doesn't use the FPU (!scheduler_uses_fpu),
+// we need to save the running thread's FPU state only before switching to
+// a different thread (and restore this thread's FPU state when coming back
+// to this thread). However, if the scheduler itself uses the FPU in its
+// calculations (scheduler_uses_fpu), we always (when in preemption) need
+// to save and restore this thread's FPU state when we enter and exit the
+// scheduler.
+#ifdef RUNTIME_PSEUDOFLOAT
+static constexpr bool scheduler_uses_fpu = false;
+#else
+static constexpr bool scheduler_uses_fpu = true;
+#endif
+
 void cpu::reschedule_from_interrupt(bool preempt)
 {
+    if (scheduler_uses_fpu && preempt) {
+        thread::current()->_fpu.save();
+    }
+
     need_reschedule = false;
     handle_incoming_wakeups();
     auto now = clock::get()->time();
-    thread* p = thread::current();
-    // avoid cycling through the runqueue if p still has the highest priority
-    auto bias = vruntime_bias;
-    s64 current_run = now - running_since;
-    if (p->_vruntime + current_run < 0) { // overflow (idle thread)
-        current_run = 0;
-    }
-    if (current_run > max_slice) {
-        // This thread has run for a long time, or clock:time() jumped. But if
-        // we increase vruntime by the full amount, this thread might go into
-        // a huge cpu time debt and won't be scheduled again for a long time.
-        // So limit the vruntime increase.
-        current_run = max_slice;
-    }
-    if (p->_status == thread::status::running
-        && (runqueue.empty()
-            || p->_vruntime + current_run < runqueue.begin()->_vruntime + bias)) {
-        update_preemption_timer(p, now, current_run);
-        return;
+
+    auto interval = now - running_since;
+    running_since = now;
+    if (interval == 0) {
+        // During startup, the clock may be stuck and we get zero intervals.
+        // To avoid scheduler loops, let's make it non-zero.
+        interval = context_switch_penalty;
     }
-    p->_vruntime += current_run;
-    if (p->_status == thread::status::running) {
+    thread* p = thread::current();
+
+    const auto p_status = p->_status.load();
+    assert(p_status != thread::status::queued);
+
+    p->_runtime.ran_for(interval);
+
+    if (p_status == thread::status::running) {
+        // The current thread is still runnable. Check if it still has the
+        // lowest runtime, and update the timer until the next thread's turn.
+        if (runqueue.empty()) {
+            preemption_timer.cancel();
+            if (scheduler_uses_fpu && preempt) {
+                p->_fpu.restore();
+            }
+            return;
+        } else {
+            auto &t = *runqueue.begin();
+            if (p->_runtime.get_local() < t._runtime.get_local()) {
+                preemption_timer.cancel();
+                auto delta = p->_runtime.time_until(t._runtime.get_local());
+                if (delta > 0) {
+                    preemption_timer.set(now + delta);
+                }
+                if (scheduler_uses_fpu && preempt) {
+                    p->_fpu.restore();
+                }
+                return;
+            }
+        }
+        // If we're here, p no longer has the lowest runtime. Before queuing
+        // p, return the runtime it borrowed for hysteresis.
+        p->_runtime.hysteresis_run_stop();
         p->_status.store(thread::status::queued);
         enqueue(*p);
+    } else {
+        // p is no longer running, so we'll switch to a different thread.
+        // Return the runtime p borrowed for hysteresis.
+        p->_runtime.hysteresis_run_stop();
     }
+
     auto ni = runqueue.begin();
     auto n = &*ni;
     runqueue.erase(ni);
-    running_since = now;
     assert(n->_status.load() == thread::status::queued);
+    trace_sched_switch(n, p->_runtime.get_local(), n->_runtime.get_local());
     n->_status.store(thread::status::running);
-    if (n != thread::current()) {
-        if (preempt) {
-            trace_sched_preempt();
+    n->_runtime.hysteresis_run_start();
+
+    assert(n!=p);
+
+    if (preempt) {
+        trace_sched_preempt();
+        if (!scheduler_uses_fpu) {
+            // If runtime is not a float, we only need to save the FPU here,
+            // just when deciding to switch threads.
             p->_fpu.save();
         }
     }
-        if (p->_status.load(std::memory_order_relaxed) == thread::status::queued
-            && p != idle_thread) {
-            n->_vruntime += context_switch_penalty;
-        }
-        trace_sched_switch(n, p->_vruntime, n->_vruntime);
-        update_preemption_timer(n, now, 0);
-        n->switch_to();
-        if (preempt) {
-            p->_fpu.restore();
-        }
-        if (p->_cpu->terminating_thread) {
-            p->_cpu->terminating_thread->unref();
-            p->_cpu->terminating_thread = nullptr;
-        }
-    }
-}
-
-void cpu::update_preemption_timer(thread* current, s64 now, s64 run)
-{
+    if (p->_status.load(std::memory_order_relaxed) == thread::status::queued
+        && p != idle_thread) {
+        n->_runtime.add_context_switch_penalty();
+    }
     preemption_timer.cancel();
-    if (runqueue.empty()) {
-        return;
+    if (!runqueue.empty()) {
+        auto& t = *runqueue.begin();
+        auto delta = n->_runtime.time_until(t._runtime.get_local());
+        if (delta > 0) {
+            preemption_timer.set(now + delta);
+        }
     }
-    auto& t = *runqueue.begin();
-    auto delta = t._vruntime - (current->_vruntime + run);
-    auto expire = now + delta + vruntime_bias;
-    if (expire > 0) {
-        // avoid idle thread related overflow
-        preemption_timer.set(expire);
+    n->switch_to();
+    if (p->_cpu->terminating_thread) {
+        p->_cpu->terminating_thread->unref();
+        p->_cpu->terminating_thread = nullptr;
+    }
+
+    if (preempt) {
+        p->_fpu.restore();
     }
 }
 
@@ -270,6 +371,11 @@ void cpu::handle_incoming_wakeups()
                 t._status.store(thread::status::running);
             } else {
                 t._status.store(thread::status::queued);
+                // Make sure the CPU-local runtime measure is suitably
+                // normalized. We may need to convert a global value to the
+                // local value when waking up after a CPU migration, or to
+                // perform renormalizations which we missed while sleeping.
+                t._runtime.update_after_sleep();
                 enqueue(t);
                 t.resume_timers();
             }
@@ -278,23 +384,9 @@ void cpu::handle_incoming_wakeups()
     }
 }
 
-void cpu::enqueue(thread& t, bool waking)
+void cpu::enqueue(thread& t)
 {
     trace_sched_queue(&t);
-    if (waking) {
-        // If a waking thread has a really low vruntime, allow it only
-        // one extra timeslice; otherwise it would dominate the runqueue
-        // and starve out other threads
-        auto current = thread::current();
-        if (current != idle_thread) {
-            auto head = current->_vruntime - max_slice;
-            t._vruntime = std::max(t._vruntime, head);
-        }
-    }
-    // special treatment for idle thread: make sure it is in the back of the queue
-    if (&t == idle_thread) {
-        t._vruntime = thread::max_vruntime;
-    }
     runqueue.insert_equal(t);
 }
 
@@ -343,6 +435,9 @@ void cpu::load_balance()
         mig._status.store(thread::status::waking);
         mig.suspend_timers();
        mig._cpu = min;
+        // Convert the CPU-local runtime measure to a globally meaningful
+        // measure
+        mig._runtime.export_runtime();
         mig.remote_thread_local_var(::percpu_base) = min->percpu_base;
         mig.remote_thread_local_var(current_cpu) = min;
         min->incoming_wakeups[id].push_front(mig);
@@ -393,14 +488,28 @@ void thread::yield()
     if (t->_cpu->runqueue.empty()) {
         return;
     }
-    // TODO: need to give up some vruntime (move to borrow) so we're last
-    // on the queue, and then we can use push_back()
-    t->_cpu->runqueue.insert_equal(*t);
     assert(t->_status.load() == status::running);
-    t->_status.store(status::queued);
+    // Do not yield to a thread with idle priority
+    thread &tnext = *(t->_cpu->runqueue.begin());
+    if (tnext.priority() == thread::priority_idle) {
+        return;
+    }
+    t->_runtime.set_local(tnext._runtime);
+    // Note that reschedule_from_interrupt will further increase t->_runtime
+    // by thyst, giving the other thread 2*thyst to run before going back to t
     t->_cpu->reschedule_from_interrupt(false);
 }
 
+void thread::set_priority(float priority)
+{
+    _runtime.set_priority(priority);
+}
+
+float thread::priority() const
+{
+    return _runtime.priority();
+}
+
 thread::stack_info::stack_info()
     : begin(nullptr), size(0), deleter(nullptr)
 {
@@ -444,8 +553,8 @@ T& thread::remote_thread_local_var(T& var)
 thread::thread(std::function<void ()> func, attr attr, bool main)
     : _func(func)
     , _status(status::unstarted)
+    , _runtime(thread::priority_default)
     , _attr(attr)
-    , _vruntime((main || !s_current) ? 0 : current()->_vruntime)
     , _ref_counter(1)
     , _joiner()
 {
@@ -945,6 +1054,151 @@ void init_tls(elf::tls_data tls_data)
     tls = tls_data;
 }
 
+// For a description of the algorithms behind the thread_runtime::*
+// implementation, please refer to:
+// https://docs.google.com/document/d/1W7KCxOxP-1Fy5EyF2lbJGE2WuKmu5v0suYqoHas1jRM
+
+void thread_runtime::export_runtime()
+{
+    _Rtt /= cpu::current()->c;
+    _renormalize_count = -1; // special signal to update_after_sleep()
+}
+
+void thread_runtime::update_after_sleep()
+{
+    auto cpu_renormalize_count = cpu::current()->renormalize_count;
+    if (_renormalize_count == cpu_renormalize_count) {
+        return;
+    }
+    if (_renormalize_count == -1) {
+        // export_runtime() was used to convert the CPU-local runtime to
+        // a global value. We need to convert it back to a local value,
+        // suitable for this CPU.
+        _Rtt *= cpu::current()->c;
+    } else if (_renormalize_count + 1 == cpu_renormalize_count) {
+        _Rtt *= cinitial / cmax;
+    } else if (_Rtt != inf) {
+        // We need to divide by cmax^2 or even a higher power. We assume
+        // this will bring Rtt to zero anyway, so there is no sense in
+        // doing an accurate calculation.
+        _Rtt = 0;
+    }
+    _renormalize_count = cpu_renormalize_count;
+}
+
+void thread_runtime::ran_for(s64 time)
+{
+    assert (_priority > 0);
+    assert (time >= 0);
+
+    cpu *curcpu = cpu::current();
+
+    // When a thread is created, it gets _Rtt = 0, so its _renormalize_count
+    // is irrelevant, and couldn't be set correctly in the constructor.
+    // So set it here.
+    if (!_Rtt) {
+        _renormalize_count = curcpu->renormalize_count;
+    }
+
+    const auto cold = curcpu->c;
+    const auto cnew = cold * exp_tau(time);
+
+    // During our boot process, unfortunately clock::time() jumps by the
+    // amount of host uptime, which can be huge and cause the above
+    // calculation to overflow. In that case, just ignore this time period.
+    if (cnew == inf) {
+        return;
+    }
+    curcpu->c = cnew;
+
+    _Rtt += _priority * (cnew - cold);
+
+    assert (_renormalize_count != -1); // forgot to update_after_sleep?
+
+    // As time goes by, the normalization constant c grows towards infinity.
+    // To avoid an overflow, we need to renormalize if c becomes too big.
+    // We only renormalize the runtime of the running or runnable threads.
+    // Sleeping threads will be renormalized when they wake
+    // (see update_after_sleep()), depending on the number of renormalization
+    // steps they have missed (this is why we need to keep a counter).
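+    // (Renormalizing multiplies c and each runnable thread's _Rtt by the
+    // same factor, cinitial/cmax, so the relative order of runtimes - and
+    // therefore the order of the runqueue - is preserved.)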
+    if (cnew < cmax) {
+        return;
+    }
+    if (++curcpu->renormalize_count < 0) {
+        // Don't use negative values (we use -1 to mark export_runtime())
+        curcpu->renormalize_count = 0;
+    }
+    _Rtt *= cinitial / cmax;
+    _renormalize_count = curcpu->renormalize_count;
+    for (auto &t : curcpu->runqueue) {
+        if (t._runtime._renormalize_count >= 0) {
+            t._runtime._Rtt *= cinitial / cmax;
+            t._runtime._renormalize_count++;
+        }
+    }
+    curcpu->c *= cinitial / cmax;
+}
+
+const auto hysteresis_mul_exp_tau = exp_tau(thyst);
+const auto hysteresis_div_exp_tau = exp_tau(-thyst);
+const auto penalty_exp_tau = exp_tau(context_switch_penalty);
+
+void thread_runtime::hysteresis_run_start()
+{
+    // Optimized version of ran_for(-thyst);
+    if (!_Rtt) {
+        _renormalize_count = cpu::current()->renormalize_count;
+    }
+    const auto cold = cpu::current()->c;
+    const auto cnew = cold * hysteresis_div_exp_tau;
+    cpu::current()->c = cnew;
+    if (_priority == inf) {
+        // TODO: the only reason we need this case is so that time<0
+        // will bring us to +inf, not -inf. Think if there's a cleaner
+        // alternative to doing this if.
+        _Rtt = inf;
+    } else {
+        _Rtt += _priority * (cnew - cold);
+    }
+}
+
+void thread_runtime::hysteresis_run_stop()
+{
+    // Optimized version of ran_for(thyst);
+    if (!_Rtt) {
+        _renormalize_count = cpu::current()->renormalize_count;
+    }
+    const auto cold = cpu::current()->c;
+    const auto cnew = cold * hysteresis_mul_exp_tau;
+    cpu::current()->c = cnew;
+    _Rtt += _priority * (cnew - cold);
+}
+
+void thread_runtime::add_context_switch_penalty()
+{
+    // Does the same as: ran_for(context_switch_penalty);
+    const auto cold = cpu::current()->c;
+    const auto cnew = cold * penalty_exp_tau;
+    cpu::current()->c = cnew;
+    _Rtt += _priority * (cnew - cold);
+}
+
+s64 thread_runtime::time_until(runtime_t target_local_runtime) const
+{
+    if (_priority == inf) {
+        return -1;
+    }
+    if (target_local_runtime == inf) {
+        return -1;
+    }
+    auto ret = taulog(runtime_t(1) +
+            (target_local_runtime - _Rtt) / _priority / cpu::current()->c);
+    if (ret > (runtime_t)std::numeric_limits<s64>::max())
+        return -1;
+    return (s64) ret;
+}
+
 }
 
 irq_lock_type irq_lock;
diff --git a/include/sched.hh b/include/sched.hh
index 0d91d8bac..97e08967f 100644
--- a/include/sched.hh
+++ b/include/sched.hh
@@ -22,6 +22,14 @@
 #include <memory>
 #include <vector>
 
+// If RUNTIME_PSEUDOFLOAT, runtime_t is a pseudofloat<>. Otherwise, a float.
+#undef RUNTIME_PSEUDOFLOAT
+#ifdef RUNTIME_PSEUDOFLOAT
+#include <osv/pseudofloat.hh>
+#else
+typedef float runtime_t;
+#endif
+
 extern "C" {
 void smp_main();
 };
@@ -175,6 +183,79 @@ public:
     explicit timer(thread& t);
 };
 
+// thread_runtime is used to maintain the scheduler's view of the thread's
+// priority relative to other threads. It knows about the static priority of
+// the thread (allowing a certain thread to get more runtime than other
+// threads) and is used to maintain the "runtime" of each thread, a number
+// which the scheduler uses to decide which thread to run next, and for how
+// long.
+// All methods of this class should be called only from within the scheduler.
+// https://docs.google.com/document/d/1W7KCxOxP-1Fy5EyF2lbJGE2WuKmu5v0suYqoHas1jRM
+class thread_runtime {
+public:
+    // Get the thread's CPU-local runtime, a number used to sort the runqueue
+    // on this CPU (lowest runtime will be run first). Local runtime cannot be
+    // compared between different CPUs - see export_runtime().
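+    // (This is the normalized measure called R'' in the design document.)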
+    inline runtime_t get_local() const
+    {
+        return _Rtt;
+    }
+    // Convert the thread's CPU-local runtime to a global scale which can be
+    // understood on any CPU. Use this function when migrating the thread to a
+    // different CPU, and the destination CPU should run update_after_sleep().
+    void export_runtime();
+    // Update the thread's local runtime after a sleep, when we potentially
+    // missed one or more renormalization steps (which were only done to
+    // runnable threads), or need to convert global runtime to local runtime.
+    void update_after_sleep();
+    // Increase the thread's runtime considering that it has now run for "time"
+    // nanoseconds at the current priority.
+    // Remember that the run queue is ordered by local runtime, so never call
+    // ran_for() or hysteresis_*() on a thread which is already in the queue.
+    void ran_for(s64 time);
+    // Temporarily decrease the running thread's runtime to provide hysteresis
+    // (avoid switching threads quickly after deciding on one).
+    // Use hysteresis_run_start() when switching to a thread, and
+    // hysteresis_run_stop() when switching away from a thread.
+    void hysteresis_run_start();
+    void hysteresis_run_stop();
+    void add_context_switch_penalty();
+    // Given a target local runtime higher than our own, calculate how much
+    // time (in nanoseconds) it would take until ran_for(time) would bring our
+    // thread to the given target. Returns -1 if the time is too long to
+    // express in s64.
+    s64 time_until(runtime_t target_local_runtime) const;
+
+    void set_priority(runtime_t priority) {
+        _priority = priority;
+    }
+
+    runtime_t priority() const {
+        return _priority;
+    }
+
+    // Set runtime from another thread's runtime. The other thread must
+    // be on the same CPU's runqueue.
+    void set_local(thread_runtime &other) {
+        _Rtt = other._Rtt;
+        _renormalize_count = other._renormalize_count;
+    }
+
+    // When _Rtt=0, multiplicative normalization doesn't matter, so it doesn't
+    // matter what we set for _renormalize_count. We can't set it properly
+    // in the constructor (it doesn't run from the scheduler, or know which
+    // CPU's counter to copy), so we'll fix it in ran_for().
+    constexpr thread_runtime(runtime_t priority) :
+        _priority(priority),
+        _Rtt(0), _renormalize_count(-1) { };
+
+private:
+    runtime_t _priority;        // p in the document
+    runtime_t _Rtt;             // R'' in the document
+    // If _renormalize_count == -1, it means the runtime is global
+    // (i.e., export_runtime() was called, or this is a new thread).
+    int _renormalize_count;
+};
+
 class thread : private timer_base::client {
 public:
     struct stack_info {
@@ -218,6 +299,10 @@ public:
     void* get_tls(ulong module);
     void* setup_tls(ulong module, const void* tls_template, size_t init_size, size_t uninit_size);
+    void set_priority(float priority);
+    static constexpr float priority_idle = std::numeric_limits<float>::infinity();
+    static constexpr float priority_default = 1.0;
+    float priority() const;
 private:
     void main();
     void switch_to();
@@ -256,13 +341,12 @@ private:
         terminated,
     };
     std::atomic<status> _status;
+    thread_runtime _runtime;
     attr _attr;
     cpu* _cpu;
     arch_thread _arch;
     arch_fpu _fpu;
     unsigned long _id;
-    s64 _vruntime;
-    static const s64 max_vruntime = std::numeric_limits<s64>::max();
     std::function<void ()> _cleanup;
     // When _ref_counter reaches 0, the thread can be deleted.
     // Starts with 1, decremented by complete() and also temporarily modified
@@ -278,6 +362,7 @@ private:
     friend class timer;
     friend class thread_runtime_compare;
     friend struct arch_cpu;
+    friend class thread_runtime;
     friend void ::smp_main();
     friend void ::smp_launch();
     friend void init(std::function<void ()> cont);
@@ -319,7 +404,7 @@ private:
 class thread_runtime_compare {
 public:
     bool operator()(const thread& t1, const thread& t2) const {
-        return t1._vruntime < t2._vruntime;
+        return t1._runtime.get_local() < t2._runtime.get_local();
     }
 };
 
@@ -362,11 +447,13 @@ struct cpu : private timer_base::client {
     void load_balance();
     unsigned load();
     void reschedule_from_interrupt(bool preempt = false);
-    void enqueue(thread& t, bool waking = false);
+    void enqueue(thread& t);
    void init_idle_thread();
-    void update_preemption_timer(thread* current, s64 now, s64 run);
     virtual void timer_fired() override;
     class notifier;
+    // For scheduler:
+    runtime_t c;
+    int renormalize_count;
 };
 
 class cpu::notifier {
-- 
GitLab
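
A quick, standalone way to sanity-check the two approximation claims made in
the patch comments (e^x ~ 1+x for |x| < 0.001, with error O(x^2), and
fastlog2() being accurate to roughly 5 digits) is a small test program along
these lines. It is an illustration only, not part of the patch; it copies
fastlog2() as it appears above and compares both approximations against the
standard library:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Copied from the patch: fast base-2 logarithm, (C) 2012 Paul Mineiro, BSD.
    static inline float fastlog2(float x)
    {
        union { float f; uint32_t i; } vx = { x };
        union { uint32_t i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
        float y = vx.i;
        y *= 1.1920928955078125e-7f;
        return y - 124.22551499f - 1.498030302f * mx.f
                 - 1.72587999f / (0.3520887068f + mx.f);
    }

    int main()
    {
        // e^x vs. its linearization 1+x, for |x| < 0.001 (exp_tau's fast path).
        double max_exp_err = 0;
        for (double x = -0.001; x <= 0.001; x += 1e-6) {
            max_exp_err = std::fmax(max_exp_err, std::fabs((1 + x) - std::exp(x)));
        }

        // fastlog2() vs. log2() over several decades of positive inputs.
        double max_log_err = 0;
        for (float x = 1e-3f; x < 1e6f; x *= 1.001f) {
            max_log_err = std::fmax(max_log_err,
                                    (double)std::fabs(fastlog2(x) - std::log2(x)));
        }

        std::printf("max |(1+x) - e^x| for |x| < 0.001: %g\n", max_exp_err);
        std::printf("max |fastlog2(x) - log2(x)|:       %g\n", max_log_err);
    }

For the linearization, the worst-case error is x^2/2, about 5e-7 at the edges
of the range, consistent with the O(x^2) note in exp_tau().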