diff --git a/bootfs.manifest b/bootfs.manifest
index a95bdbc1fe54d111d76a7bcf57a0a8cf178d5c0b..5cd8270c7251229cd996ac06c1e369f1f10ff975 100644
--- a/bootfs.manifest
+++ b/bootfs.manifest
@@ -90,6 +90,7 @@
 /&/tests/tst-solaris-taskq.so: ./&
 /&/tests/tst-vfs.so: ./&
 /&/tests/tst-yield.so: ./&
+/&/tests/tst-ctxsw.so: ./&
 /testrunner.so: ./tests/testrunner.so
 /java/Hello.class: ./tests/hello/Hello.class
 /java.so: java/java.so
diff --git a/build.mak b/build.mak
index abd0d137c8df7c8b04dd13e8054d2adaae55c6a8..89e9b3b6690bccb11f7d3661093bba8dc9c266c2 100644
--- a/build.mak
+++ b/build.mak
@@ -19,7 +19,7 @@ COMMON = $(autodepend) -g -Wall -Wno-pointer-arith -Werror -Wformat=0 \
 	-D __BSD_VISIBLE=1 -U _FORTIFY_SOURCE -fno-stack-protector $(INCLUDES) \
 	$(do-sys-includes) \
 	$(arch-cflags) $(conf-opt) $(acpi-defines) $(tracing-flags) \
-	$(configuration) -nostdinc
+	$(configuration) -nostdinc -D__OSV__
 
 tracing-flags-0 =
 tracing-flags-1 = -finstrument-functions -finstrument-functions-exclude-file-list=c++,trace.cc,trace.hh,align.hh
@@ -120,6 +120,7 @@ tests += tests/tst-condvar.so
 tests += tests/tst-queue-mpsc.so
 tests += tests/tst-af-local.so
 tests += tests/tst-yield.so
+tests += tests/tst-ctxsw.so
 
 tests/hello/Hello.class: javabase=tests/hello
 
diff --git a/drivers/kvmclock.cc b/drivers/kvmclock.cc
index e175dfa067f84853e9db8be939979753ff9991ca..b3f06b9e629b95075985a832f5191bc9fe0e2e8d 100644
--- a/drivers/kvmclock.cc
+++ b/drivers/kvmclock.cc
@@ -4,6 +4,7 @@
 #include "mmu.hh"
 #include "string.h"
 #include "cpuid.hh"
+#include "barrier.hh"
 
 class kvmclock : public clock {
 private:
@@ -56,9 +57,9 @@ u64 kvmclock::wall_clock_boot()
     u64 w;
     do {
         v1 = _wall->version;
-        __sync_synchronize();
+        barrier();
         w = u64(_wall->sec) * 1000000000 + _wall->nsec;
-        __sync_synchronize();
+        barrier();
         v2 = _wall->version;
     } while (v1 != v2);
     return w;
@@ -70,7 +71,7 @@ u64 kvmclock::system_time()
     u64 time;
     do {
         v1 = _sys->version;
-        __sync_synchronize();
+        barrier();
         time = processor::rdtsc() - _sys->tsc_timestamp;
         if (_sys->tsc_shift >= 0) {
             time <<= _sys->tsc_shift;
@@ -82,7 +83,7 @@ u64 kvmclock::system_time()
                 : "rm"(u64(_sys->tsc_to_system_mul))
                 : "rdx");
         time += _sys->system_time;
-        __sync_synchronize();
+        barrier();
         v2 = _sys->version;
     } while (v1 != v2);
     return time;
diff --git a/include/sched.hh b/include/sched.hh
index dae74d3319e8350dcbf9516357eb2233a6e89a1a..cd5e995881cb11005700266d89e6b2b252daeff2 100644
--- a/include/sched.hh
+++ b/include/sched.hh
@@ -40,12 +40,12 @@ const unsigned max_cpus = sizeof(unsigned long) * 8;
 class cpu_set {
 public:
     explicit cpu_set() : _mask() {}
-    cpu_set(const cpu_set& other) : _mask(other._mask.load()) {}
+    cpu_set(const cpu_set& other) : _mask(other._mask.load(std::memory_order_relaxed)) {}
     void set(unsigned c) {
-        _mask.fetch_or(1UL << c);
+        _mask.fetch_or(1UL << c, std::memory_order_release);
     }
     void clear(unsigned c) {
-        _mask.fetch_and(~(1UL << c));
+        _mask.fetch_and(~(1UL << c), std::memory_order_release);
     }
     class iterator;
     iterator begin() {
@@ -56,7 +56,9 @@ public:
     }
     cpu_set fetch_clear() {
         cpu_set ret;
-        ret._mask = _mask.exchange(0);
+        if (_mask.load(std::memory_order_relaxed)) {
+            ret._mask = _mask.exchange(0, std::memory_order_acquire);
+        }
         return ret;
     }
     operator bool() const {
diff --git a/tests/tst-ctxsw.cc b/tests/tst-ctxsw.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c1af92f298382cd1b189f7fb282e956cdcc236d
--- /dev/null
+++ b/tests/tst-ctxsw.cc
@@ -0,0 +1,166 @@
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <pthread.h>
+#include <sys/time.h>
+#include <cinttypes>
+#include <stdio.h>
+
+#ifdef __OSV__
+
+#include <sched.hh>
+
+// OSv flavour of the benchmark thread wrapper. It keeps a callable and a
+// sched::thread::attr, and hands both to a sched::thread that is only
+// created when start() is called.
+class pinned_thread {
+public:
+    // Stores a copy of the callable; no thread exists yet.
+    explicit pinned_thread(std::function<void ()> f) : _f(f) {}
+
+    // Records sched::cpus[cpu] as pinned_cpu in the thread attributes.
+    void pin(unsigned cpu)
+    {
+        _attr.pinned_cpu = sched::cpus[cpu];
+    }
+
+    // Builds the sched::thread from the stored callable/attributes and starts it.
+    void start()
+    {
+        _thread.reset(new sched::thread(_f, _attr));
+        _thread->start();
+    }
+
+    // Joins the thread created by start().
+    void join()
+    {
+        _thread->join();
+    }
+
+private:
+    std::function<void ()> _f;
+    sched::thread::attr _attr;
+    std::unique_ptr<sched::thread> _thread;
+};
+
+#else
+
+#include <thread>
+#include <sched.h>
+
+// Host (non-OSv) flavour of the benchmark thread wrapper, built on
+// std::thread. pin() only records the request; the new thread applies it to
+// itself with sched_setaffinity() before running the callable.
+class pinned_thread {
+public:
+    explicit pinned_thread(std::function<void ()> f);  // stores a copy of f
+    void pin(unsigned cpu);   // remember the CPU; applied when the thread starts
+    void start();             // launch: pin self if requested, then run f
+    void join();              // wait for the thread created by start()
+private:
+    void do_pin();
+    std::function<void ()> _f;
+    bool _is_pinned = false;
+    unsigned _cpu = 0;        // only meaningful once _is_pinned is set
+    std::unique_ptr<std::thread> _thread;
+};
+
+pinned_thread::pinned_thread(std::function<void ()> f) : _f(f) {}
+
+void pinned_thread::pin(unsigned cpu)
+{
+    _is_pinned = true;
+    _cpu = cpu;
+}
+
+void pinned_thread::start()
+{
+    _thread.reset(new std::thread([this] { do_pin(); _f(); }));
+}
+
+// Applies the requested affinity to the calling thread. A failure is reported
+// via perror() rather than ignored: an unpinned run would be mislabelled.
+void pinned_thread::do_pin()
+{
+    if (_is_pinned) {
+        cpu_set_t cs;
+        CPU_ZERO(&cs);
+        CPU_SET(_cpu, &cs);
+        if (sched_setaffinity(0, sizeof(cs), &cs) != 0) {
+            perror("sched_setaffinity");
+        }
+    }
+}
+
+void pinned_thread::join() { _thread->join(); }
+
+#endif
+
+pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;   // held by run() while it reads/writes owner and remain
+pthread_cond_t cond = PTHREAD_COND_INITIALIZER;    // waited on in run() until owner matches; signalled after each hand-off
+unsigned owner;    // id of the thread whose turn it is (compared against `me` in run())
+unsigned remain;   // iterations left; set by test(), decremented in run()
+
+// Ping-pong worker: waits until `owner == me`, consumes one unit of `remain`
+// (or notes that none is left and stops), then hands the turn to `!me`.
+void run(unsigned me)
+{
+    for (bool finished = false; !finished; ) {
+        pthread_mutex_lock(&mtx);
+        while (owner != me) {
+            pthread_cond_wait(&cond, &mtx);
+        }
+        finished = (remain == 0);
+        if (!finished) {
+            --remain;
+        }
+        owner = !me;
+        pthread_mutex_unlock(&mtx);
+        pthread_cond_signal(&cond);   // the peer is woken after the mutex is released
+    }
+}
+
+
+uint64_t nstime()   // current gettimeofday() reading, converted to nanoseconds
+{
+    timeval now;
+    gettimeofday(&now, nullptr);   // fills seconds and microseconds
+    return uint64_t(1000000000) * now.tv_sec + uint64_t(1000) * now.tv_usec;
+}
+
+// Benchmarks one placement: two threads execute run(0)/run(1) until `remain`
+// is used up, then the elapsed nanoseconds per iteration and `name` are printed.
+void test(std::string name,
+        std::function<void (pinned_thread& t)> pin0,
+        std::function<void (pinned_thread& t)> pin1)
+{
+    auto n_iterations = 10000000;
+    pinned_thread t0([] { run(0); });
+    pinned_thread t1([] { run(1); });
+    pin0(t0);
+    pin1(t1);
+    remain = n_iterations;
+
+    auto start = nstime();   // the timed region spans thread start through join
+    t0.start();
+    t1.start();
+    t0.join();
+    t1.join();
+    auto elapsed = nstime() - start;
+
+    printf("%10" PRIu64 " %s\n", elapsed / n_iterations, name.c_str());
+}
+
+// Runs the benchmark for three placements: both pin(0), pin(0)/pin(1), and unpinned.
+int main(int ac, char** av)
+{
+    auto cpu0 = [](pinned_thread& t) { t.pin(0); };
+    auto unpinned = [](pinned_thread& t) {};
+    test("colocated", cpu0, cpu0);
+    test("apart", cpu0, [](pinned_thread& t) { t.pin(1); });
+    test("nopin", unpinned, unpinned);
+}
+
+
+