From 8ef7f248887d2f39eb84f82f52ee860f78cb4643 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@cloudius-systems.com>
Date: Wed, 12 Feb 2014 13:42:34 +0200
Subject: [PATCH] Fix concurrent static initialization

When multiple threads concurrently use a function which has a static
variable with a constructor, gcc guarantees that only one thread will run
the constructor, and the other ones waits.  It uses __cxa_guard_acquire()/
release()/abort() for implementing this guarantee. Unfortunately these
functions, implemented in the C++ standard library, use the Linux futex
syscall which we don't implement. The futex system call is only used in
the rare case of contention - in the normal case, the __cxa_guard_*
functions uses just atomic memory operations.

This patch implements the bare minimum we need from futex to make the
__cxa_guard_* functions work: All we need is FUTEX_WAIT and FUTEX_WAKE,
with 0 timeout in FUTEX_WAIT and wake all in FUTEX_WAKE.

It's nice how the new waitqueues fit this implementation like glove to
a hand: We could use condvars too, but we anyway need to do the wake()
with the mutex locked (to get the futex's atomic test-and-wait), and in
that case we can use waitqueues without their additional internal mutex.

This patch also adds a test for this bug. Catching this bug in a real
application is very rare, but the test exposes it by defining an function-
static object with a very slow constructor (it sleeps for a second), and
calls the function from several threads concurrently.
Before this patch the test fails with:
   constructing
   syscall(): unimplemented system call 202. Aborting.

Fixes #199.

Signed-off-by: Nadav Har'El <nyh@cloudius-systems.com>
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
---
 build.mk                     |  1 +
 linux.cc                     | 71 ++++++++++++++++++++++++++-
 tests/tst-concurrent-init.cc | 95 ++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 1 deletion(-)
 create mode 100644 tests/tst-concurrent-init.cc

diff --git a/build.mk b/build.mk
index 89c8422bf..baae31c7a 100644
--- a/build.mk
+++ b/build.mk
@@ -241,6 +241,7 @@ tests += tests/tst-pthread-clock.so
 tests += tests/misc-procfs.so
 tests += tests/tst-chdir.so
 tests += tests/tst-hello.so
+tests += tests/tst-concurrent-init.so
 
 tests/hello/Hello.class: javabase=tests/hello
 
diff --git a/linux.cc b/linux.cc
index 554bd8510..e37cc5f5a 100644
--- a/linux.cc
+++ b/linux.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Cloudius Systems, Ltd.
+ * Copyright (C) 2013-2014 Cloudius Systems, Ltd.
  *
  * This work is open source software, licensed under the terms of the
  * BSD license as described in the LICENSE file in the top-level directory.
@@ -10,16 +10,67 @@
 #include <osv/debug.hh>
 #include <boost/format.hpp>
 #include <osv/sched.hh>
+#include <osv/mutex.h>
+#include <osv/waitqueue.hh>
 
 #include <syscall.h>
 #include <stdarg.h>
 #include <time.h>
 
+#include <unordered_map>
+
 long gettid()
 {
     return sched::thread::current()->id();
 }
 
+// We don't expect applications to use the Linux futex() system call (it is
+// normally only used to implement higher-level synchronization mechanisms),
+// but unfortunately gcc's C++ runtime uses a subset of futex in the
+// __cxa__guard_* functions, which safeguard the concurrent initialization
+// of function-scope static objects. We only implement here this subset.
+// The __cxa_guard_* functions only call futex in the rare case of contention,
+// in fact so rarely that OSv existed for a year before anyone noticed futex
+// was missing. So the performance of this implementation is not critical.
+static std::unordered_map<void*, waitqueue> queues;
+static mutex queues_mutex;
+enum {
+    FUTEX_WAIT = 0,
+    FUTEX_WAKE = 1,
+};
+
+int futex(int *uaddr, int op, int val, const struct timespec *timeout,
+        int *uaddr2, int val3)
+{
+    switch (op) {
+    case FUTEX_WAIT:
+        assert(timeout == 0);
+        WITH_LOCK(queues_mutex) {
+            if (*uaddr == val) {
+                waitqueue &q = queues[uaddr];
+                q.wait(queues_mutex);
+            }
+        }
+        return 0;
+    case FUTEX_WAKE:
+        assert(val == INT_MAX);
+        WITH_LOCK(queues_mutex) {
+            auto i = queues.find(uaddr);
+            if (i != queues.end()) {
+                i->second.wake_all(queues_mutex);
+                queues.erase(i);
+            }
+        }
+        // FIXME: We are expected to return a count of woken threads, but
+        // wake_all doesn't have this feature, and the only user we care
+        // about, __cxa_guard_*, doesn't need this return value anyway.
+        return 0;
+    default:
+        abort("Unimplemented futex() operation %d\n", op);
+    }
+}
+
+
 long syscall(long number, ...)
 {
     switch (number) {
@@ -44,6 +95,24 @@ long syscall(long number, ...)
         va_end(args);
         return clock_getres(arg1, arg2);
         }
+    case __NR_futex: {
+        va_list args;
+        int *arg1;
+        int arg2;
+        int arg3;
+        const struct timespec *arg4;
+        int *arg5;
+        int arg6;
+        va_start(args, number);
+        arg1 = va_arg(args, typeof(arg1));
+        arg2 = va_arg(args, typeof(arg2));
+        arg3 = va_arg(args, typeof(arg3));
+        arg4 = va_arg(args, typeof(arg4));
+        arg5 = va_arg(args, typeof(arg5));
+        arg6 = va_arg(args, typeof(arg6));
+        va_end(args);
+        return futex(arg1, arg2, arg3, arg4, arg5, arg6);
+    }
     }
 
     abort("syscall(): unimplemented system call %d. Aborting.\n", number);
diff --git a/tests/tst-concurrent-init.cc b/tests/tst-concurrent-init.cc
new file mode 100644
index 000000000..7bfc359fc
--- /dev/null
+++ b/tests/tst-concurrent-init.cc
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+// Test concurrent initialization of function-static varibles.
+// Gcc implements this using __cxa_guard_acquire().
+
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <string>
+#include <vector>
+#include <atomic>
+#include <cassert>
+
+using namespace std;
+
+static int tests = 0, fails = 0;
+
+static void report(bool ok, string msg)
+{
+    ++tests;
+    fails += !ok;
+    cout << (ok ? "PASS" : "FAIL") << ": " << msg << "\n";
+}
+
+// A type with a really slow constructor
+static atomic<int> constructions {0};
+struct mycounter {
+    mycounter() {
+        // This is a really slow constructor
+        cout << "constructing\n";
+        ++constructions;
+        this_thread::sleep_for(chrono::seconds(1));
+    }
+    atomic<int> counter {};
+    void incr() {
+        counter++;
+    }
+    int get() {
+        return counter.load();
+    }
+};
+
+// A function using a static variable with a really slow constructor.
+// If this function is concurrently called from two threads, only
+// one thread will run the constructor, and the second will wait for
+// it to complete (internally, gcc uses __cxa_guard_acquire to do that).
+int f(){
+    static mycounter a;
+    return a.counter++;
+}
+
+int main(int ac, char** av)
+{
+    vector<thread> threads;
+    constexpr int nthreads = 5;
+    int results[nthreads];
+    chrono::milliseconds::rep ms[nthreads];
+
+    for (int i = 0; i < nthreads; i++) {
+        threads.push_back(thread([i, &results, &ms] {
+            auto start = chrono::high_resolution_clock::now();
+            results[i] = f();
+            auto end = chrono::high_resolution_clock::now();
+            ms[i] = chrono::duration_cast<chrono::milliseconds>(end - start).
+                    count();
+        }));
+    }
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    report(constructions.load() == 1, "mycounter constructed once");
+
+    bool seen[nthreads] {};
+    for (int i =0; i < nthreads; i++) {
+        auto r = results[i];
+        report(r >= 0 && r < nthreads && !seen[i],
+                string("thread ") + to_string(i));
+        seen[i] = true;
+        // We don't expect f to finish too quickly, it will need close to
+        // a second to complete the initialization itself, or wait for
+        // another thread doing the initalization.
+        report(ms[i] > 500, string("thread ") + to_string(i) + " slow enough");
+    }
+
+
+    cout << "SUMMARY: " << tests << " tests, " << fails << " failures\n";
+    return fails == 0 ? 0 : 1;
+}
-- 
GitLab