From d41d748f89cf6e47820e8585cef6eedb837c05be Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@cloudius-systems.com>
Date: Mon, 10 Feb 2014 10:52:10 +0200
Subject: [PATCH] epoll: Support epoll()'s EPOLLET

This patch adds support for epoll()'s edge-triggered mode, EPOLLET.
Fixes #188.

As explained in issue #188, Boost's asio uses EPOLLET heavily, and we use
that library in our management http server, and also in our image creation
tool (cpiod.so). By ignoring EPOLLET, like we did until now, the code worked,
but unnecessarily wasted CPU when epoll_wait() always returned immediately
instead of waiting until a new event.

This patch works within the confines of our existing poll mechanisms -
where epoll() call poll(). We do not change this in this patch, and it
should be changed in the future (see issue #17).

In this patch we add to each struct file a field "poll_wake_count", which
as its name suggests counts the number of poll_wake()s done on this
file. Additionally, epoll remembers the last value it saw of this counter,
so that in poll_scan(), if we see that an fp (polled with EPOLLET) has
an unchanged counter from last time, we do not return readiness on this fp
regardless on whether or not it has readable data.

We have a complication with EPOLLET on sockets. These have an "SB_SEL"
optimization, which avoids calling poll_wake() when it thinks the new
data is not interesting because the old data was not yet consumed, and
also avoids calling poll_wake() if fp->poll() was not previously done.
This optimization is counter-productive for EPOLLET (and causes missed
wakeups) so we need to work around it in the EPOLLET case.

This patch also adds a test for the EPOLLET case in tst-epoll.cc. The test
runs on both OSv and Linux, and can confirm that in the tested scenarios,
Linux and OSv behave the same, including even one same false-positive:
When epoll_wait() tells us there is data in a pipe, and we don't read it,
but then more data comes on a pipe, epoll_wait() will again return a new
event, despite this is not really being an edge event (the pipe didn't
change from empty to not-empty, as it was previously not-empty as well).

Concluding remarks:

The primary goal of this implementation is to stop EPOLLET epoll_wait()
from returning immediately despite nothing have happened on the file.
That was what caused the 100% CPU use before this patch. That being said,
the goal of this patch is NOT to avoid all false-positives or unnecessary
wakeups; When events do occur on the file, we may be doing a bit more
wakeups than strictly necessary. I think this is acceptable (our epoll()
has worse problems) but for posterity, I want to explain:

I already mentioned above one false-positive that also happens on Linux.
Another false-positive wakeup that remains is in one of EPOLLET's classic
use cases: Consider several threads sleeping on epoll() on the same socket
(e.g., TCP listening socket, or UDP socket). When one packet arrives, normal
level-triggered epoll() will wake all the threads, but only one will read
the packet and the rest will find they have nothing to read. With edge-
triggered epoll, only one thread should be woken and the rest would not.
But in our implementation, poll_wake() wakes up *all* the pollers on this
file, so we cannot currently support this optimization.

Signed-off-by: Nadav Har'El <nyh@cloudius-systems.com>
Signed-off-by: Avi Kivity <avi@cloudius-systems.com>
---
 bsd/sys/kern/uipc_socket.cc |  3 ++-
 core/epoll.cc               | 40 +++++++++++++++++++------------
 core/poll.cc                | 26 +++++++++++++++++++-
 include/osv/file.h          |  3 +++
 include/osv/poll.h          |  8 ++++---
 tests/tst-epoll.cc          | 48 ++++++++++++++++++++++++++++++++++++-
 6 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/bsd/sys/kern/uipc_socket.cc b/bsd/sys/kern/uipc_socket.cc
index f5992fcbc..7182da516 100644
--- a/bsd/sys/kern/uipc_socket.cc
+++ b/bsd/sys/kern/uipc_socket.cc
@@ -104,6 +104,7 @@
 #include <stddef.h>
 
 #include <osv/poll.h>
+#include <sys/epoll.h>
 #include <osv/debug.h>
 #include <cinttypes>
 
@@ -2652,7 +2653,7 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
 		}
 	}
 
-    if (revents == 0) {
+    if (revents == 0 || events & EPOLLET) {
         if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
             so->so_rcv.sb_flags |= SB_SEL;
         }
diff --git a/core/epoll.cc b/core/epoll.cc
index e00ea8ce1..63e9940ec 100644
--- a/core/epoll.cc
+++ b/core/epoll.cc
@@ -34,9 +34,8 @@ TRACEPOINT(trace_epoll_ready, "file=%p, event=0x%x", file*, int);
 
 // We implement epoll using poll(), and therefore need to convert epoll's
 // event bits to and poll(). These are mostly the same, so the conversion
-// is trivial, but we verify this here with static_asserts. We also don't
-// support epoll-only flags (namely EPOLLET and EPOLLONESHOT) and assert
-// this at runtime.
+// is trivial, but we verify this here with static_asserts. We additionally
+// support the epoll-only EPOLLET, but not EPOLLONESHOT.
 static_assert(POLLIN == EPOLLIN, "POLLIN!=EPOLLIN");
 static_assert(POLLOUT == EPOLLOUT, "POLLOUT!=EPOLLOUT");
 static_assert(POLLRDHUP == EPOLLRDHUP, "POLLRDHUP!=EPOLLRDHUP");
@@ -44,15 +43,10 @@ static_assert(POLLPRI == EPOLLPRI, "POLLPRI!=EPOLLPRI");
 static_assert(POLLERR == EPOLLERR, "POLLERR!=EPOLLERR");
 static_assert(POLLHUP == EPOLLHUP, "POLLHUP!=EPOLLHUP");
 constexpr int SUPPORTED_EVENTS =
-        EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLPRI | EPOLLERR | EPOLLHUP;
+        EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLPRI | EPOLLERR | EPOLLHUP |
+        EPOLLET;
 inline uint32_t events_epoll_to_poll(uint32_t e)
 {
-    static bool warned;
-    if ((e & EPOLLET) && !warned) {
-        warned = true;
-        debug("EPOLLET ignored\n");
-    }
-    e &= ~EPOLLET;
     assert (!(e & ~SUPPORTED_EVENTS));
     return e;
 }
@@ -62,8 +56,14 @@ inline uint32_t events_poll_to_epoll(uint32_t e)
     return e;
 }
 
+struct registered_epoll : epoll_event {
+    int last_poll_wake_count; // For implementing EPOLLET
+    registered_epoll(epoll_event e, int c) :
+        epoll_event(e), last_poll_wake_count(c) {}
+};
+
 class epoll_file final : public special_file {
-    std::unordered_map<file*, epoll_event> map;
+    std::unordered_map<file*, registered_epoll> map;
 public:
     epoll_file() : special_file(0, DTYPE_UNSPEC) {}
     virtual int close() override {
@@ -78,8 +78,11 @@ public:
         if (map.count(fp)) {
             return EEXIST;
         }
-        map.emplace(std::move(fp), *event);
         WITH_LOCK(fp->f_lock) {
+            // I used poll_wake_count-1, to ensure EPOLLET returns once when
+            // registering an epoll after data is already available.
+            map.emplace(std::move(fp),
+                    registered_epoll(*event, fp->poll_wake_count - 1));
             if (!fp->f_epolls) {
                 fp->f_epolls.reset(new std::vector<file*>);
             }
@@ -90,7 +93,9 @@ public:
     int mod(file* fp, struct epoll_event *event)
     {
         try {
-            map.at(fp) = *event;
+            WITH_LOCK(fp->f_lock) {
+                map.at(fp) = registered_epoll(*event, fp->poll_wake_count - 1);
+            }
             return 0;
         } catch (std::out_of_range &e) {
             return ENOENT;
@@ -112,7 +117,8 @@ public:
         for (auto &i : map) {
             int eevents = i.second.events;
             auto events = events_epoll_to_poll(eevents);
-            pollfds.emplace_back(i.first, events, 0);
+            pollfds.emplace_back(i.first, events, 0,
+                    i.second.last_poll_wake_count);
         }
         int r = do_poll(pollfds, timeout_ms);
         if (r > 0) {
@@ -122,10 +128,14 @@ public:
                 if (pollfds[i].revents) {
                     --remain;
                     assert(pollfds[i].fp);
-                    events[remain].data = map[pollfds[i].fp.get()].data;
+                    events[remain].data = map.at(pollfds[i].fp.get()).data;
                     events[remain].events =
                             events_poll_to_epoll(pollfds[i].revents);
                     trace_epoll_ready(pollfds[i].fp.get(), pollfds[i].revents);
+                    if (pollfds[i].events & EPOLLET) {
+                        map.at(pollfds[i].fp.get()).last_poll_wake_count =
+                                pollfds[i].last_poll_wake_count;
+                    }
                 }
             }
         }
diff --git a/core/poll.cc b/core/poll.cc
index 1944b2d22..5b9d6ebf1 100644
--- a/core/poll.cc
+++ b/core/poll.cc
@@ -47,7 +47,7 @@
 
 #include <osv/file.h>
 #include <osv/poll.h>
-#include <osv/debug.h>
+#include <sys/epoll.h>
 
 #include <bsd/porting/netport.h>
 #include <bsd/porting/synch.h>
@@ -124,6 +124,20 @@ int poll_scan(std::vector<poll_file>& _pfd)
         }
 
         entry->revents = fp->poll(entry->events);
+
+        // Hack for implementing epoll() over poll(). Note we can't avoid the
+        // above fp->poll() call, because sockets have the SB_SEL optimization
+        // where only their poll (so_poll_generic()) turns on SB_SEL.
+        if (entry->events & EPOLLET) {
+            WITH_LOCK(fp->f_lock) {
+                if (entry->last_poll_wake_count == fp->poll_wake_count) {
+                    entry->revents = 0;
+                    continue;
+                }
+                entry->last_poll_wake_count = fp->poll_wake_count;
+            }
+        }
+
         if (entry->revents) {
             nr_events++;
         }
@@ -166,6 +180,10 @@ int poll_wake(struct file* fp, int events)
         }
     }
 
+    // poll_wake_count is used for implementing epoll()'s EPOLLET
+    // over regular poll().
+    ++fp->poll_wake_count;
+
     FD_UNLOCK(fp);
     fdrop(fp);
 
@@ -208,6 +226,12 @@ void poll_install(struct pollreq* p)
         fp->poll_install(*p);
         FD_LOCK(fp);
         TAILQ_INSERT_TAIL(&fp->f_poll_list, pl, _link);
+        if (entry->events & EPOLLET &&
+                entry->last_poll_wake_count == fp->poll_wake_count) {
+            // In this case, don't use fp->poll() to check for missed event
+            FD_UNLOCK(fp);
+            continue;
+        }
         FD_UNLOCK(fp);
         // We need to check if we missed an event on this file just before
         // installing the poll request on it above.
diff --git a/include/osv/file.h b/include/osv/file.h
index adbada86b..70d407906 100755
--- a/include/osv/file.h
+++ b/include/osv/file.h
@@ -98,6 +98,9 @@ struct file {
 	TAILQ_HEAD(, poll_link) f_poll_list; /* poll request list */
 	mutex_t		f_lock;		/* lock */
 	std::unique_ptr<std::vector<file*>> f_epolls;
+	// poll_wake_count used for implementing epoll()'s EPOLLET using poll().
+	// Once we have a real epoll() implementation, it won't be needed.
+	int poll_wake_count = 0;
 };
 
 // struct file above is an abstract class; subclasses need to implement 8
diff --git a/include/osv/poll.h b/include/osv/poll.h
index 521d6c303..54866f43e 100644
--- a/include/osv/poll.h
+++ b/include/osv/poll.h
@@ -125,11 +125,13 @@ struct poll_file;
 
 struct poll_file {
     poll_file() = default;
-    poll_file(fileref fp, short events, short revents)
-        : fp(fp), events(events), revents(revents) {}
+    // Note that events is int, not short, for EPOLLET support.
+    poll_file(fileref fp, int events, short revents, int c = 0)
+        : fp(fp), events(events), revents(revents), last_poll_wake_count(c) {}
     fileref fp;
-    short events;
+    int events;
     short revents;
+    int last_poll_wake_count;   // For implementing EPOLLET
 };
 
 /*
diff --git a/tests/tst-epoll.cc b/tests/tst-epoll.cc
index 26d5d7a03..e77d199ac 100644
--- a/tests/tst-epoll.cc
+++ b/tests/tst-epoll.cc
@@ -67,7 +67,7 @@ int main(int ac, char** av)
 
     r = epoll_wait(ep, events, MAXEVENTS, 0);
     report(r == 1 && (events[0].events & EPOLLIN) &&
-            (events[0].data.u32 == 123), "epoll_wait again");
+            (events[0].data.u32 == 123), "epoll_wait finds again (because not EPOLLET)");
 
     r = read(s[0], &c, 1);
     report(r == 1, "read after poll");
@@ -98,6 +98,52 @@ int main(int ac, char** av)
     report(r == 0 && ((te - ts) > std::chrono::milliseconds(200)),
             "epoll timeout");
 
+    ////////////////////////////////////////////////////////////////////////////
+    // Test EPOLLET (edge-triggered event notification)
+    // Also test EPOLL_CTL_MOD at the same time.
+    event.events = EPOLLIN | EPOLLET;
+    event.data.u32 = 456;
+    r = epoll_ctl(ep, EPOLL_CTL_MOD, s[0], &event);
+    report(r == 0, "epoll_ctl_mod");
+    r = epoll_wait(ep, events, MAXEVENTS, 0);
+    report(r == 0, "epoll nothing happened yet");
+    r = write(s[1], &c, 1);
+    report(r == 1, "write single character");
+    r = epoll_wait(ep, events, MAXEVENTS, 0);
+    report(r == 1 && (events[0].events & EPOLLIN) &&
+            (events[0].data.u32 == 456), "epoll_wait finds fd");
+    r = epoll_wait(ep, events, MAXEVENTS, 0);
+    report(r == 0, "epoll_wait doesn't find again (because of EPOLLET)");
+    // Also verify that epoll_wait with a timeout doesn't return immediately,
+    // (despite fp->poll() being true right after poll_install()).
+    ts = std::chrono::high_resolution_clock::now();
+    r = epoll_wait(ep, events, MAXEVENTS, 300);
+    te = std::chrono::high_resolution_clock::now();
+    report(r == 0 && ((te - ts) > std::chrono::milliseconds(200)),
+            "epoll timeout doesn't return immediately (EPOLLET)");
+    // The accurate edge-triggered behavior of EPOLLET means that until the
+    // all the data is read from the pipe, epoll should not find the pipe again
+    // even if new data comes in. However, both Linux and OSv gives a "false
+    // positive" where if new data arrives, epoll will return even if the data
+    // was not fully read previously.
+    r = write(s[1], &c, 1);
+    report(r == 1, "write single character");
+    r = epoll_wait(ep, events, MAXEVENTS, 0);
+    report(r == 1 && (events[0].events & EPOLLIN) &&
+            (events[0].data.u32 == 456), "epoll_wait false positive (fine)");
+    r = read(s[0], &c, 1);
+    report(r == 1, "read one byte out of 2 on the pipe");
+    // We only read one byte out of the 2 on the pipe, so there's still data
+    // on the pipe (and poll() verifies this), but with EPOLLET, epoll won't
+    // return it.
+    r = epoll_wait(ep, events, MAXEVENTS, 0);
+    report(r == 0, "now epoll_wait doesn't find this fd (EPOLLET)");
+    r = poll(&p, 1, 0);
+    report(r == 1 && (p.revents & POLLIN), "but poll() does find this fd");
+    r = read(s[0], &c, 1);
+    report(r == 1, "read the last byte on the pipe");
+
+
     std::cout << "SUMMARY: " << tests << ", " << fails << " failures\n";
 }
 
-- 
GitLab