diff --git a/bsd/sys/kern/uipc_socket.cc b/bsd/sys/kern/uipc_socket.cc index f5992fcbc008d1970ca901fa7c83a2f367d1ae93..7182da5164ee8fa438859bcf55d5efbf5acacefd 100644 --- a/bsd/sys/kern/uipc_socket.cc +++ b/bsd/sys/kern/uipc_socket.cc @@ -104,6 +104,7 @@ #include <stddef.h> #include <osv/poll.h> +#include <sys/epoll.h> #include <osv/debug.h> #include <cinttypes> @@ -2652,7 +2653,7 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred, } } - if (revents == 0) { + if (revents == 0 || events & EPOLLET) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { so->so_rcv.sb_flags |= SB_SEL; } diff --git a/core/epoll.cc b/core/epoll.cc index e00ea8ce144c35606178e18b48ca8b496440a6cb..63e9940ec5ed3123dc78d2c4634bd00ee1364598 100644 --- a/core/epoll.cc +++ b/core/epoll.cc @@ -34,9 +34,8 @@ TRACEPOINT(trace_epoll_ready, "file=%p, event=0x%x", file*, int); // We implement epoll using poll(), and therefore need to convert epoll's // event bits to and poll(). These are mostly the same, so the conversion -// is trivial, but we verify this here with static_asserts. We also don't -// support epoll-only flags (namely EPOLLET and EPOLLONESHOT) and assert -// this at runtime. +// is trivial, but we verify this here with static_asserts. We additionally +// support the epoll-only EPOLLET, but not EPOLLONESHOT. static_assert(POLLIN == EPOLLIN, "POLLIN!=EPOLLIN"); static_assert(POLLOUT == EPOLLOUT, "POLLOUT!=EPOLLOUT"); static_assert(POLLRDHUP == EPOLLRDHUP, "POLLRDHUP!=EPOLLRDHUP"); @@ -44,15 +43,10 @@ static_assert(POLLPRI == EPOLLPRI, "POLLPRI!=EPOLLPRI"); static_assert(POLLERR == EPOLLERR, "POLLERR!=EPOLLERR"); static_assert(POLLHUP == EPOLLHUP, "POLLHUP!=EPOLLHUP"); constexpr int SUPPORTED_EVENTS = - EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLPRI | EPOLLERR | EPOLLHUP; + EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLPRI | EPOLLERR | EPOLLHUP | + EPOLLET; inline uint32_t events_epoll_to_poll(uint32_t e) { - static bool warned; - if ((e & EPOLLET) && !warned) { - warned = true; - debug("EPOLLET ignored\n"); - } - e &= ~EPOLLET; assert (!(e & ~SUPPORTED_EVENTS)); return e; } @@ -62,8 +56,14 @@ inline uint32_t events_poll_to_epoll(uint32_t e) return e; } +struct registered_epoll : epoll_event { + int last_poll_wake_count; // For implementing EPOLLET + registered_epoll(epoll_event e, int c) : + epoll_event(e), last_poll_wake_count(c) {} +}; + class epoll_file final : public special_file { - std::unordered_map<file*, epoll_event> map; + std::unordered_map<file*, registered_epoll> map; public: epoll_file() : special_file(0, DTYPE_UNSPEC) {} virtual int close() override { @@ -78,8 +78,11 @@ public: if (map.count(fp)) { return EEXIST; } - map.emplace(std::move(fp), *event); WITH_LOCK(fp->f_lock) { + // I used poll_wake_count-1, to ensure EPOLLET returns once when + // registering an epoll after data is already available. + map.emplace(std::move(fp), + registered_epoll(*event, fp->poll_wake_count - 1)); if (!fp->f_epolls) { fp->f_epolls.reset(new std::vector<file*>); } @@ -90,7 +93,9 @@ public: int mod(file* fp, struct epoll_event *event) { try { - map.at(fp) = *event; + WITH_LOCK(fp->f_lock) { + map.at(fp) = registered_epoll(*event, fp->poll_wake_count - 1); + } return 0; } catch (std::out_of_range &e) { return ENOENT; @@ -112,7 +117,8 @@ public: for (auto &i : map) { int eevents = i.second.events; auto events = events_epoll_to_poll(eevents); - pollfds.emplace_back(i.first, events, 0); + pollfds.emplace_back(i.first, events, 0, + i.second.last_poll_wake_count); } int r = do_poll(pollfds, timeout_ms); if (r > 0) { @@ -122,10 +128,14 @@ public: if (pollfds[i].revents) { --remain; assert(pollfds[i].fp); - events[remain].data = map[pollfds[i].fp.get()].data; + events[remain].data = map.at(pollfds[i].fp.get()).data; events[remain].events = events_poll_to_epoll(pollfds[i].revents); trace_epoll_ready(pollfds[i].fp.get(), pollfds[i].revents); + if (pollfds[i].events & EPOLLET) { + map.at(pollfds[i].fp.get()).last_poll_wake_count = + pollfds[i].last_poll_wake_count; + } } } } diff --git a/core/poll.cc b/core/poll.cc index 1944b2d22e8c41ab615e29385f5ecfe37561fa64..5b9d6ebf1198c80247fb7063f1519090c250ee5f 100644 --- a/core/poll.cc +++ b/core/poll.cc @@ -47,7 +47,7 @@ #include <osv/file.h> #include <osv/poll.h> -#include <osv/debug.h> +#include <sys/epoll.h> #include <bsd/porting/netport.h> #include <bsd/porting/synch.h> @@ -124,6 +124,20 @@ int poll_scan(std::vector<poll_file>& _pfd) } entry->revents = fp->poll(entry->events); + + // Hack for implementing epoll() over poll(). Note we can't avoid the + // above fp->poll() call, because sockets have the SB_SEL optimization + // where only their poll (so_poll_generic()) turns on SB_SEL. + if (entry->events & EPOLLET) { + WITH_LOCK(fp->f_lock) { + if (entry->last_poll_wake_count == fp->poll_wake_count) { + entry->revents = 0; + continue; + } + entry->last_poll_wake_count = fp->poll_wake_count; + } + } + if (entry->revents) { nr_events++; } @@ -166,6 +180,10 @@ int poll_wake(struct file* fp, int events) } } + // poll_wake_count is used for implementing epoll()'s EPOLLET + // over regular poll(). + ++fp->poll_wake_count; + FD_UNLOCK(fp); fdrop(fp); @@ -208,6 +226,12 @@ void poll_install(struct pollreq* p) fp->poll_install(*p); FD_LOCK(fp); TAILQ_INSERT_TAIL(&fp->f_poll_list, pl, _link); + if (entry->events & EPOLLET && + entry->last_poll_wake_count == fp->poll_wake_count) { + // In this case, don't use fp->poll() to check for missed event + FD_UNLOCK(fp); + continue; + } FD_UNLOCK(fp); // We need to check if we missed an event on this file just before // installing the poll request on it above. diff --git a/include/osv/file.h b/include/osv/file.h index adbada86b8c0338cac256fe86be708c8cc06188c..70d4079069ca0f445567d1f9c594a1ea53288aeb 100755 --- a/include/osv/file.h +++ b/include/osv/file.h @@ -98,6 +98,9 @@ struct file { TAILQ_HEAD(, poll_link) f_poll_list; /* poll request list */ mutex_t f_lock; /* lock */ std::unique_ptr<std::vector<file*>> f_epolls; + // poll_wake_count used for implementing epoll()'s EPOLLET using poll(). + // Once we have a real epoll() implementation, it won't be needed. + int poll_wake_count = 0; }; // struct file above is an abstract class; subclasses need to implement 8 diff --git a/include/osv/poll.h b/include/osv/poll.h index 521d6c303e495f258c22b719866e257b7f7e7a9f..54866f43ec9878caacede8c545ff0f8098a481e9 100644 --- a/include/osv/poll.h +++ b/include/osv/poll.h @@ -125,11 +125,13 @@ struct poll_file; struct poll_file { poll_file() = default; - poll_file(fileref fp, short events, short revents) - : fp(fp), events(events), revents(revents) {} + // Note that events is int, not short, for EPOLLET support. + poll_file(fileref fp, int events, short revents, int c = 0) + : fp(fp), events(events), revents(revents), last_poll_wake_count(c) {} fileref fp; - short events; + int events; short revents; + int last_poll_wake_count; // For implementing EPOLLET }; /* diff --git a/tests/tst-epoll.cc b/tests/tst-epoll.cc index 26d5d7a036a553189230a8114fa45a4285f5f0c1..e77d199ac13472a2f3db9e5c4a557ae927214b85 100644 --- a/tests/tst-epoll.cc +++ b/tests/tst-epoll.cc @@ -67,7 +67,7 @@ int main(int ac, char** av) r = epoll_wait(ep, events, MAXEVENTS, 0); report(r == 1 && (events[0].events & EPOLLIN) && - (events[0].data.u32 == 123), "epoll_wait again"); + (events[0].data.u32 == 123), "epoll_wait finds again (because not EPOLLET)"); r = read(s[0], &c, 1); report(r == 1, "read after poll"); @@ -98,6 +98,52 @@ int main(int ac, char** av) report(r == 0 && ((te - ts) > std::chrono::milliseconds(200)), "epoll timeout"); + //////////////////////////////////////////////////////////////////////////// + // Test EPOLLET (edge-triggered event notification) + // Also test EPOLL_CTL_MOD at the same time. + event.events = EPOLLIN | EPOLLET; + event.data.u32 = 456; + r = epoll_ctl(ep, EPOLL_CTL_MOD, s[0], &event); + report(r == 0, "epoll_ctl_mod"); + r = epoll_wait(ep, events, MAXEVENTS, 0); + report(r == 0, "epoll nothing happened yet"); + r = write(s[1], &c, 1); + report(r == 1, "write single character"); + r = epoll_wait(ep, events, MAXEVENTS, 0); + report(r == 1 && (events[0].events & EPOLLIN) && + (events[0].data.u32 == 456), "epoll_wait finds fd"); + r = epoll_wait(ep, events, MAXEVENTS, 0); + report(r == 0, "epoll_wait doesn't find again (because of EPOLLET)"); + // Also verify that epoll_wait with a timeout doesn't return immediately, + // (despite fp->poll() being true right after poll_install()). + ts = std::chrono::high_resolution_clock::now(); + r = epoll_wait(ep, events, MAXEVENTS, 300); + te = std::chrono::high_resolution_clock::now(); + report(r == 0 && ((te - ts) > std::chrono::milliseconds(200)), + "epoll timeout doesn't return immediately (EPOLLET)"); + // The accurate edge-triggered behavior of EPOLLET means that until the + // all the data is read from the pipe, epoll should not find the pipe again + // even if new data comes in. However, both Linux and OSv gives a "false + // positive" where if new data arrives, epoll will return even if the data + // was not fully read previously. + r = write(s[1], &c, 1); + report(r == 1, "write single character"); + r = epoll_wait(ep, events, MAXEVENTS, 0); + report(r == 1 && (events[0].events & EPOLLIN) && + (events[0].data.u32 == 456), "epoll_wait false positive (fine)"); + r = read(s[0], &c, 1); + report(r == 1, "read one byte out of 2 on the pipe"); + // We only read one byte out of the 2 on the pipe, so there's still data + // on the pipe (and poll() verifies this), but with EPOLLET, epoll won't + // return it. + r = epoll_wait(ep, events, MAXEVENTS, 0); + report(r == 0, "now epoll_wait doesn't find this fd (EPOLLET)"); + r = poll(&p, 1, 0); + report(r == 1 && (p.revents & POLLIN), "but poll() does find this fd"); + r = read(s[0], &c, 1); + report(r == 1, "read the last byte on the pipe"); + + std::cout << "SUMMARY: " << tests << ", " << fails << " failures\n"; }