From b4e8d47d5eed133f4460aa72fe60a15400386ed4 Mon Sep 17 00:00:00 2001
From: Vlad Zolotarov <vladz@cloudius-systems.com>
Date: Mon, 10 Feb 2014 19:45:11 +0200
Subject: [PATCH] msix: thread affinity

Instead of binding all msix interrupts to cpu 0, have them chase the
interrupt service routine thread and pin themselves to the same cpu.

This patch is based on the patch from Avi Kivity <avi@cloudius-systems.com>
and used some ideas of Nadav Har'El <nyh@cloudius-systems.com>.

It improves the performance of the single thread Rx netperf test by 16%:
before - 25694 Mbps
after  - 29875 Mbps

New in V2:
 - Dropped the functor class - use lambda instead.
 - Fixed the race in a waking flow.
 - Added some comments.
 - Added the performance numbers to the patch description.

Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
Signed-off-by: Pekka Enberg <penberg@cloudius-systems.com>
---
 core/interrupt.cc        | 62 +++++++++++++++++++++++++++++++++++++++-
 include/osv/interrupt.hh |  1 +
 include/osv/sched.hh     |  3 ++
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/core/interrupt.cc b/core/interrupt.cc
index ef3b18166..73248b65f 100644
--- a/core/interrupt.cc
+++ b/core/interrupt.cc
@@ -17,6 +17,8 @@
 #include <osv/trace.hh>
 
 TRACEPOINT(trace_msix_interrupt, "vector=0x%02x", unsigned);
+TRACEPOINT(trace_msix_migrate, "vector=0x%02x apic_id=0x%x",
+                               unsigned, unsigned);
 
 using namespace pci;
 
@@ -71,6 +73,14 @@ void msix_vector::interrupt(void)
     _handler();
 }
 
+void msix_vector::set_affinity(unsigned apic_id)
+{
+    msi_message msix_msg = apic->compose_msix(_vector, apic_id);
+    for (auto entry_id : _entryids) {
+        _dev->msix_write_entry(entry_id, msix_msg._addr, msix_msg._data);
+    }
+}
+
 interrupt_manager::interrupt_manager(pci::function* dev)
     : _dev(dev)
 {
@@ -81,6 +91,42 @@ interrupt_manager::~interrupt_manager()
 
 }
 
+/**
+ * Changes the affinity of the MSI-X vector to the same CPU where its service
+ * routine thread is bound and then wakes that thread.
+ *
+ * @param current The CPU to which the MSI-X vector is currently bound
+ * @param v MSI-X vector handle
+ * @param t interrupt service routine thread
+ */
+static inline void set_affinity_and_wake(
+    sched::cpu*& current, msix_vector* v, sched::thread* t)
+{
+    auto cpu = t->get_cpu();;
+
+    if (cpu != current) {
+
+        //
+        // According to PCI spec chapter 6.8.3.5 the MSI-X table entry may be
+        // updated only if the entry is masked and the new values are promissed
+        // to be read only when the entry is unmasked.
+        //
+        v->msix_mask_entries();
+
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+
+        current = cpu;
+        trace_msix_migrate(v->get_vector(), cpu->arch.apic_id);
+        v->set_affinity(cpu->arch.apic_id);
+
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+
+        v->msix_unmask_entries();
+    }
+
+    t->wake();
+}
+
 bool interrupt_manager::easy_register(std::initializer_list<msix_binding> bindings)
 {
     unsigned n = bindings.size();
@@ -103,7 +149,21 @@ bool interrupt_manager::easy_register(std::initializer_list<msix_binding> bindin
         auto isr = binding.isr;
         auto t = binding.t;
 
-        bool assign_ok = assign_isr(vec, [=] { if (isr) isr(); if (t) t->wake(); });
+        bool assign_ok;
+
+        if (t) {
+            sched::cpu* current = nullptr;
+            assign_ok =
+                assign_isr(vec,
+                    [=]() mutable {
+                                    if (isr)
+                                        isr();
+                                    set_affinity_and_wake(current, vec, t);
+                                  });
+        } else {
+            assign_ok = assign_isr(vec, [=]() { if (isr) isr(); });
+        }
+
         if (!assign_ok) {
             free_vectors(assigned);
             return false;
diff --git a/include/osv/interrupt.hh b/include/osv/interrupt.hh
index b9823ed04..2818fe8ad 100644
--- a/include/osv/interrupt.hh
+++ b/include/osv/interrupt.hh
@@ -35,6 +35,7 @@ public:
     void add_entryid(unsigned entry_id);
     void interrupt(void);
     void set_handler(std::function<void ()> handler);
+    void set_affinity(unsigned apic_id);
 
 private:
     // Handler to invoke...
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
index 90a4aa946..026560e5b 100644
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -382,6 +382,9 @@ public:
     static void wait_for(mutex& mtx, waitable&&... waitables);
 
     void wake();
+    cpu* get_cpu() const {
+        return _detached_state.get()->_cpu;
+    }
     // wake up after acquiring mtx
     //
     // mtx must be locked, and wr must be a free wait_record that will
-- 
GitLab