From 4792e60c4140dea64e09d6f7b7c2c48f63e2df30 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@cloudius-systems.com>
Date: Wed, 23 Apr 2014 15:28:48 +0300
Subject: [PATCH] pagecache: change locking between mmu and ARC

Currently vma_list_mutex is used to protect against races between ARC
buffer mapping by the MMU and eviction by ZFS. The problem is that MMU
code calls into ZFS with vma_list_mutex held, so on that path all ZFS
related locks are taken after vma_list_mutex. An attempt to acquire
vma_list_mutex during ARC buffer eviction, while many of the same ZFS
locks are already held, causes a deadlock. This was worked around by
using trylock() and skipping an eviction if vma_list_mutex could not be
acquired, but it turns out that some mmapped buffers are destroyed not
during eviction but after writeback, and that destruction cannot be
delayed. This calls for a redesign of the locking scheme.

This patch introduces arc_lock, which has to be held during any access
to read_cache. It prevents simultaneous eviction and mapping, and it
must be the innermost lock held on any code path; the code is changed
to adhere to this rule. To make that possible the patch replaces the
ARC_SHARED_BUF flag with a new b_mmaped field. The reason is that
access to the b_flags field is guarded by hash_lock, and it is
impossible to guarantee the same ordering between hash_lock and
arc_lock on all code paths. Dropping the need for hash_lock altogether
is the cleaner solution.

Signed-off-by: Gleb Natapov <gleb@cloudius-systems.com>
---
 bsd/porting/mmu.cc                            |  13 +-
 bsd/porting/mmu.h                             |   3 +-
 .../opensolaris/uts/common/fs/zfs/arc.c       |  32 ++--
 .../opensolaris/uts/common/fs/zfs/dmu.c       |  12 +-
 core/pagecache.cc                             | 141 ++++++++++--------
 fs/vfs/vfs_fops.cc                            |  11 +-
 include/osv/pagecache.hh                      |  14 +-
 include/osv/vfs_file.hh                       |   5 +-
 8 files changed, 113 insertions(+), 118 deletions(-)

diff --git a/bsd/porting/mmu.cc b/bsd/porting/mmu.cc
index 112b12170..e810f2384 100644
--- a/bsd/porting/mmu.cc
+++ b/bsd/porting/mmu.cc
@@ -46,16 +46,7 @@ void mmu_unmap(void* ab)
     pagecache::unmap_arc_buf((arc_buf_t*)ab);
 }
 
-namespace mmu {
-extern mutex vma_list_mutex;
-}
-
-bool mmu_vma_list_trylock()
-{
-    return mutex_trylock(&mmu::vma_list_mutex);
-}
-
-void mmu_vma_list_unlock()
+void mmu_map(void* key, void* ab, void* page)
 {
-    mutex_unlock(&mmu::vma_list_mutex);
+    pagecache::map_arc_buf((pagecache::hashkey*)key, (arc_buf_t*)ab, page);
 }
diff --git a/bsd/porting/mmu.h b/bsd/porting/mmu.h
index 79bb3982e..50b777466 100644
--- a/bsd/porting/mmu.h
+++ b/bsd/porting/mmu.h
@@ -52,8 +52,7 @@ int vm_paging_needed(void);
 int vm_throttling_needed(void);
 
 void mmu_unmap(void* ab);
-bool mmu_vma_list_trylock();
-void mmu_vma_list_unlock();
+void mmu_map(void* key, void* ab, void* page);
 
 #define vtophys(_va) virt_to_phys((void *)_va)
 __END_DECLS
diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index a7115eac0..fd3048953 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -533,6 +533,8 @@ struct arc_buf_hdr {
 
 	l2arc_buf_hdr_t		*b_l2hdr;
 	list_node_t		b_l2node;
+
+	uint32_t         b_mmaped;
 };
 
 static arc_buf_t *arc_eviction_list;
@@ -570,7 +572,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
-#define	ARC_SHARED_BUF		(1 << 19)	/* shared in OS' page cache */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -585,7 +586,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
-#define	HDR_SHARED_BUF(hdr)	((hdr)->b_flags & ARC_SHARED_BUF)
 
 /*
  * Other sizes
@@ -1479,24 +1479,17 @@ void
 arc_share_buf(arc_buf_t *abuf)
 {
 	arc_buf_hdr_t *hdr = abuf->b_hdr;
-	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
-	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 
-	mutex_lock(hash_lock);
-	hdr->b_flags |= ARC_SHARED_BUF;
-	mutex_unlock(hash_lock);
+	hdr->b_mmaped = 1;
 }
 
 void
 arc_unshare_buf(arc_buf_t *abuf)
 {
 	arc_buf_hdr_t *hdr = abuf->b_hdr;
-	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
-	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 
-	mutex_lock(hash_lock);
-	hdr->b_flags &= ~ARC_SHARED_BUF;
-	mutex_unlock(hash_lock);
+	ASSERT(hdr->b_mmaped != 0);
+	hdr->b_mmaped = 0;
 }
 
 /*
@@ -1620,6 +1613,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
 		uint64_t size = buf->b_hdr->b_size;
 		arc_buf_contents_t type = buf->b_hdr->b_type;
 
+		if (buf->b_hdr->b_mmaped) {
+			mmu_unmap(buf);
+		}
+
 		arc_cksum_verify(buf);
 #ifdef illumos
 		arc_buf_unwatch(buf);
@@ -1916,17 +1913,6 @@ evict_start:
 					missed += 1;
 					break;
 				}
-				if (HDR_SHARED_BUF(ab)) {
-				    if (mmu_vma_list_trylock()) {
-				        mmu_unmap(buf);
-			            ab->b_flags &= ~ARC_SHARED_BUF;
-			            mmu_vma_list_unlock();
-				    } else {
-				        mutex_exit(&buf->b_evict_lock);
-				        missed += 1;
-				        break;
-				    }
-				}
 				if (buf->b_data) {
 					bytes_evicted += ab->b_size;
 					if (recycle && ab->b_type == type &&
diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index f2ddda329..03c893393 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -46,6 +46,8 @@
 #include <sys/zfs_znode.h>
 #endif
 
+#include <bsd/porting/mmu.h>
+
 /*
  * Enable/disable nopwrite feature.
  */
@@ -984,7 +986,6 @@ dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int err;
-	struct iovec *iov;
 	int numbufs = 0;
 
 	// This will acquire a reference both in the dbuf, and in the ARC buffer.
@@ -1000,14 +1001,7 @@ dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	arc_buf_t *dbuf_abuf = dbi->db_buf;
 
-	iov = &uio->uio_iov[0];
-	iov->iov_base = dbuf_abuf->b_data + (uio->uio_loffset - db->db_offset);
-	iov->iov_len = db->db_size;
-
-	iov = &uio->uio_iov[1];
-	iov->iov_base = dbuf_abuf;
-
-	arc_share_buf(dbuf_abuf);
+	mmu_map(uio->uio_iov->iov_base, dbuf_abuf, dbuf_abuf->b_data + (uio->uio_loffset - db->db_offset));
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
diff --git a/core/pagecache.cc b/core/pagecache.cc
index d879ba510..cd3abcd51 100644
--- a/core/pagecache.cc
+++ b/core/pagecache.cc
@@ -15,21 +15,9 @@
 #include <fs/vfs/vfs.h>
 #include <osv/trace.hh>
 
-extern "C" void arc_unshare_buf(arc_buf_t*);
-
-namespace mmu {
-    extern mutex vma_list_mutex;
-};
-
-namespace pagecache {
-struct hashkey {
-    dev_t dev;
-    ino_t ino;
-    off_t offset;
-    bool operator==(const hashkey& a) const noexcept {
-        return (dev == a.dev) && (ino == a.ino) && (offset == a.offset);
-    }
-};
+extern "C" {
+void arc_unshare_buf(arc_buf_t*);
+void arc_share_buf(arc_buf_t*);
 }
 
 namespace std {
@@ -250,16 +238,29 @@ std::unordered_multimap<arc_buf_t*, cached_page_arc*> cached_page_arc::arc_cache
 static std::unordered_map<hashkey, cached_page_arc*> read_cache;
 static std::unordered_map<hashkey, cached_page_write*> write_cache;
 static std::deque<cached_page_write*> write_lru;
+static mutex arc_lock; // protects against parallel eviction, parallel creation impossible due to vma_list_lock
 
-TRACEPOINT(trace_add_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
-void add_mapping(cached_page_arc *cp, mmu::hw_ptep ptep)
+template<typename T>
+static T find_in_cache(std::unordered_map<hashkey, T>& cache, hashkey& key)
 {
-    trace_add_mapping(cp->arcbuf(), cp->addr(), ptep.release());
+    auto cpi = cache.find(key);
+
+    if (cpi == cache.end()) {
+        return nullptr;
+    } else {
+        return cpi->second;
+    }
+}
+
+TRACEPOINT(trace_add_read_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
+void add_read_mapping(cached_page_arc *cp, mmu::hw_ptep ptep)
+{
+    trace_add_read_mapping(cp->arcbuf(), cp->addr(), ptep.release());
     cp->map(ptep);
 }
 
 TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*);
-void remove_mapping(cached_page_arc* cp, mmu::hw_ptep ptep)
+void remove_read_mapping(cached_page_arc* cp, mmu::hw_ptep ptep)
 {
     trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release());
     if (cp->unmap(ptep) == 0) {
@@ -268,6 +269,15 @@ void remove_mapping(cached_page_arc* cp, mmu::hw_ptep ptep)
     }
 }
 
+void remove_read_mapping(hashkey& key, mmu::hw_ptep ptep)
+{
+    SCOPE_LOCK(arc_lock);
+    cached_page_arc* cp = find_in_cache(read_cache, key);
+    if (cp) {
+        remove_read_mapping(cp, ptep);
+    }
+}
+
 TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*);
 unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
 {
@@ -284,23 +294,36 @@ unsigned drop_read_cached_page(cached_page_arc* cp, bool flush)
     return flushed;
 }
 
+void drop_read_cached_page(hashkey& key)
+{
+    SCOPE_LOCK(arc_lock);
+    cached_page_arc* cp = find_in_cache(read_cache, key);
+    if (cp) {
+        drop_read_cached_page(cp, true);
+    }
+}
+
 TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*);
 void unmap_arc_buf(arc_buf_t* ab)
 {
-    assert(mutex_owned(&mmu::vma_list_mutex));
     trace_unmap_arc_buf(ab);
+    SCOPE_LOCK(arc_lock);
     cached_page_arc::unmap_arc_buf(ab);
 }
 
-static cached_page_arc* create_read_cached_page(vfs_file* fp, hashkey& key)
+TRACEPOINT(trace_map_arc_buf, "buf=%p page=%p", void*, void*);
+void map_arc_buf(hashkey *key, arc_buf_t* ab, void *page)
 {
-    cached_page_arc *pc;
-    arc_buf_t* ab;
-    void* page;
-    fp->get_arcbuf(key.offset, &ab, &page);
-    pc = new cached_page_arc(key, page, ab);
-    read_cache.emplace(key, pc);
-    return pc;
+    trace_map_arc_buf(ab, page);
+    SCOPE_LOCK(arc_lock);
+    cached_page_arc* pc = new cached_page_arc(*key, page, ab);
+    read_cache.emplace(*key, pc);
+    arc_share_buf(ab);
+}
+
+static void create_read_cached_page(vfs_file* fp, hashkey& key)
+{
+    fp->get_arcbuf(&key, key.offset);
 }
 
 static std::unique_ptr<cached_page_write> create_write_cached_page(vfs_file* fp, hashkey& key)
@@ -333,25 +356,12 @@ static void insert(cached_page_write* cp) {
     }
 }
 
-template<typename T>
-static T find_in_cache(std::unordered_map<hashkey, T>& cache, hashkey& key)
-{
-    auto cpi = cache.find(key);
-
-    if (cpi == cache.end()) {
-        return nullptr;
-    } else {
-        return cpi->second;
-    }
-}
-
 bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, bool write, bool shared)
 {
     struct stat st;
     fp->stat(&st);
     hashkey key {st.st_dev, st.st_ino, offset};
     cached_page_write* wcp = find_in_cache(write_cache, key);
-    cached_page_arc* rcp = find_in_cache(read_cache, key);
 
     if (write) {
         if (!wcp) {
@@ -362,14 +372,10 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, boo
                 insert(wcp);
                 // page is moved from ARC to write cache
                 // drop ARC page if exists, removing all mappings
-                 if (rcp) {
-                    drop_read_cached_page(rcp);
-                }
+                drop_read_cached_page(key);
             } else {
                 // remove mapping to ARC page if exists
-                if (rcp) {
-                    remove_mapping(rcp, ptep);
-                }
+                remove_read_mapping(key, ptep);
                 // cow of private page from ARC
                 return mmu::write_pte(newcp->release(), ptep, pte);
             }
@@ -381,11 +387,17 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, boo
         }
     } else if (!wcp) {
         // read fault and page is not in write cache yet, return one from ARC, mark it cow
-        if (!rcp) {
-            rcp = create_read_cached_page(fp, key);
-        }
-        add_mapping(rcp, ptep);
-        return mmu::write_pte(rcp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+        do {
+            WITH_LOCK(arc_lock) {
+                cached_page_arc* cp = find_in_cache(read_cache, key);
+                if (cp) {
+                    add_read_mapping(cp, ptep);
+                    return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true));
+                }
+            }
+            // page is not in cache yet, create and try again
+            create_read_cached_page(fp, key);
+        } while (true);
     }
 
     wcp->map(ptep);
@@ -395,24 +407,29 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, boo
 
 bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep ptep)
 {
-    bool free = false;
     struct stat st;
     fp->stat(&st);
     hashkey key {st.st_dev, st.st_ino, offset};
-    cached_page_write *wcp = find_in_cache(write_cache, key);
-    cached_page_arc *rcp = find_in_cache(read_cache, key);
+    cached_page_write* wcp = find_in_cache(write_cache, key);
 
     // page is either in ARC cache or write cache or private page
+
     if (wcp && wcp->addr() == addr) {
         // page is in write cache
         wcp->unmap(ptep);
-    } else if (rcp && rcp->addr() == addr) {
-        // page is in ARC
-        remove_mapping(rcp, ptep);
-    } else {
-        // private page
-        free = true;
+        return false;
+    }
+
+    WITH_LOCK(arc_lock) {
+        cached_page_arc* rcp = find_in_cache(read_cache, key);
+        if (rcp && rcp->addr() == addr) {
+            // page is in ARC
+            remove_read_mapping(rcp, ptep);
+            return false;
+        }
     }
-    return free;
+
+    // private page, the caller will free it
+    return true;
 }
 }
diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc
index 475a196e2..fa0537f45 100644
--- a/fs/vfs/vfs_fops.cc
+++ b/fs/vfs/vfs_fops.cc
@@ -154,24 +154,23 @@ bool vfs_file::put_page(void *addr, uintptr_t off, size_t size, mmu::hw_ptep pte
 // eviction that will hold the mmu-side lock that protects the mappings
 // Always follow that order. We however can't just get rid of the mmu-side lock,
 // because not all invalidations will be synchronous.
-void vfs_file::get_arcbuf(uintptr_t offset, arc_buf_t** arcbuf, void** page)
+void vfs_file::get_arcbuf(void* key, off_t offset)
 {
     struct vnode *vp = f_dentry->d_vnode;
 
-    iovec io[2]; // 0 - pointer to page, 1 - pointer to ARC buffer
+    iovec io[1];
 
+    io[0].iov_base = key;
     uio data;
     data.uio_iov = io;
-    data.uio_iovcnt = 2;
-    data.uio_offset = off_t(offset);
+    data.uio_iovcnt = 1;
+    data.uio_offset = offset;
     data.uio_resid = mmu::page_size;
     data.uio_rw = UIO_READ;
 
     vn_lock(vp);
     assert(VOP_CACHE(vp, this, &data) == 0);
     vn_unlock(vp);
-    *page = io[0].iov_base;
-    *arcbuf = static_cast<arc_buf_t*>(io[1].iov_base);
 }
 
 std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range range, unsigned flags, unsigned perm, off_t offset)
diff --git a/include/osv/pagecache.hh b/include/osv/pagecache.hh
index d87b3b4b0..181c15ffb 100644
--- a/include/osv/pagecache.hh
+++ b/include/osv/pagecache.hh
@@ -10,10 +10,22 @@
 #include <osv/mmu.hh>
 #include "arch-mmu.hh"
 
+struct arc_buf;
+typedef arc_buf arc_buf_t;
+
 namespace pagecache {
 
+struct hashkey {
+    dev_t dev;
+    ino_t ino;
+    off_t offset;
+    bool operator==(const hashkey& a) const noexcept {
+        return (dev == a.dev) && (ino == a.ino) && (offset == a.offset);
+    }
+};
+
 bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, bool write, bool shared);
 bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep ptep);
 void unmap_arc_buf(arc_buf_t* ab);
-
+void map_arc_buf(hashkey* key, arc_buf_t* ab, void* page);
 }
diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh
index 01336b9b7..7ac85d60e 100644
--- a/include/osv/vfs_file.hh
+++ b/include/osv/vfs_file.hh
@@ -10,9 +10,6 @@
 
 #include <osv/file.h>
 
-struct arc_buf;
-typedef arc_buf arc_buf_t;
-
 class vfs_file final : public file {
 public:
     explicit vfs_file(unsigned flags);
@@ -28,7 +25,7 @@ public:
     virtual bool map_page(uintptr_t offset, size_t size, mmu::hw_ptep ptep, mmu::pt_element pte, bool write, bool shared);
     virtual bool put_page(void *addr, uintptr_t offset, size_t size, mmu::hw_ptep ptep);
 
-    void get_arcbuf(uintptr_t offset, arc_buf_t** arcbuf, void** page);
+    void get_arcbuf(void *key, off_t offset);
 };
 
 #endif /* VFS_FILE_HH_ */
-- 
GitLab