diff --git a/bsd/porting/mmu.cc b/bsd/porting/mmu.cc index 112b121704761aa9b86b240c97a089e88ea0e848..e810f238402746eaeddfb6d1b956ddb9c443cfed 100644 --- a/bsd/porting/mmu.cc +++ b/bsd/porting/mmu.cc @@ -46,16 +46,7 @@ void mmu_unmap(void* ab) pagecache::unmap_arc_buf((arc_buf_t*)ab); } -namespace mmu { -extern mutex vma_list_mutex; -} - -bool mmu_vma_list_trylock() -{ - return mutex_trylock(&mmu::vma_list_mutex); -} - -void mmu_vma_list_unlock() +void mmu_map(void* key, void* ab, void* page) { - mutex_unlock(&mmu::vma_list_mutex); + pagecache::map_arc_buf((pagecache::hashkey*)key, (arc_buf_t*)ab, page); } diff --git a/bsd/porting/mmu.h b/bsd/porting/mmu.h index 79bb3982e9a46f25e805dea07554f0422993135e..50b777466f9d7630685db8b60768868b4547e807 100644 --- a/bsd/porting/mmu.h +++ b/bsd/porting/mmu.h @@ -52,8 +52,7 @@ int vm_paging_needed(void); int vm_throttling_needed(void); void mmu_unmap(void* ab); -bool mmu_vma_list_trylock(); -void mmu_vma_list_unlock(); +void mmu_map(void* key, void* ab, void* page); #define vtophys(_va) virt_to_phys((void *)_va) __END_DECLS diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index a7115eac00aac8ec025648d8dfaff1d94c76bf6f..fd30489536f0740e16de14c53004dad61936228f 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -533,6 +533,8 @@ struct arc_buf_hdr { l2arc_buf_hdr_t *b_l2hdr; list_node_t b_l2node; + + uint32_t b_mmaped; }; static arc_buf_t *arc_eviction_list; @@ -570,7 +572,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ -#define ARC_SHARED_BUF (1 << 19) /* shared in OS' page cache */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) @@ -585,7 +586,6 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) -#define HDR_SHARED_BUF(hdr) ((hdr)->b_flags & ARC_SHARED_BUF) /* * Other sizes @@ -1479,24 +1479,17 @@ void arc_share_buf(arc_buf_t *abuf) { arc_buf_hdr_t *hdr = abuf->b_hdr; - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - mutex_lock(hash_lock); - hdr->b_flags |= ARC_SHARED_BUF; - mutex_unlock(hash_lock); + hdr->b_mmaped = 1; } void arc_unshare_buf(arc_buf_t *abuf) { arc_buf_hdr_t *hdr = abuf->b_hdr; - uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); - kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - mutex_lock(hash_lock); - hdr->b_flags &= ~ARC_SHARED_BUF; - mutex_unlock(hash_lock); + ASSERT(hdr->b_mmaped == 1); + hdr->b_mmaped = 0; } /* @@ -1620,6 +1613,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; + if (buf->b_hdr->b_mmaped) { + mmu_unmap(buf); + } + arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); @@ -1916,17 +1913,6 @@ evict_start: missed += 1; break; } - if (HDR_SHARED_BUF(ab)) { - if (mmu_vma_list_trylock()) { - mmu_unmap(buf); - ab->b_flags &= ~ARC_SHARED_BUF; - mmu_vma_list_unlock(); - } else { - mutex_exit(&buf->b_evict_lock); - missed += 1; - break; - } - } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index f2ddda329275d71212841b961b0ad5de24379b97..03c8933935a5b8b2f3818a5caa0913e36466e067 100644 --- 
a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -46,6 +46,8 @@ #include <sys/zfs_znode.h> #endif +#include <bsd/porting/mmu.h> + /* * Enable/disable nopwrite feature. */ @@ -984,7 +986,6 @@ dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int err; - struct iovec *iov; int numbufs = 0; // This will acquire a reference both in the dbuf, and in the ARC buffer. @@ -1000,14 +1001,7 @@ dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; - iov = &uio->uio_iov[0]; - iov->iov_base = dbuf_abuf->b_data + (uio->uio_loffset - db->db_offset); - iov->iov_len = db->db_size; - - iov = &uio->uio_iov[1]; - iov->iov_base = dbuf_abuf; - - arc_share_buf(dbuf_abuf); + mmu_map(uio->uio_iov->iov_base, dbuf_abuf, dbuf_abuf->b_data + (uio->uio_loffset - db->db_offset)); dmu_buf_rele_array(dbp, numbufs, FTAG); diff --git a/core/pagecache.cc b/core/pagecache.cc index d879ba510dc61fc5f963c878208f5b34f7e454a2..cd3abcd513baa8dfa4886f5bfbfa4cc6f1f902f8 100644 --- a/core/pagecache.cc +++ b/core/pagecache.cc @@ -15,21 +15,9 @@ #include <fs/vfs/vfs.h> #include <osv/trace.hh> -extern "C" void arc_unshare_buf(arc_buf_t*); - -namespace mmu { - extern mutex vma_list_mutex; -}; - -namespace pagecache { -struct hashkey { - dev_t dev; - ino_t ino; - off_t offset; - bool operator==(const hashkey& a) const noexcept { - return (dev == a.dev) && (ino == a.ino) && (offset == a.offset); - } -}; +extern "C" { +void arc_unshare_buf(arc_buf_t*); +void arc_share_buf(arc_buf_t*); } namespace std { @@ -250,16 +238,29 @@ std::unordered_multimap<arc_buf_t*, cached_page_arc*> cached_page_arc::arc_cache static std::unordered_map<hashkey, cached_page_arc*> read_cache; static std::unordered_map<hashkey, cached_page_write*> write_cache; static std::deque<cached_page_write*> write_lru; +static mutex arc_lock; // 
protects against parallel eviction, parallel creation impossible due to vma_list_lock -TRACEPOINT(trace_add_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*); -void add_mapping(cached_page_arc *cp, mmu::hw_ptep ptep) +template<typename T> +static T find_in_cache(std::unordered_map<hashkey, T>& cache, hashkey& key) { - trace_add_mapping(cp->arcbuf(), cp->addr(), ptep.release()); + auto cpi = cache.find(key); + + if (cpi == cache.end()) { + return nullptr; + } else { + return cpi->second; + } +} + +TRACEPOINT(trace_add_read_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*); +void add_read_mapping(cached_page_arc *cp, mmu::hw_ptep ptep) +{ + trace_add_read_mapping(cp->arcbuf(), cp->addr(), ptep.release()); cp->map(ptep); } TRACEPOINT(trace_remove_mapping, "buf=%p, addr=%p, ptep=%p", void*, void*, void*); -void remove_mapping(cached_page_arc* cp, mmu::hw_ptep ptep) +void remove_read_mapping(cached_page_arc* cp, mmu::hw_ptep ptep) { trace_remove_mapping(cp->arcbuf(), cp->addr(), ptep.release()); if (cp->unmap(ptep) == 0) { @@ -268,6 +269,15 @@ void remove_mapping(cached_page_arc* cp, mmu::hw_ptep ptep) } } +void remove_read_mapping(hashkey& key, mmu::hw_ptep ptep) +{ + SCOPE_LOCK(arc_lock); + cached_page_arc* cp = find_in_cache(read_cache, key); + if (cp) { + remove_read_mapping(cp, ptep); + } +} + TRACEPOINT(trace_drop_read_cached_page, "buf=%p, addr=%p", void*, void*); unsigned drop_read_cached_page(cached_page_arc* cp, bool flush) { @@ -284,23 +294,36 @@ unsigned drop_read_cached_page(cached_page_arc* cp, bool flush) return flushed; } +void drop_read_cached_page(hashkey& key) +{ + SCOPE_LOCK(arc_lock); + cached_page_arc* cp = find_in_cache(read_cache, key); + if (cp) { + drop_read_cached_page(cp, true); + } +} + TRACEPOINT(trace_unmap_arc_buf, "buf=%p", void*); void unmap_arc_buf(arc_buf_t* ab) { - assert(mutex_owned(&mmu::vma_list_mutex)); trace_unmap_arc_buf(ab); + SCOPE_LOCK(arc_lock); cached_page_arc::unmap_arc_buf(ab); } -static cached_page_arc* 
create_read_cached_page(vfs_file* fp, hashkey& key) +TRACEPOINT(trace_map_arc_buf, "buf=%p page=%p", void*, void*); +void map_arc_buf(hashkey *key, arc_buf_t* ab, void *page) { - cached_page_arc *pc; - arc_buf_t* ab; - void* page; - fp->get_arcbuf(key.offset, &ab, &page); - pc = new cached_page_arc(key, page, ab); - read_cache.emplace(key, pc); - return pc; + trace_map_arc_buf(ab, page); + SCOPE_LOCK(arc_lock); + cached_page_arc* pc = new cached_page_arc(*key, page, ab); + read_cache.emplace(*key, pc); + arc_share_buf(ab); +} + +static void create_read_cached_page(vfs_file* fp, hashkey& key) +{ + fp->get_arcbuf(&key, key.offset); } static std::unique_ptr<cached_page_write> create_write_cached_page(vfs_file* fp, hashkey& key) @@ -333,25 +356,12 @@ static void insert(cached_page_write* cp) { } } -template<typename T> -static T find_in_cache(std::unordered_map<hashkey, T>& cache, hashkey& key) -{ - auto cpi = cache.find(key); - - if (cpi == cache.end()) { - return nullptr; - } else { - return cpi->second; - } -} - bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, bool write, bool shared) { struct stat st; fp->stat(&st); hashkey key {st.st_dev, st.st_ino, offset}; cached_page_write* wcp = find_in_cache(write_cache, key); - cached_page_arc* rcp = find_in_cache(read_cache, key); if (write) { if (!wcp) { @@ -362,14 +372,10 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, boo insert(wcp); // page is moved from ARC to write cache // drop ARC page if exists, removing all mappings - if (rcp) { - drop_read_cached_page(rcp); - } + drop_read_cached_page(key); } else { // remove mapping to ARC page if exists - if (rcp) { - remove_mapping(rcp, ptep); - } + remove_read_mapping(key, ptep); // cow of private page from ARC return mmu::write_pte(newcp->release(), ptep, pte); } @@ -381,11 +387,17 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, boo } } else if (!wcp) { // read fault and page is not 
in write cache yet, return one from ARC, mark it cow - if (!rcp) { - rcp = create_read_cached_page(fp, key); - } - add_mapping(rcp, ptep); - return mmu::write_pte(rcp->addr(), ptep, mmu::pte_mark_cow(pte, true)); + do { + WITH_LOCK(arc_lock) { + cached_page_arc* cp = find_in_cache(read_cache, key); + if (cp) { + add_read_mapping(cp, ptep); + return mmu::write_pte(cp->addr(), ptep, mmu::pte_mark_cow(pte, true)); + } + } + // page is not in cache yet, create and try again + create_read_cached_page(fp, key); + } while (true); } wcp->map(ptep); @@ -395,24 +407,29 @@ bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, boo bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep ptep) { - bool free = false; struct stat st; fp->stat(&st); hashkey key {st.st_dev, st.st_ino, offset}; - cached_page_write *wcp = find_in_cache(write_cache, key); - cached_page_arc *rcp = find_in_cache(read_cache, key); + cached_page_write* wcp = find_in_cache(write_cache, key); // page is either in ARC cache or write cache or private page + if (wcp && wcp->addr() == addr) { // page is in write cache wcp->unmap(ptep); - } else if (rcp && rcp->addr() == addr) { - // page is in ARC - remove_mapping(rcp, ptep); - } else { - // private page - free = true; + return false; + } + + WITH_LOCK(arc_lock) { + cached_page_arc* rcp = find_in_cache(read_cache, key); + if (rcp && rcp->addr() == addr) { + // page is in ARC + remove_read_mapping(rcp, ptep); + return false; + } } - return free; + + // private page, a caller will free it + return true; } } diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc index 475a196e2caed94cd8028a93bfcf4e864ffed1eb..fa0537f4500866a637305b28355eb56de65157dd 100644 --- a/fs/vfs/vfs_fops.cc +++ b/fs/vfs/vfs_fops.cc @@ -154,24 +154,23 @@ bool vfs_file::put_page(void *addr, uintptr_t off, size_t size, mmu::hw_ptep pte // eviction that will hold the mmu-side lock that protects the mappings // Always follow that order. 
We however can't just get rid of the mmu-side lock, // because not all invalidations will be synchronous. -void vfs_file::get_arcbuf(uintptr_t offset, arc_buf_t** arcbuf, void** page) +void vfs_file::get_arcbuf(void* key, off_t offset) { struct vnode *vp = f_dentry->d_vnode; - iovec io[2]; // 0 - pointer to page, 1 - pointer to ARC buffer + iovec io[1]; + io[0].iov_base = key; uio data; data.uio_iov = io; - data.uio_iovcnt = 2; - data.uio_offset = off_t(offset); + data.uio_iovcnt = 1; + data.uio_offset = offset; data.uio_resid = mmu::page_size; data.uio_rw = UIO_READ; vn_lock(vp); assert(VOP_CACHE(vp, this, &data) == 0); vn_unlock(vp); - *page = io[0].iov_base; - *arcbuf = static_cast<arc_buf_t*>(io[1].iov_base); } std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range range, unsigned flags, unsigned perm, off_t offset) diff --git a/include/osv/pagecache.hh b/include/osv/pagecache.hh index d87b3b4b00f174f83d80d574c57e2c91a7a96885..181c15ffb97de8bb79d59995255818d18777036b 100644 --- a/include/osv/pagecache.hh +++ b/include/osv/pagecache.hh @@ -10,10 +10,22 @@ #include <osv/mmu.hh> #include "arch-mmu.hh" +struct arc_buf; +typedef arc_buf arc_buf_t; + namespace pagecache { +struct hashkey { + dev_t dev; + ino_t ino; + off_t offset; + bool operator==(const hashkey& a) const noexcept { + return (dev == a.dev) && (ino == a.ino) && (offset == a.offset); + } +}; + bool get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, mmu::pt_element pte, bool write, bool shared); bool release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep ptep); void unmap_arc_buf(arc_buf_t* ab); - +void map_arc_buf(hashkey* key, arc_buf_t* ab, void* page); } diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh index 01336b9b750c2acc893a32dfbc15d09b60816db6..7ac85d60e1ac735e93b64c1c4315e886dcc249fb 100644 --- a/include/osv/vfs_file.hh +++ b/include/osv/vfs_file.hh @@ -10,9 +10,6 @@ #include <osv/file.h> -struct arc_buf; -typedef arc_buf arc_buf_t; - class vfs_file final : public file { 
public: explicit vfs_file(unsigned flags); @@ -28,7 +25,7 @@ public: virtual bool map_page(uintptr_t offset, size_t size, mmu::hw_ptep ptep, mmu::pt_element pte, bool write, bool shared); virtual bool put_page(void *addr, uintptr_t offset, size_t size, mmu::hw_ptep ptep); - void get_arcbuf(uintptr_t offset, arc_buf_t** arcbuf, void** page); + void get_arcbuf(void *key, off_t offset); }; #endif /* VFS_FILE_HH_ */